<a href="https://colab.research.google.com/github/leahyye/DATA305-Project/blob/main/DATA305_Project_LeahYe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA305 - Project

* Student Name: Leah Ye
* Student ID: 300651931
* Course code: DATA 305


In [72]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential

Develop a fully functional model trained on the data provided in train.csv. If
a validation set is needed, it should be created as a subset of this training
data. Evaluate the model using test.csv, and report both the Area Under the
Receiver Operating Characteristic Curve (ROC AUC) and the Area Under the
Precision-Recall Curve (PR AUC) on the test set.

In [73]:
def set_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random` module, NumPy and TensorFlow with the same value
    and exports the PYTHONHASHSEED and TF_DETERMINISTIC_OPS environment
    variables.

    Args:
        seed: Seed value shared by all generators.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so setting it here may not affect the running kernel - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

    os.environ["TF_DETERMINISTIC_OPS"] = "1"

In [33]:
# Seed all RNGs with a fixed value; run_experiment() re-seeds at the start of each trial.
set_seeds(42)

In [74]:
# Load the data as a quick check that both CSV files are present.
# NOTE(review): `google.colab` is only importable on Colab, and `files` is
# needed solely by the commented-out upload call below.
from google.colab import files
import pandas as pd
#uploaded = files.upload()

# In the cells shown here these DataFrames are only used for the row counts
# printed below; data_preprocessing() reads the CSV files again itself.
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')
print(f"Loaded {len(train_set)} rows")
print(f"Loaded {len(test_set)} rows")

Loaded 2352 rows
Loaded 3370 rows


In [75]:
def data_preprocessing(seed, train_path='train.csv', test_path='test.csv',
                       batch_size=32, val_size=0.2):
    """
    Load the CSV files and build batched train/val/test tf.data pipelines.

    The training CSV is divided into a training and a validation part with a
    stratified split on the 'label' column; the test CSV is used unchanged.

    Args:
        seed: Seed used for the train/validation split and for shuffling the
            training dataset.
        train_path: Path of the training CSV (reads the 'FASTA' and 'label'
            columns).
        test_path: Path of the test CSV (same columns).
        batch_size: Number of examples per batch in all three datasets.
        val_size: Value passed as `test_size` to train_test_split, i.e. the
            share of the training rows held out for validation.

    Returns: (train_set, val_set, test_set)
    """
    def to_dataset(texts, labels):
        # Pair each sequence with its label as one dataset element.
        return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))

    # Load data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Split train into train/val, keeping the label ratio in both parts
    X_train, X_val, y_train, y_val = train_test_split(
        train['FASTA'], train['label'],
        test_size=val_size,
        random_state=seed,
        stratify=train['label']
    )

    # Only the training data is shuffled; validation and test keep file order.
    train_set = (
        to_dataset(X_train, y_train)
        .shuffle(5000, seed=seed)
        .batch(batch_size)
        .prefetch(1)
    )
    val_set = to_dataset(X_val, y_val).batch(batch_size).prefetch(1)
    test_set = to_dataset(test['FASTA'], test['label']).batch(batch_size).prefetch(1)

    return train_set, val_set, test_set

In [76]:
def build_text_vectorizer(train_text, max_tokens=30, seq_length=50):
    """Build a character-level TextVectorization layer adapted to `train_text`.

    Args:
        train_text: Text samples used to adapt the layer's vocabulary.
        max_tokens: Passed to the layer as `max_tokens`.
        seq_length: Passed to the layer as `output_sequence_length`.

    Returns:
        The adapted TextVectorization layer.
    """
    vectorizer_options = dict(
        max_tokens=max_tokens,
        output_sequence_length=seq_length,
        split="character",
        pad_to_max_tokens=True,
    )
    vectorizer = TextVectorization(**vectorizer_options)
    vectorizer.adapt(train_text)
    return vectorizer

In [49]:
class BiLSTMAttention(tf.keras.layers.Layer):
    """Attention pooling that collapses the time axis into one context vector.

    Each timestep gets a scalar score tanh(x . W + b); the scores are
    softmax-normalised over the time axis and used as weights for a sum of
    the inputs.

    NOTE(review): `call` takes no mask argument, so any padded timesteps
    take part in the softmax like real ones - confirm this is intended.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...)
        # so the layer can be named and re-created from its config.
        super().__init__(**kwargs)

    def build(self, input_shape):
        # input_shape: (batch_size, time_steps, hidden_dim * 2)
        self.W = self.add_weight(
            name="att_weight",
            shape=(input_shape[-1], 1),  # hidden_dim * 2
            initializer="random_normal",
            trainable=True
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(1,),  # scalar bias
            initializer="zeros",
            trainable=True
        )

    def call(self, inputs):
        # inputs: (batch_size, time_steps, hidden_dim * 2)
        score = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)  # (batch_size, time_steps, 1)
        # Normalise over the time axis so the weights of one sequence sum to 1.
        attention_weights = tf.nn.softmax(score, axis=1)  # (batch_size, time_steps, 1)
        context_vector = tf.reduce_sum(inputs * attention_weights, axis=1)  # (batch_size, hidden_dim * 2)
        return context_vector

In [77]:
# USE THIS
class LNSimpleRNNCell(tf.keras.layers.Layer):
    """Simple RNN cell that layer-normalises its output before the activation.

    The wrapped SimpleRNNCell runs without an activation; its output goes
    through LayerNormalization and then through `activation`. The result is
    used both as the step output and as the next state.

    Args:
        units: Size of the cell output and of its state.
        activation: Activation applied after layer normalisation.
    """

    def __init__(self, units, activation="tanh", **kwargs):
        super().__init__(**kwargs)
        self.state_size = self.output_size = units
        self.simple_rnn_cell = tf.keras.layers.SimpleRNNCell(units, activation=None)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.activation = tf.keras.activations.get(activation)

    def call(self, inputs, states):
        # The inner cell's own state is discarded; the normalised output
        # becomes the state carried to the next step.
        linear_output, _ = self.simple_rnn_cell(inputs, states)
        activated = self.activation(self.layer_norm(linear_output))
        return activated, [activated]



In [62]:
# results: roc 0.817, pr = 0.54, ok training time
# NOTE(review): these numbers were recorded by the author; it is not clear
# from this cell which revision of the architecture produced them.
# might try again with the pooling layer and GRU this time (next code chunk)
# NOTE: other cells in this notebook define `build_improved_model` too; the
# definition that was executed last is the one run_experiment() uses.
def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile a stacked LN-RNN + BiLSTM binary classifier.

    Layers: text vectorizer -> Embedding -> two stacked LNSimpleRNNCell
    layers -> bidirectional LSTM -> average pooling over time ->
    Dense(64, relu) -> Dropout(0.3) -> sigmoid output.

    Args:
        text_vec_layer: Adapted text vectorization layer; its vocabulary size
            sets the embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        The compiled model, tracking 'roc_auc' and 'pr_auc' metrics.
    """
    vocab_size = len(text_vec_layer.get_vocabulary())

    model = tf.keras.Sequential([
        text_vec_layer,
        tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),

        # Deep RNN with the custom cell (2 layers)
        tf.keras.layers.RNN(
            [LNSimpleRNNCell(128), LNSimpleRNNCell(128)],  # Stacked cells
            return_sequences=True
        ),

        # The BiLSTM must return the full sequence: GlobalAveragePooling1D
        # needs a 3-D (batch, steps, features) input and cannot be applied to
        # the 2-D output of an LSTM that returns only its last state.
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)
        ),

        # Pooling layer reduces the sequence to a single vector
        tf.keras.layers.GlobalAveragePooling1D(),

        # Simple classifier
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=[
            AUC(name='roc_auc', curve='ROC'),
            AUC(name='pr_auc', curve='PR')
        ]
    )
    return model

In [78]:
# BEST MODEL SO FAR
# Test PR-AUC:     0.4980
# Test ROC-AUC:    0.8454
# NOTE: other cells in this notebook define `build_improved_model` too; the
# definition that was executed last is the one run_experiment() uses.
def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the Conv1D + LN-RNN + GRU binary classifier.

    Layers: text vectorizer -> Embedding -> Conv1D(64, kernel 3) ->
    MaxPooling1D(2) -> LNSimpleRNNCell(128) RNN -> GRU(64) -> average pooling
    over time -> Dense(64, relu) -> Dropout(0.3) -> BatchNormalization ->
    sigmoid output. Compiled with Nadam (learning rate 0.001).

    NOTE(review): Conv1D directly follows an Embedding with mask_zero=True;
    confirm whether the padding mask survives the convolution.

    Args:
        text_vec_layer: Adapted text vectorization layer; its vocabulary size
            sets the embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        The compiled model, tracking 'roc_auc' and 'pr_auc' metrics.
    """
    keras_layers = tf.keras.layers
    n_tokens = len(text_vec_layer.get_vocabulary())

    layer_stack = [
        text_vec_layer,
        keras_layers.Embedding(n_tokens, embed_size, mask_zero=True),
        # Local pattern extraction, then halve the sequence length
        keras_layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        keras_layers.MaxPooling1D(pool_size=2),
        # Recurrent part: layer-normalised simple RNN followed by a GRU
        keras_layers.RNN(LNSimpleRNNCell(128), return_sequences=True),
        keras_layers.GRU(64, return_sequences=True),
        # Collapse the time axis into one vector per sequence
        keras_layers.GlobalAveragePooling1D(),
        # Classifier head
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dropout(0.3),
        keras_layers.BatchNormalization(),
        keras_layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(layer_stack)

    optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001)
    auc_metrics = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=auc_metrics)
    return model

In [57]:
# LSTM model takes a long time to train
# Test ROC-AUC:    0.8164
# Test PR-AUC:     0.3778
# NOTE: other cells in this notebook define `build_improved_model` too; the
# definition that was executed last is the one run_experiment() uses.
def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile a single-LSTM binary classifier.

    Layers: text vectorizer -> Embedding -> LSTM(32) returning sequences ->
    average pooling over time -> sigmoid output.

    Args:
        text_vec_layer: Adapted text vectorization layer; its vocabulary size
            sets the embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        The compiled model, tracking 'roc_auc' and 'pr_auc' metrics.
    """
    vocab_size = len(text_vec_layer.get_vocabulary())

    model = tf.keras.Sequential([
        text_vec_layer,
        tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
        # No input_shape here: the LSTM is not the first layer and takes its
        # input shape from the embedding output above.
        tf.keras.layers.LSTM(32, return_sequences=True),
        tf.keras.layers.GlobalAveragePooling1D(), # Add a pooling layer to reduce sequence to a single vector
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss="binary_crossentropy",
        optimizer="nadam",
        metrics=[
            AUC(name='roc_auc', curve='ROC'),
            AUC(name='pr_auc', curve='PR')
        ]
    )
    return model

In [58]:
# roc: 0.79, decent pr-auc
# NOTE: other cells in this notebook define `build_improved_model` too; the
# definition that was executed last is the one run_experiment() uses.
def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Layers: text vectorizer -> Embedding -> BiLSTM(128) with dropout and
    recurrent dropout of 0.2 -> BiLSTM(64) -> Dense(64, relu, L2 kernel
    regulariser) -> Dropout(0.3) -> sigmoid output.

    Args:
        text_vec_layer: Adapted text vectorization layer; its vocabulary size
            sets the embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        The compiled model, tracking 'roc_auc' and 'pr_auc' metrics.
    """
    keras_layers = tf.keras.layers
    n_tokens = len(text_vec_layer.get_vocabulary())

    embedding = keras_layers.Embedding(n_tokens, embed_size, mask_zero=True)
    # First recurrent layer hands its full sequence to the next layer
    lower_bilstm = keras_layers.Bidirectional(
        keras_layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
    )
    # Stacked LSTM; returns only its final output
    upper_bilstm = keras_layers.Bidirectional(keras_layers.LSTM(64))
    head = [
        keras_layers.Dense(64, activation='relu', kernel_regularizer='l2'),
        keras_layers.Dropout(0.3),
        keras_layers.Dense(1, activation="sigmoid"),
    ]

    model = tf.keras.Sequential(
        [text_vec_layer, embedding, lower_bilstm, upper_bilstm] + head
    )

    auc_metrics = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=auc_metrics)

    return model

In [79]:
def train_model(model, train_set, val_set, callbacks=None, epochs=30):
    """Train model and return training history.

    Args:
        model: Compiled model exposing a Keras-style `fit` method.
        train_set: Training data passed to `fit`.
        val_set: Validation data passed as `validation_data`.
        callbacks: Optional list of callbacks; None (the default) means no
            callbacks.
        epochs: Maximum number of training epochs.

    Returns:
        Whatever `model.fit` returns (the training history).
    """
    if callbacks is None:
        callbacks = []

    # `shuffle` is intentionally not passed to fit (the author left
    # `shuffle=False` commented out).
    history = model.fit(
        train_set,
        validation_data=val_set,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1
    )
    return history

In [80]:
def evaluate_model(model, test_set, verbose=True, seed=None):
    """Evaluate model on test set and return metrics.

    Args:
        model: Compiled model whose `evaluate` reports 'loss', 'roc_auc' and
            'pr_auc' (the metric names used by the model builders).
        test_set: Data passed to `model.evaluate`.
        verbose: When True, print a formatted summary of the metrics.
        seed: Optional trial seed, shown in the summary header only.

    Returns:
        Dict with the keys 'loss', 'roc_auc' and 'pr_auc'.
    """
    # Look the metrics up by name instead of relying on their position in
    # the list returned by evaluate().
    results = model.evaluate(test_set, verbose=0, return_dict=True)
    metrics = {name: results[name] for name in ('loss', 'roc_auc', 'pr_auc')}

    if verbose:
        # Mention the seed only when the caller supplied one, so the header
        # never reads "seed None".
        title = "Test Results" if seed is None else f"Test Results for seed {seed}"
        print("\n" + "="*50)
        print(f"\n=== {title} ===")
        print("-"*50)
        print(f"Test Loss:       {metrics['loss']:.4f}")
        print(f"Test ROC-AUC:    {metrics['roc_auc']:.4f}")
        print(f"Test PR-AUC:     {metrics['pr_auc']:.4f}")
        print("="*50 + "\n")

    return metrics

In [81]:
def run_experiment(seed):
    """Run one seeded trial end to end and print its test metrics.

    Steps: seed the RNGs, build the datasets, adapt the text vectorizer,
    build the model, train with checkpointing and early stopping, then
    evaluate on the test set.

    NOTE: the vectorizer is adapted on every sequence in train.csv, which
    includes the rows data_preprocessing() holds out for validation.

    Args:
        seed: Seed used for all stochastic components of the trial.

    Returns:
        (history, metrics): the training history and the test-metric dict.
    """
    print(f"\n=== Running Trial with seed {seed} ===")
    set_seeds(seed)

    # Data and vocabulary
    train_set, val_set, test_set = data_preprocessing(seed)
    vectorizer = build_text_vectorizer(pd.read_csv('train.csv')['FASTA'])

    # Model
    model = build_improved_model(vectorizer)

    # Training: keep the best weights on disk and stop once validation
    # stops improving for 3 epochs
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        "best_model.weights.h5", save_best_only=True, save_weights_only=True)
    early_stop_cb = tf.keras.callbacks.EarlyStopping(
        patience=3, restore_best_weights=True)
    history = train_model(model, train_set, val_set, [checkpoint_cb, early_stop_cb])

    # Evaluation on the held-out test set
    return history, evaluate_model(model, test_set, seed=seed)

In [71]:
# Single exploratory trial; the returned (history, metrics) tuple is shown as the cell output.
run_experiment(42)


=== Running Trial with seed 42 ===
Epoch 1/30




[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - loss: 0.6818 - pr_auc: 0.6671 - roc_auc: 0.6604 - val_loss: 0.6356 - val_pr_auc: 0.8014 - val_roc_auc: 0.7845
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - loss: 0.5738 - pr_auc: 0.7930 - roc_auc: 0.7720 - val_loss: 0.6191 - val_pr_auc: 0.8053 - val_roc_auc: 0.7880
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 74ms/step - loss: 0.4926 - pr_auc: 0.8588 - roc_auc: 0.8442 - val_loss: 0.5679 - val_pr_auc: 0.8085 - val_roc_auc: 0.8029
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: 0.4397 - pr_auc: 0.8904 - roc_auc: 0.8777 - val_loss: 0.5674 - val_pr_auc: 0.8299 - val_roc_auc: 0.8229
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - loss: 0.4186 - pr_auc: 0.8972 - roc_auc: 0.8931 - val_loss: 0.5424 - val_pr_auc: 0.8330 - val_roc_auc: 0.8343
Epoch 6/30
[1m59/59[0m 

(<keras.src.callbacks.history.History at 0x7bee8ef66990>,
 {'loss': 0.4664892256259918,
  'roc_auc': 0.8454327583312988,
  'pr_auc': 0.4979817867279053})

Structure your code as a complete, end-to-end, and reproducible program,
using a parameter `seed` for all stochastic components (e.g.,
`random.seed(seed)`, `np.random.seed(seed)`, `tf.random.set_seed(seed)`, and
`random_state=seed` in `train_test_split()`, etc.). Run the training and evaluation process
over 5 trials using seeds from 1 to 5. Report the mean and standard
deviation of both ROC AUC and PR AUC scores on the test set.

In [82]:
def run_experiment(seed, verbose=False):
    """Complete training pipeline for one seed.

    Seeds the RNGs, builds the datasets, adapts the text vectorizer, builds
    and trains the model with checkpointing and early stopping, and
    evaluates it on the test set.

    NOTE: the vectorizer is adapted on every sequence in train.csv, which
    includes the rows data_preprocessing() holds out for validation.

    Args:
        seed: Seed used for all stochastic components of the trial.
        verbose: When True, print the per-seed evaluation summary. The
            default (False) hides it so multi-seed runs stay compact.

    Returns:
        (history, metrics): the training history and the test-metric dict.
    """
    print(f"\n=== Running Trial with seed {seed} ===")
    set_seeds(seed)

    train_set, val_set, test_set = data_preprocessing(seed)
    train_text = pd.read_csv('train.csv')['FASTA']
    text_vec_layer = build_text_vectorizer(train_text)
    model = build_improved_model(text_vec_layer)
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint("best_model.weights.h5", save_best_only=True, save_weights_only=True),
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
    history = train_model(model, train_set, val_set, callbacks)

    # The evaluation summary for each seed is hidden unless verbose is set;
    # the seed is forwarded so a printed summary is labelled correctly.
    metrics = evaluate_model(model, test_set, verbose=verbose, seed=seed)

    return history, metrics

In [None]:
# Run one full trial per seed and collect the test-set AUC scores
SEEDS = list(range(1, 6))
roc_auc_scores, pr_auc_scores = [], []

for seed in SEEDS:
    # run_experiment returns (history, metrics); only the metrics are kept
    trial_metrics = run_experiment(seed)[1]
    roc_auc_scores.append(trial_metrics['roc_auc'])
    pr_auc_scores.append(trial_metrics['pr_auc'])


=== Running Trial with seed 1 ===
Epoch 1/30




[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 95ms/step - loss: 0.6944 - pr_auc: 0.6907 - roc_auc: 0.6750 - val_loss: 0.6583 - val_pr_auc: 0.7926 - val_roc_auc: 0.7674
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - loss: 0.5732 - pr_auc: 0.7947 - roc_auc: 0.7736 - val_loss: 0.6285 - val_pr_auc: 0.8156 - val_roc_auc: 0.7923
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - loss: 0.5189 - pr_auc: 0.8412 - roc_auc: 0.8256 - val_loss: 0.6172 - val_pr_auc: 0.8214 - val_roc_auc: 0.8047
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - loss: 0.4913 - pr_auc: 0.8503 - roc_auc: 0.8395 - val_loss: 0.5676 - val_pr_auc: 0.8371 - val_roc_auc: 0.8238
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - loss: 0.3765 - pr_auc: 0.9200 - roc_auc: 0.9151 - val_loss: 0.5811 - val_pr_auc: 0.8433 - val_roc_auc: 0.8332
Epoch 6/30
[1m59/59[0m 



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - loss: 0.7232 - pr_auc: 0.6003 - roc_auc: 0.5887 - val_loss: 0.6556 - val_pr_auc: 0.7510 - val_roc_auc: 0.7625
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 55ms/step - loss: 0.5756 - pr_auc: 0.7916 - roc_auc: 0.7745 - val_loss: 0.6096 - val_pr_auc: 0.8091 - val_roc_auc: 0.7941
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - loss: 0.4931 - pr_auc: 0.8490 - roc_auc: 0.8437 - val_loss: 0.5839 - val_pr_auc: 0.7933 - val_roc_auc: 0.8045
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - loss: 0.4501 - pr_auc: 0.8767 - roc_auc: 0.8763 - val_loss: 0.5253 - val_pr_auc: 0.8143 - val_roc_auc: 0.8337
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - loss: 0.4037 - pr_auc: 0.8990 - roc_auc: 0.9000 - val_loss: 0.6021 - val_pr_auc: 0.8187 - val_roc_auc: 0.8352
Epoch 6/30
[1m59/59[0m 



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 89ms/step - loss: 0.7067 - pr_auc: 0.6222 - roc_auc: 0.6149 - val_loss: 0.6367 - val_pr_auc: 0.7975 - val_roc_auc: 0.7792
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - loss: 0.5697 - pr_auc: 0.7787 - roc_auc: 0.7770 - val_loss: 0.5923 - val_pr_auc: 0.8194 - val_roc_auc: 0.8034
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - loss: 0.4895 - pr_auc: 0.8546 - roc_auc: 0.8455 - val_loss: 0.6076 - val_pr_auc: 0.8142 - val_roc_auc: 0.8099
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - loss: 0.4428 - pr_auc: 0.8876 - roc_auc: 0.8771 - val_loss: 0.5313 - val_pr_auc: 0.8233 - val_roc_auc: 0.8182
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - loss: 0.4362 - pr_auc: 0.8885 - roc_auc: 0.8791 - val_loss: 0.5235 - val_pr_auc: 0.8219 - val_roc_auc: 0.8221
Epoch 6/30
[1m59/59[0m 



[1m33/59[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1s[0m 53ms/step - loss: 0.7886 - pr_auc: 0.5629 - roc_auc: 0.5542

In [None]:
# Summarise the per-seed test scores (np.std is called with its defaults)
roc_mean, roc_std = np.mean(roc_auc_scores), np.std(roc_auc_scores)
pr_mean, pr_std = np.mean(pr_auc_scores), np.std(pr_auc_scores)

print("\nResults of 5 trials")
print(f"Mean ROC-AUC: {roc_mean:.3f}")
print(f"Standard Deviation ROC-AUC: {roc_std:.3f}")
print(f"Mean PR-AUC: {pr_mean:.3f}")
print(f"Standard Deviation PR-AUC: {pr_std:.3f}")


Results of 5 trials
Mean ROC-AUC: 0.815
Standard Deviation ROC-AUC: 0.013
Mean PR-AUC: 0.364
Standard Deviation PR-AUC: 0.016


Discussion:
Include a discussion at the end of your notebook (about what you have
learned, difficulties, what has worked and not worked, future directions,
etc.).