<a href="https://colab.research.google.com/github/leahyye/DATA305-Project/blob/main/Versions_Project_LeahYe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA305 - Project

* Student Name: Leah Ye
* Student ID: 300651931
* Course code: DATA 305


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Dense, Attention
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

Develop a fully functional model trained on the data provided in train.csv. If
a validation set is needed, it should be created as a subset of this training
data. Evaluate the model using test.csv, and report both the Area Under the
Receiver Operating Characteristic Curve (ROC AUC) and the Area Under the
Precision-Recall Curve (PR AUC) on the test set.

In [None]:
def set_seeds(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator and
  TensorFlow's global generator with ``seed``, and exports the
  ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` environment variables.

  Args:
    seed: Seed value shared by all generators.
  """
  # Environment values must be strings.
  # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
  # setting it here presumably only affects child processes -- confirm.
  os.environ.update({
      "PYTHONHASHSEED": str(seed),
      "TF_DETERMINISTIC_OPS": "1",
  })

  # Seed each library's global generator in turn.
  for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

In [None]:
# Seed every RNG once before any data or model code runs.
set_seeds(42)

In [None]:
# Load the data
# pandas is imported in this cell because the notebook's import cell does not
# include it; later cells rely on the `pd` name bound here.
import pandas as pd

try:
    from google.colab import files  # only importable inside Google Colab
except ImportError:
    files = None

# Only ask for an upload when a CSV is actually missing, so "Restart & Run
# All" does not block on an interactive prompt and the cell also works outside
# Colab once the files sit in the working directory. Delete the local copies
# to force a fresh upload.
uploaded = {}
missing = [path for path in ('train.csv', 'test.csv') if not os.path.exists(path)]
if missing:
    if files is None:
        raise FileNotFoundError(
            f"Missing {missing}: place the CSV files in the working directory."
        )
    uploaded = files.upload()

# Sanity check only: the experiment pipeline re-reads both CSVs itself inside
# data_preprocessing().
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')
print(f"Loaded {len(train_set)} rows")
print(f"Loaded {len(test_set)} rows")

Saving test.csv to test.csv
Saving train.csv to train.csv
Loaded 2352 rows
Loaded 3370 rows


In [None]:
def data_preprocessing(seed, train_path='train.csv', test_path='test.csv',
                       val_size=0.2, batch_size=32, shuffle_buffer=5000):
    """
    Load the CSVs and build batched train/val/test ``tf.data`` pipelines.

    The training CSV is split into a training and a validation part with a
    split stratified on the ``label`` column; the test CSV is used as-is.
    Only the training pipeline is shuffled.

    Args:
        seed: Random state for the train/validation split and seed for the
            training shuffle.
        train_path: CSV with ``FASTA`` and ``label`` columns used for
            training and validation.
        test_path: CSV with ``FASTA`` and ``label`` columns used for testing.
        val_size: Value passed to ``train_test_split`` as ``test_size``.
        batch_size: Batch size applied to all three pipelines.
        shuffle_buffer: Buffer size of the training shuffle.

    Returns: (train_set, val_set, test_set)
        Batched ``tf.data.Dataset`` objects built from
        ``(FASTA values, label values)`` pairs.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Stratify so both parts keep the label proportions of the full file.
    X_train, X_val, y_train, y_val = train_test_split(
        train['FASTA'], train['label'],
        test_size=val_size,
        random_state=seed,
        stratify=train['label'],
    )

    def to_dataset(texts, labels):
        """Wrap a pair of pandas Series as an unbatched (text, label) dataset."""
        return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))

    train_set = (
        to_dataset(X_train, y_train)
        .shuffle(shuffle_buffer, seed=seed)
        .batch(batch_size)
        .prefetch(1)
    )
    val_set = to_dataset(X_val, y_val).batch(batch_size).prefetch(1)
    test_set = to_dataset(test['FASTA'], test['label']).batch(batch_size).prefetch(1)

    return train_set, val_set, test_set

In [None]:
def build_text_vectorizer(train_text, max_tokens=30, seq_length=50):
    """Build a character-level ``TextVectorization`` layer fitted on ``train_text``.

    Args:
        train_text: Text samples handed to ``adapt`` to learn the vocabulary.
        max_tokens: Value passed as the layer's ``max_tokens``.
        seq_length: Value passed as the layer's ``output_sequence_length``.

    Returns:
        The adapted ``TextVectorization`` layer.
    """
    vectorizer_options = dict(
        max_tokens=max_tokens,
        output_sequence_length=seq_length,
        split="character",
        # NOTE(review): Keras documents pad_to_max_tokens for the multi_hot /
        # count / tf_idf output modes; presumably a no-op with the default
        # integer output used here -- confirm.
        pad_to_max_tokens=True,
    )
    vectorizer = TextVectorization(**vectorizer_options)
    vectorizer.adapt(train_text)
    return vectorizer

In [None]:
class LNSimpleRNNCell(tf.keras.layers.Layer):
    """Simple RNN cell that layer-normalizes its output before the activation.

    Wraps a ``SimpleRNNCell`` created with no activation, applies
    ``LayerNormalization`` to its output and only then the requested
    activation. The normalized, activated output is also the next state.

    Args:
        units: Size of both the state and the per-step output.
        activation: Activation applied after normalization ("tanh" by default).
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, units, activation="tanh", **kwargs):
        super().__init__(**kwargs)
        # State and per-step output are both vectors of length `units`.
        self.state_size = self.output_size = units
        # The inner cell stays linear so that normalization runs pre-activation.
        self.simple_rnn_cell = tf.keras.layers.SimpleRNNCell(units, activation=None)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.activation = tf.keras.activations.get(activation)

    def call(self, inputs, states):
        # The inner cell's own new state is ignored; the state handed back is
        # the normalized, activated output instead.
        pre_activation, _ = self.simple_rnn_cell(inputs, states)
        step_output = self.activation(self.layer_norm(pre_activation))
        return step_output, [step_output]


In [None]:
class Attention(tf.keras.layers.Layer):
    """Parameter-free attention pooling over the time axis.

    Scores are ``tanh(inputs)``; a softmax over axis 1 turns them into
    weights, and the layer returns the weighted sum of ``inputs`` over that
    axis.

    NOTE: this class shadows the ``Attention`` layer imported from
    ``tensorflow.keras.layers`` in the notebook's import cell. No model
    builder visible in this notebook uses it (its only reference is commented
    out), so it is a candidate for removal.

    Args:
        **kwargs: Standard ``Layer`` options such as ``name``, forwarded to
            the base class in the same way ``LNSimpleRNNCell`` does.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        score = tf.nn.tanh(inputs)  # shape: (batch_size, timesteps, features)
        weights = tf.nn.softmax(score, axis=1)  # attention weights over axis 1
        return tf.reduce_sum(inputs * weights, axis=1)  # weighted sum over axis 1

In [None]:
# BEST MODEL USE THIS ONE
# Testing: remove max pooling
# Removing batch normalization + increasing the learning rate + decreasing the epochs: worse results

def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the recurrent classifier without a convolutional front end.

    Layer stack: text vectorization -> Embedding (mask_zero=True) ->
    layer-normalized SimpleRNN(128) -> GRU(64) -> softmax-weighted sum over
    axis 1 -> Dense(64, relu) x2 -> Dense(1, sigmoid).

    NOTE: later cells in this notebook define functions with this same name;
    when the cells run top to bottom, the last definition is the one that
    stays bound.

    Args:
        text_vec_layer: Adapted vectorization layer; the length of its
            vocabulary is used as the Embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy,
        AdamW (learning rate 0.001) and ROC-AUC / PR-AUC metrics.
    """
    kl = tf.keras.layers

    def attention_pool(seq):
        """Weight each position by a softmax over axis 1, then sum over that axis."""
        return tf.reduce_sum(seq * tf.nn.softmax(seq, axis=1), axis=1)

    n_tokens = len(text_vec_layer.get_vocabulary())

    stack = [
        text_vec_layer,
        kl.Embedding(n_tokens, embed_size, mask_zero=True),
        # Recurrent encoder; both layers keep the full output sequence.
        kl.RNN(LNSimpleRNNCell(128), return_sequences=True),
        kl.GRU(64, return_sequences=True),
        kl.Lambda(attention_pool),
        # Classification head.
        kl.Dense(64, activation='relu'),
        kl.Dense(64, activation='relu'),
        kl.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)
    tracked = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=tracked)
    return model

In [None]:
# Fourth BEST MODEL
# Mean ROC-AUC: 0.821
# Standard Deviation ROC-AUC: 0.014
# Mean PR-AUC: 0.470
# Standard Deviation PR-AUC: 0.029

def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the Conv1D + recurrent classifier with a dropout head.

    Layer stack: text vectorization -> Embedding (mask_zero=True) ->
    Conv1D(64, kernel 3) -> MaxPooling1D(2) -> layer-normalized
    SimpleRNN(128) -> GRU(64) -> softmax-weighted sum over axis 1 ->
    Dense(64, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    NOTE: other cells in this notebook define functions with this same name;
    when the cells run top to bottom, the last definition is the one that
    stays bound.
    NOTE(review): Conv1D and MaxPooling1D follow an Embedding created with
    mask_zero=True; confirm whether the padding mask survives these layers.

    Args:
        text_vec_layer: Adapted vectorization layer; the length of its
            vocabulary is used as the Embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy,
        Nadam (learning rate 0.001, clipnorm 1.0) and ROC-AUC / PR-AUC metrics.
    """
    kl = tf.keras.layers

    def attention_pool(seq):
        """Weight each position by a softmax over axis 1, then sum over that axis."""
        return tf.reduce_sum(seq * tf.nn.softmax(seq, axis=1), axis=1)

    n_tokens = len(text_vec_layer.get_vocabulary())

    stack = [
        text_vec_layer,
        kl.Embedding(n_tokens, embed_size, mask_zero=True),
        # Convolutional front end.
        kl.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        kl.MaxPooling1D(pool_size=2),
        # Recurrent encoder; both layers keep the full output sequence.
        kl.RNN(LNSimpleRNNCell(128), return_sequences=True),
        kl.GRU(64, return_sequences=True),
        kl.Lambda(attention_pool),
        # Classification head.
        kl.Dense(64, activation='relu'),
        kl.Dropout(0.3),
        kl.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, clipnorm=1.0)
    tracked = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=tracked)
    return model

In [None]:
# Third best Model
# Adds one more Dense layer
# Mean ROC-AUC: 0.823
# Standard Deviation ROC-AUC: 0.015
# Mean PR-AUC: 0.461
# Standard Deviation PR-AUC: 0.036

def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the Conv1D + recurrent classifier trained with Nadam.

    Layer stack: text vectorization -> Embedding (mask_zero=True) ->
    Conv1D(64, kernel 3) -> MaxPooling1D(2) -> layer-normalized
    SimpleRNN(128) -> GRU(64) -> softmax-weighted sum over axis 1 ->
    Dense(64, relu) x2 -> Dense(1, sigmoid).

    NOTE: other cells in this notebook define functions with this same name;
    when the cells run top to bottom, the last definition is the one that
    stays bound.
    NOTE(review): Conv1D and MaxPooling1D follow an Embedding created with
    mask_zero=True; confirm whether the padding mask survives these layers.

    Args:
        text_vec_layer: Adapted vectorization layer; the length of its
            vocabulary is used as the Embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy,
        Nadam (learning rate 0.001, clipnorm 1.0) and ROC-AUC / PR-AUC metrics.
    """
    kl = tf.keras.layers

    def attention_pool(seq):
        """Weight each position by a softmax over axis 1, then sum over that axis."""
        return tf.reduce_sum(seq * tf.nn.softmax(seq, axis=1), axis=1)

    n_tokens = len(text_vec_layer.get_vocabulary())

    stack = [
        text_vec_layer,
        kl.Embedding(n_tokens, embed_size, mask_zero=True),
        # Convolutional front end.
        kl.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        kl.MaxPooling1D(pool_size=2),
        # Recurrent encoder; both layers keep the full output sequence.
        kl.RNN(LNSimpleRNNCell(128), return_sequences=True),
        kl.GRU(64, return_sequences=True),
        kl.Lambda(attention_pool),
        # Classification head.
        kl.Dense(64, activation='relu'),
        kl.Dense(64, activation='relu'),
        kl.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, clipnorm=1.0)
    tracked = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=tracked)
    return model

In [None]:
# Second BEST MODEL
# Mean ROC-AUC: 0.827
# Standard Deviation ROC-AUC: 0.015
# Mean PR-AUC: 0.459
# Standard Deviation PR-AUC: 0.037


def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the Conv1D + recurrent classifier trained with RMSprop.

    Layer stack: text vectorization -> Embedding (mask_zero=True) ->
    Conv1D(64, kernel 3) -> MaxPooling1D(2) -> layer-normalized
    SimpleRNN(128) -> GRU(64) -> softmax-weighted sum over axis 1 ->
    Dense(64, relu) x2 -> Dense(1, sigmoid).

    NOTE: other cells in this notebook define functions with this same name;
    when the cells run top to bottom, the last definition is the one that
    stays bound.
    NOTE(review): Conv1D and MaxPooling1D follow an Embedding created with
    mask_zero=True; confirm whether the padding mask survives these layers.

    Args:
        text_vec_layer: Adapted vectorization layer; the length of its
            vocabulary is used as the Embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy,
        RMSprop (learning rate 0.001, rho 0.9) and ROC-AUC / PR-AUC metrics.
    """
    kl = tf.keras.layers

    def attention_pool(seq):
        """Weight each position by a softmax over axis 1, then sum over that axis."""
        return tf.reduce_sum(seq * tf.nn.softmax(seq, axis=1), axis=1)

    n_tokens = len(text_vec_layer.get_vocabulary())

    stack = [
        text_vec_layer,
        kl.Embedding(n_tokens, embed_size, mask_zero=True),
        # Convolutional front end.
        kl.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        kl.MaxPooling1D(pool_size=2),
        # Recurrent encoder; both layers keep the full output sequence.
        kl.RNN(LNSimpleRNNCell(128), return_sequences=True),
        kl.GRU(64, return_sequences=True),
        kl.Lambda(attention_pool),
        # Classification head.
        kl.Dense(64, activation='relu'),
        kl.Dense(64, activation='relu'),
        kl.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
    tracked = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=tracked)
    return model

In [None]:
# BEST MODEL
#Mean ROC-AUC: 0.827
#Standard Deviation ROC-AUC: 0.006
#Mean PR-AUC: 0.475
#Standard Deviation PR-AUC: 0.026

def build_improved_model(text_vec_layer, embed_size=128):
    """Build and compile the Conv1D + recurrent classifier trained with AdamW.

    Layer stack: text vectorization -> Embedding (mask_zero=True) ->
    Conv1D(64, kernel 3) -> MaxPooling1D(2) -> layer-normalized
    SimpleRNN(128) -> GRU(64) -> softmax-weighted sum over axis 1 ->
    Dense(64, relu) x2 -> Dense(1, sigmoid).

    NOTE: this is the last ``build_improved_model`` definition visible in the
    notebook, so it is the one left bound (and the one ``run_experiment``
    calls) when the cells run top to bottom; the earlier definitions with the
    same name are overridden.
    NOTE(review): Conv1D and MaxPooling1D follow an Embedding created with
    mask_zero=True; confirm whether the padding mask survives these layers.

    Args:
        text_vec_layer: Adapted vectorization layer; the length of its
            vocabulary is used as the Embedding input dimension.
        embed_size: Embedding output dimension.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy,
        AdamW (learning rate 0.001) and ROC-AUC / PR-AUC metrics.
    """
    kl = tf.keras.layers

    def attention_pool(seq):
        """Weight each position by a softmax over axis 1, then sum over that axis."""
        return tf.reduce_sum(seq * tf.nn.softmax(seq, axis=1), axis=1)

    n_tokens = len(text_vec_layer.get_vocabulary())

    stack = [
        text_vec_layer,
        kl.Embedding(n_tokens, embed_size, mask_zero=True),
        # Convolutional front end.
        kl.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        kl.MaxPooling1D(pool_size=2),
        # Recurrent encoder; both layers keep the full output sequence.
        kl.RNN(LNSimpleRNNCell(128), return_sequences=True),
        kl.GRU(64, return_sequences=True),
        kl.Lambda(attention_pool),
        # Classification head.
        kl.Dense(64, activation='relu'),
        kl.Dense(64, activation='relu'),
        kl.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)
    tracked = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=tracked)
    return model

In [None]:
def train_model(model, train_set, val_set, callbacks=None, epochs=30):
    """Fit ``model`` on ``train_set`` while validating on ``val_set``.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        train_set: Training data passed to ``fit``.
        val_set: Data passed to ``fit`` as ``validation_data``.
        callbacks: Optional list of callbacks; ``None`` (the default) means
            no callbacks.
        epochs: Maximum number of epochs.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    history = model.fit(
        train_set,
        validation_data=val_set,
        epochs=epochs,
        # fit() is always handed a list, never None.
        callbacks=[] if callbacks is None else callbacks,
        verbose=1,
    )
    return history

In [None]:
def evaluate_model(model, test_set, verbose=True):
    """Score ``model`` on ``test_set`` and return its loss, ROC-AUC and PR-AUC.

    Args:
        model: Compiled model whose ``evaluate`` yields exactly three values,
            read here as loss, ROC-AUC and PR-AUC in that order.
        test_set: Data passed to ``model.evaluate``.
        verbose: When truthy, print a short report of the three values.

    Returns:
        Dict with the keys ``'loss'``, ``'roc_auc'`` and ``'pr_auc'``.
    """
    metric_names = ('loss', 'roc_auc', 'pr_auc')
    # Unpacking into three names keeps the failure if evaluate() ever returns
    # a different number of values.
    test_loss, test_roc_auc, test_pr_auc = model.evaluate(test_set, verbose=0)
    metrics = dict(zip(metric_names, (test_loss, test_roc_auc, test_pr_auc)))

    if not verbose:
        return metrics

    rule = "=" * 50
    report = [
        "\n" + rule,
        "-" * 50,
        f"Test Loss:       {test_loss:.4f}",
        f"Test ROC-AUC:    {test_roc_auc:.4f}",
        f"Test PR-AUC:     {test_pr_auc:.4f}",
        rule + "\n",
    ]
    print("\n".join(report))
    return metrics

In [None]:
def run_experiment(seed):
    """Complete training pipeline for one seed.

    Seeds the random number generators, builds the train/val/test datasets,
    adapts the text vectorizer, builds and trains the model with
    checkpointing, early stopping and learning-rate reduction (all monitoring
    ``val_roc_auc``), and evaluates the trained model on the test set.

    Args:
        seed: Seed used for the RNGs and the train/validation split.

    Returns:
        ``(history, metrics)``: the value returned by ``train_model`` and the
        metrics dict returned by ``evaluate_model``.
    """
    print(f"\n=== Running Trial with seed {seed} ===")
    set_seeds(seed)

    train_set, val_set, test_set = data_preprocessing(seed)
    # NOTE(review): the vocabulary is adapted on the whole of train.csv, which
    # includes the rows data_preprocessing() holds out for validation --
    # confirm this is intended.
    train_text = pd.read_csv('train.csv')['FASTA']
    text_vec_layer = build_text_vectorizer(train_text)
    model = build_improved_model(text_vec_layer)

    # Halve the learning rate after 2 epochs without a val_roc_auc improvement.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_roc_auc',
        mode='max',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
    )

    # Every call writes to the same path, so later trials overwrite the file;
    # it is not read back inside this function.
    checkpoint_cb = ModelCheckpoint(
        "best_model.weights.h5",
        monitor='val_roc_auc',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
    )

    early_stopping_cb = EarlyStopping(
        monitor='val_roc_auc',
        mode='max',
        patience=3,
        restore_best_weights=True,
    )

    callbacks = [checkpoint_cb, early_stopping_cb, reduce_lr]
    history = train_model(model, train_set, val_set, callbacks)

    # The seed must not be passed here: evaluate_model's third parameter is
    # `verbose`, so a seed of 0 would silently suppress the report.
    metrics = evaluate_model(model, test_set)

    return history, metrics

In [None]:
# Run one full experiment per seed and collect the test-set metrics.
SEEDS = [1,2,3,4,5]
# Per-seed test scores, appended in SEEDS order. Re-running this cell starts
# both lists from empty again.
roc_auc_scores = []
pr_auc_scores = []

for seed in SEEDS:
    # The training history is discarded; only the metrics dict is kept.
    _, metrics = run_experiment(seed)
    roc_auc_scores.append(metrics['roc_auc'])
    pr_auc_scores.append(metrics['pr_auc'])


=== Running Trial with seed 1 ===
Epoch 1/30




[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 101ms/step - loss: 0.6476 - pr_auc: 0.6845 - roc_auc: 0.6712 - val_loss: 0.6021 - val_pr_auc: 0.7781 - val_roc_auc: 0.7560 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 93ms/step - loss: 0.5737 - pr_auc: 0.7959 - roc_auc: 0.7788 - val_loss: 0.5630 - val_pr_auc: 0.7990 - val_roc_auc: 0.7759 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - loss: 0.5185 - pr_auc: 0.8392 - roc_auc: 0.8198 - val_loss: 0.5263 - val_pr_auc: 0.8406 - val_roc_auc: 0.8220 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 81ms/step - loss: 0.4704 - pr_auc: 0.8574 - roc_auc: 0.8538 - val_loss: 0.5197 - val_pr_auc: 0.8403 - val_roc_auc: 0.8240 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - loss: 0.4347 - pr_auc: 0.8863 - roc_a



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 99ms/step - loss: 0.6533 - pr_auc: 0.6626 - roc_auc: 0.6610 - val_loss: 0.6081 - val_pr_auc: 0.7442 - val_roc_auc: 0.7260 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 80ms/step - loss: 0.5684 - pr_auc: 0.7930 - roc_auc: 0.7822 - val_loss: 0.5981 - val_pr_auc: 0.7948 - val_roc_auc: 0.7837 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - loss: 0.5246 - pr_auc: 0.8354 - roc_auc: 0.8160 - val_loss: 0.5306 - val_pr_auc: 0.8252 - val_roc_auc: 0.8119 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 122ms/step - loss: 0.4779 - pr_auc: 0.8627 - roc_auc: 0.8544 - val_loss: 0.5050 - val_pr_auc: 0.8311 - val_roc_auc: 0.8385 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 109ms/step - loss: 0.4344 - pr_auc: 0.8851 - roc_

KeyboardInterrupt: 

In [None]:
# Summarize the test metrics over the trials that actually completed; an
# interrupted run leaves fewer scores than seeds, so the count is taken from
# the collected scores rather than hard-coded.
n_trials = len(roc_auc_scores)
if n_trials == 0:
    print("\nNo completed trials to summarize")
else:
    print(f"\nResults of {n_trials} trials")
    # np.std uses its default ddof=0 here (population standard deviation).
    print(f"Mean ROC-AUC: {np.mean(roc_auc_scores):.3f}")
    print(f"Standard Deviation ROC-AUC: {np.std(roc_auc_scores):.3f}")
    print(f"Mean PR-AUC: {np.mean(pr_auc_scores):.3f}")
    print(f"Standard Deviation PR-AUC: {np.std(pr_auc_scores):.3f}")

Discussion:
Include a discussion at the end of your notebook (about what you have
learned, difficulties, what has worked and not worked, future directions,
etc.).