<a href="https://colab.research.google.com/github/leahyye/DATA305-Project/blob/main/Final_Project_LeahYe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA305 - Project

* Student Name: Leah Ye
* Student ID: 300651931
* Course code: DATA 305


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Attention
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [2]:
def set_seeds(seed):
  """Seed the random number generators used in this notebook.

  Sets the PYTHONHASHSEED and TF_DETERMINISTIC_OPS environment variables and
  seeds Python's `random` module, NumPy and TensorFlow with the same value.

  Args:
    seed: Value handed to every seeding call (stringified for the
      environment variable).
  """
  # Environment flags are independent of the seeding calls below.
  os.environ.update({
      "PYTHONHASHSEED": str(seed),
      "TF_DETERMINISTIC_OPS": "1",
  })

  # Apply the same seed to each library in turn.
  for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    apply_seed(seed)

In [4]:
# Upload the CSV files through Colab, then read them from the working directory.
from google.colab import files
import pandas as pd

# NOTE(review): the reads below presumably rely on the uploaded files being
# named train.csv and test.csv - confirm when uploading.
uploaded = files.upload()

train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

# Report the row count of each file, one line per file.
print(
    f"Loaded {len(train_set)} rows",
    f"Loaded {len(test_set)} rows",
    sep="\n",
)

Saving test.csv to test.csv
Saving train.csv to train.csv
Loaded 2352 rows
Loaded 3370 rows


In [5]:
def data_preprocessing(seed, train_path='train.csv', test_path='test.csv',
                       batch_size=32, val_fraction=0.2, shuffle_buffer=5000):
    """
    Load the CSV files and build batched train/val/test tf.data pipelines.

    The training file is split into training and validation parts with a
    stratified `train_test_split`; the test file is used as-is.

    Args:
        seed: Random state for the split and seed for the training shuffle.
        train_path: CSV with 'FASTA' and 'label' columns used for train/val.
        test_path: CSV with 'FASTA' and 'label' columns used for testing.
        batch_size: Number of examples per batch in all three datasets.
        val_fraction: Share of the training file held out for validation.
        shuffle_buffer: Buffer size of the training-set shuffle.

    Returns: (train_set, val_set, test_set)

    Raises:
        KeyError: if either file lacks the 'FASTA' or 'label' column.
    """
    # Load data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Fail early with a readable message instead of a bare column lookup error.
    for split_name, frame in (('train', train), ('test', test)):
        missing = {'FASTA', 'label'} - set(frame.columns)
        if missing:
            raise KeyError(
                f"{split_name} data is missing column(s): {sorted(missing)}"
            )

    def _as_dataset(texts, labels, shuffle=False):
        # One place for the pipeline shared by all three splits; only the
        # training split is shuffled (before batching).
        dataset = tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
        if shuffle:
            dataset = dataset.shuffle(shuffle_buffer, seed=seed)
        return dataset.batch(batch_size).prefetch(1)

    # Split train into train/val, keeping the label proportions in both parts
    X_train, X_val, y_train, y_val = train_test_split(
        train['FASTA'], train['label'],
        test_size=val_fraction,
        random_state=seed,
        stratify=train['label']
    )

    train_set = _as_dataset(X_train, y_train, shuffle=True)
    val_set = _as_dataset(X_val, y_val)
    test_set = _as_dataset(test['FASTA'], test['label'])

    return train_set, val_set, test_set

In [6]:
# Text vectorisation
def build_text_vectorizer(train_text, max_tokens=30, seq_length=50):
    """Build a character-splitting TextVectorization layer and adapt it.

    Args:
        train_text: Text passed to the layer's `adapt` call.
        max_tokens: Forwarded as the layer's `max_tokens`.
        seq_length: Forwarded as the layer's `output_sequence_length`.

    Returns:
        The adapted TextVectorization layer.
    """
    vectorizer_options = {
        "max_tokens": max_tokens,
        "output_sequence_length": seq_length,
        "split": "character",
        # NOTE(review): confirm this flag has an effect for the layer's
        # default output mode.
        "pad_to_max_tokens": True,
    }
    vectorizer = TextVectorization(**vectorizer_options)
    vectorizer.adapt(train_text)
    return vectorizer

In [7]:
class LNSimpleRNNCell(tf.keras.layers.Layer):
    """Simple RNN cell with layer normalisation applied before the activation.

    The wrapped SimpleRNNCell is created without an activation; its output is
    layer-normalised and only then passed through the requested activation.
    """

    def __init__(self, units, activation="tanh", **kwargs):
        """Create the cell.

        Args:
            units: Size of the cell's state and output.
            activation: Activation identifier resolved via
                `tf.keras.activations.get`.
            **kwargs: Forwarded to the base Layer constructor.
        """
        super().__init__(**kwargs)
        # Sizes read by the wrapping RNN layer.
        self.state_size = units
        self.output_size = units
        # The inner cell is linear; the non-linearity is applied in `call`.
        self.simple_rnn_cell = tf.keras.layers.SimpleRNNCell(
            units, activation=None
        )
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.activation = tf.keras.activations.get(activation)

    def call(self, inputs, states):
        """Run one step and return (output, [new_state]).

        The state returned by the inner cell is discarded; the normalised,
        activated output is used as both the step output and the new state.
        """
        pre_activation, _ = self.simple_rnn_cell(inputs, states)
        normalised = self.layer_norm(pre_activation)
        hidden = self.activation(normalised)
        return hidden, [hidden]

In [14]:
# BEST MODEL
def build_improved_model(text_vec_layer, embed_size=128):
    """Assemble and compile the sequence classifier.

    Args:
        text_vec_layer: Adapted text vectorisation layer; used as the first
            layer and as the source of the vocabulary size.
        embed_size: Output dimension of the embedding layer.

    Returns:
        A compiled Sequential model with a single sigmoid output, trained
        with binary cross-entropy and tracked with ROC-AUC and PR-AUC.
    """
    vocab_size = len(text_vec_layer.get_vocabulary())

    def softmax_weighted_sum(x):
        # Weight each value by a softmax taken along axis 1, then sum that
        # axis away (axis 1 is presumably the time dimension - confirm).
        weights = tf.nn.softmax(x, axis=1)
        return tf.reduce_sum(x * weights, axis=1)

    # Layers are created in the order in which they are stacked.
    layer_stack = [
        # Text vectorisation layer
        text_vec_layer,
        # Embedding layer
        tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
        # Conv1D
        tf.keras.layers.Conv1D(
            filters=64, kernel_size=3, activation='relu', padding='same'
        ),
        # Layer-normalised simple RNN
        tf.keras.layers.RNN(LNSimpleRNNCell(128), return_sequences=True),
        # GRU
        tf.keras.layers.GRU(64, return_sequences=True),
        # Softmax-weighted pooling (the notebook's "attention" step)
        tf.keras.layers.Lambda(softmax_weighted_sum),
        # Two dense layers
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        # Output
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.Sequential(layer_stack)

    optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)
    tracked_metrics = [
        AUC(name='roc_auc', curve='ROC'),
        AUC(name='pr_auc', curve='PR'),
    ]
    model.compile(
        loss="binary_crossentropy",
        optimizer=optimizer,
        metrics=tracked_metrics,
    )
    return model

In [15]:
def train_model(model, train_set, val_set, callbacks=None, epochs=30):
    """Fit `model` on `train_set`, validating on `val_set` after each epoch.

    Args:
        model: Object exposing a Keras-style `fit` method.
        train_set: Training data passed to `fit`.
        val_set: Data passed to `fit` as `validation_data`.
        callbacks: Optional list of callbacks; omitted or None means no
            callbacks.
        epochs: Maximum number of epochs to train for.

    Returns:
        Whatever `model.fit` returns (the training history for Keras models).
    """
    # The None check only helps callers if the argument can actually be
    # left out, hence the None default above.
    if callbacks is None:
        callbacks = []

    history = model.fit(
        train_set,
        validation_data=val_set,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1
    )
    return history

In [16]:
# Evaluate the performance of the model on the test set
def evaluate_model(model, test_set, verbose=True):
    """Evaluate `model` on `test_set` and collect the resulting scores.

    Args:
        model: Object whose `evaluate` returns three values, unpacked here
            as loss, ROC-AUC and PR-AUC in that order.
        test_set: Data passed to `model.evaluate`.
        verbose: When truthy, print a formatted summary of the scores.

    Returns:
        dict with keys 'loss', 'roc_auc' and 'pr_auc'.
    """
    loss, roc_auc, pr_auc = model.evaluate(test_set, verbose=0)
    metrics = dict(loss=loss, roc_auc=roc_auc, pr_auc=pr_auc)

    if verbose:
        heavy_rule = "=" * 50
        report_lines = [
            "\n" + heavy_rule,
            "-" * 50,
            f"Test Loss:       {loss:.4f}",
            f"Test ROC-AUC:    {roc_auc:.4f}",
            f"Test PR-AUC:     {pr_auc:.4f}",
            heavy_rule + "\n",
        ]
        # A single print of the joined lines emits the same text as printing
        # each line separately.
        print("\n".join(report_lines))

    return metrics

In [17]:
# Train and fit model for each seed
def run_experiment(seed):
    """Run one seeded trial: build the data, train the model, test it.

    Args:
        seed: Seed applied to the random number generators, the train/val
            split and the training shuffle.

    Returns:
        (history, metrics): the value returned by `train_model` and the
        metrics dict returned by `evaluate_model` for the test set.
    """
    print(f"\n=== Running Trial with seed {seed} ===")
    set_seeds(seed)

    train_set, val_set, test_set = data_preprocessing(seed)
    # NOTE(review): the vectoriser is adapted on every row of train.csv,
    # including the rows data_preprocessing holds out for validation.
    # Consider adapting on the training split only.
    train_text = pd.read_csv('train.csv')['FASTA']
    text_vec_layer = build_text_vectorizer(train_text)
    model = build_improved_model(text_vec_layer)

    # All three callbacks track the same quantity: validation ROC-AUC,
    # where higher is better.
    reduce_lr = ReduceLROnPlateau(
        monitor='val_roc_auc',
        mode='max',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
    )

    # The weights file has a fixed name, so each trial overwrites the
    # previous trial's file.
    checkpoint_cb = ModelCheckpoint(
        "best_model.weights.h5",
        monitor='val_roc_auc',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
    )

    early_stopping_cb = EarlyStopping(
        monitor='val_roc_auc',
        mode='max',
        patience=3,
        restore_best_weights=True,
    )

    callbacks = [checkpoint_cb, early_stopping_cb, reduce_lr]
    history = train_model(model, train_set, val_set, callbacks)

    # Pass verbose by keyword: the third positional parameter of
    # evaluate_model is `verbose`, not the seed, so passing `seed` there
    # would silence the report for seed 0.
    metrics = evaluate_model(model, test_set, verbose=True)

    return history, metrics

In [18]:
# Execution of reproducible results
SEEDS = [1, 2, 3, 4, 5]
roc_auc_scores, pr_auc_scores = [], []

# One full train/evaluate cycle per seed; only the test metrics are kept.
for seed in SEEDS:
    _, metrics = run_experiment(seed)
    roc_auc_scores += [metrics['roc_auc']]
    pr_auc_scores += [metrics['pr_auc']]


=== Running Trial with seed 1 ===
Epoch 1/30




[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 129ms/step - loss: 0.6720 - pr_auc: 0.6309 - roc_auc: 0.5899 - val_loss: 0.6134 - val_pr_auc: 0.7826 - val_roc_auc: 0.7717 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 114ms/step - loss: 0.6039 - pr_auc: 0.7660 - roc_auc: 0.7217 - val_loss: 0.5475 - val_pr_auc: 0.8162 - val_roc_auc: 0.7810 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 85ms/step - loss: 0.5191 - pr_auc: 0.8401 - roc_auc: 0.8155 - val_loss: 0.5017 - val_pr_auc: 0.8623 - val_roc_auc: 0.8485 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 103ms/step - loss: 0.4926 - pr_auc: 0.8430 - roc_auc: 0.8398 - val_loss: 0.4998 - val_pr_auc: 0.8613 - val_roc_auc: 0.8496 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 85ms/step - loss: 0.4364 - pr_auc: 0.8957 - roc_



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 107ms/step - loss: 0.6863 - pr_auc: 0.5624 - roc_auc: 0.5469 - val_loss: 0.6258 - val_pr_auc: 0.7148 - val_roc_auc: 0.7191 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 112ms/step - loss: 0.6012 - pr_auc: 0.7433 - roc_auc: 0.7365 - val_loss: 0.7033 - val_pr_auc: 0.8166 - val_roc_auc: 0.8027 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - loss: 0.5543 - pr_auc: 0.8162 - roc_auc: 0.8100 - val_loss: 0.5004 - val_pr_auc: 0.8434 - val_roc_auc: 0.8466 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - loss: 0.4535 - pr_auc: 0.8797 - roc_auc: 0.8669 - val_loss: 0.5263 - val_pr_auc: 0.8357 - val_roc_auc: 0.8411 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step - loss: 0.4527 - pr_auc: 0.8782 - roc



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 107ms/step - loss: 0.6761 - pr_auc: 0.5973 - roc_auc: 0.5758 - val_loss: 0.5920 - val_pr_auc: 0.7652 - val_roc_auc: 0.7454 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 112ms/step - loss: 0.5950 - pr_auc: 0.7578 - roc_auc: 0.7284 - val_loss: 0.5979 - val_pr_auc: 0.8170 - val_roc_auc: 0.7979 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 87ms/step - loss: 0.5362 - pr_auc: 0.8190 - roc_auc: 0.8070 - val_loss: 0.5300 - val_pr_auc: 0.8541 - val_roc_auc: 0.8418 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 111ms/step - loss: 0.4500 - pr_auc: 0.8831 - roc_auc: 0.8711 - val_loss: 0.4874 - val_pr_auc: 0.8571 - val_roc_auc: 0.8529 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 92ms/step - loss: 0.4522 - pr_auc: 0.8740 - roc_



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 118ms/step - loss: 0.6855 - pr_auc: 0.5601 - roc_auc: 0.5487 - val_loss: 0.6900 - val_pr_auc: 0.6396 - val_roc_auc: 0.6270 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 88ms/step - loss: 0.6733 - pr_auc: 0.6300 - roc_auc: 0.6543 - val_loss: 0.6481 - val_pr_auc: 0.7202 - val_roc_auc: 0.6980 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 113ms/step - loss: 0.6204 - pr_auc: 0.7003 - roc_auc: 0.7130 - val_loss: 0.6604 - val_pr_auc: 0.7855 - val_roc_auc: 0.7597 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 87ms/step - loss: 0.6281 - pr_auc: 0.7078 - roc_auc: 0.7157 - val_loss: 0.6079 - val_pr_auc: 0.8261 - val_roc_auc: 0.8079 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 113ms/step - loss: 0.4993 - pr_auc: 0.8484 - roc_



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 121ms/step - loss: 0.6739 - pr_auc: 0.6083 - roc_auc: 0.6002 - val_loss: 0.6221 - val_pr_auc: 0.7399 - val_roc_auc: 0.7191 - learning_rate: 0.0010
Epoch 2/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - loss: 0.6033 - pr_auc: 0.7577 - roc_auc: 0.7335 - val_loss: 0.5763 - val_pr_auc: 0.8012 - val_roc_auc: 0.7830 - learning_rate: 0.0010
Epoch 3/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - loss: 0.5638 - pr_auc: 0.7760 - roc_auc: 0.7847 - val_loss: 0.5299 - val_pr_auc: 0.8314 - val_roc_auc: 0.8205 - learning_rate: 0.0010
Epoch 4/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 117ms/step - loss: 0.4839 - pr_auc: 0.8374 - roc_auc: 0.8469 - val_loss: 0.5286 - val_pr_auc: 0.8334 - val_roc_auc: 0.8344 - learning_rate: 0.0010
Epoch 5/30
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 88ms/step - loss: 0.4631 - pr_auc: 0.8735 - roc_a

In [19]:
# Summarise the per-seed test scores collected by the trial loop.
print("\nResults of 5 trials")
for metric_label, trial_scores in (
    ("ROC-AUC", roc_auc_scores),
    ("PR-AUC", pr_auc_scores),
):
    # np.std is called with its defaults, as in the per-metric prints it replaces.
    print(f"Mean {metric_label}: {np.mean(trial_scores):.3f}")
    print(f"Standard Deviation {metric_label}: {np.std(trial_scores):.3f}")


Results of 5 trials
Mean ROC-AUC: 0.831
Standard Deviation ROC-AUC: 0.010
Mean PR-AUC: 0.480
Standard Deviation PR-AUC: 0.026


Discussion:
*   I learnt how to develop a functional model trained on sequence data and how to refine it using different deep neural network components, including CNN, RNN and GRU layers, attention mechanisms, bidirectional layers, embeddings, and masking.
*   Data preprocessing was a bit of a challenge, and took a while, because I had not dealt with sequence data before.
*   To test the model, I started with a simple architecture and gradually added complexity (layers) with the aim of improving its ROC-AUC. This worked well because it allowed me to see which parameters and layers caused the model to overfit.
* Trying to improve the ROC-AUC was the most difficult part. Simply adjusting the learning rate and the input parameters of the model layers didn't seem to do the job.
* In the future, it may be worth researching which types of data/output the different types of neural networks work well with, depending on the performance metric we are trying to optimise. Next time, I will consult the TensorFlow website before fitting models through trial and error.



