In [38]:
# Data Loading and Preprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import l2

print("--- Data Loading and Preprocessing ---")

def one_hot_encode_sequence(sequence, max_len):
    mapping = {
        'A': [1, 0, 0, 0], 'a': [1, 0, 0, 0],
        'T': [0, 1, 0, 0], 't': [0, 1, 0, 0],
        'G': [0, 0, 1, 0], 'g': [0, 0, 1, 0],
        'C': [0, 0, 0, 1], 'c': [0, 0, 0, 1],
        'N': [0, 0, 0, 0], 'n': [0, 0, 0, 0]
    }
    encoded = [mapping.get(char, [0, 0, 0, 0]) for char in sequence]
    encoded = np.array(encoded, dtype=np.float32)
    if len(encoded) < max_len:
        pad = np.zeros((max_len - len(encoded), 4), dtype=np.float32)
        encoded = np.vstack((encoded, pad))
    return encoded[:max_len]

train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw, y_train = train_df[0].values, train_df[1].values
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw, y_test = test_df[0].values, test_df[1].values

sequence_length = len(X_train_raw[0])
X_train = np.array([one_hot_encode_sequence(seq, sequence_length) for seq in X_train_raw])
X_test = np.array([one_hot_encode_sequence(seq, sequence_length) for seq in X_test_raw])
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

--- Data Loading and Preprocessing ---
Train shape: (28140, 1000, 4) Test shape: (2000, 1000, 4)


In [None]:
input_shape = (sequence_length, 4)
NUM_KERNELS_CNN = 64
KERNEL_LENGTH_CNN = 9
LSTM_NEURONS = 20
DROPOUT_RATE = 0.5
LEARNING_RATE = 0.001
EPOCHS = 50
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.2

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [40]:
# Model 1 	Model 1 – Simple CNN
from tensorflow.keras.regularizers import l2

model1 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu', kernel_regularizer=l2(0.0001)), # Added L2
    MaxPooling1D(2),
    Dropout(DROPOUT_RATE), # Could increase this
    Flatten(),
    Dense(64, activation='relu', kernel_regularizer=l2(0.0001)), # Added L2
    Dropout(DROPOUT_RATE), # Could increase this
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model1.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model1.evaluate(X_test, y_test, verbose=0)
print("Model 1 Test Accuracy:", accuracy)

Epoch 1/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step - accuracy: 0.6090 - loss: 0.6995 - val_accuracy: 0.0000e+00 - val_loss: 1.1455
Epoch 2/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.7129 - loss: 0.6138 - val_accuracy: 0.0000e+00 - val_loss: 1.3080
Epoch 3/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.7183 - loss: 0.6051 - val_accuracy: 0.0000e+00 - val_loss: 1.0250
Epoch 4/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.7120 - loss: 0.5916 - val_accuracy: 8.2919e-04 - val_loss: 1.2984
Epoch 5/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.7338 - loss: 0.5721 - val_accuracy: 0.0086 - val_loss: 1.6196
Epoch 6/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.7510 - loss: 0.5631 - val_accuracy: 0.3787 - val_loss: 0.9739


KeyboardInterrupt: 

In [None]:
# Model 2 Model 2 – Deeper CNN
model2 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(2), Dropout(DROPOUT_RATE),
    Conv1D(NUM_KERNELS_CNN * 2, KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(2), Dropout(DROPOUT_RATE), Flatten(),
    Dense(128, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
    
])
model2.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model2.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model2.evaluate(X_test, y_test, verbose=0)
print("Model 2 Test Accuracy:", accuracy)

Epoch 1/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 97ms/step - accuracy: 0.6156 - loss: 0.7049 - val_accuracy: 0.0000e+00 - val_loss: 1.2947
Epoch 2/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 120ms/step - accuracy: 0.7114 - loss: 0.5986 - val_accuracy: 0.0000e+00 - val_loss: 1.0351
Epoch 3/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 117ms/step - accuracy: 0.7172 - loss: 0.5656 - val_accuracy: 0.8239 - val_loss: 0.5984
Epoch 4/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 116ms/step - accuracy: 0.7429 - loss: 0.5216 - val_accuracy: 0.0779 - val_loss: 1.2512
Epoch 5/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 115ms/step - accuracy: 0.7475 - loss: 0.5139 - val_accuracy: 1.0000 - val_loss: 0.1601
Epoch 6/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 120ms/step - accuracy: 0.7440 - loss: 0.5203 - val_accuracy: 0.9203 - val_loss: 0.4074
Epo

In [None]:
# Model 3 	Model 3 – Simple LSTM e20
model3 = Sequential([
    Input(shape=input_shape),
    LSTM(LSTM_NEURONS), Dropout(DROPOUT_RATE),
    Dense(64, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model3.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model3.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model3.evaluate(X_test, y_test, verbose=0)
print("Model 3 Test Accuracy:", accuracy)


Epoch 1/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 91ms/step - accuracy: 0.6036 - loss: 0.7117 - val_accuracy: 0.0000e+00 - val_loss: 1.2168
Epoch 2/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 87ms/step - accuracy: 0.7035 - loss: 0.6151 - val_accuracy: 0.0000e+00 - val_loss: 1.2441
Epoch 3/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 87ms/step - accuracy: 0.7193 - loss: 0.5976 - val_accuracy: 0.0000e+00 - val_loss: 1.2207
Epoch 4/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 88ms/step - accuracy: 0.7135 - loss: 0.6001 - val_accuracy: 0.0000e+00 - val_loss: 1.2485
Epoch 5/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 87ms/step - accuracy: 0.7154 - loss: 0.5981 - val_accuracy: 0.0000e+00 - val_loss: 1.2551
Epoch 6/20
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 87ms/step - accuracy: 0.7127 - loss: 0.6003 - val_accuracy: 0.0000e+00 - val_loss:

In [None]:
# Model 4 Model 4 – Deeper LSTM
# epoch set to as this model is very slow causing problems and heating up my latop
model4 = Sequential([
    Input(shape=input_shape),
    LSTM(10, return_sequences=True), Dropout(DROPOUT_RATE),
    LSTM(10), Dropout(DROPOUT_RATE),
    Dense(128, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model4.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model4.fit(X_train, y_train, epochs=5, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model4.evaluate(X_test, y_test, verbose=0)
print("Model 4 Test Accuracy:", accuracy)

Epoch 1/5


KeyboardInterrupt: 

In [None]:
# Model 5 CNN-LSTM Hybrid
model5 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(2), Dropout(DROPOUT_RATE),
    LSTM(LSTM_NEURONS), Dropout(DROPOUT_RATE),
    Dense(64, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model5.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model5.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model5.evaluate(X_test, y_test, verbose=0)
print("Model 5 Test Accuracy:", accuracy)

Epoch 1/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 85ms/step - accuracy: 0.5972 - loss: 0.7100 - val_accuracy: 0.0000e+00 - val_loss: 1.2039
Epoch 2/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 85ms/step - accuracy: 0.7049 - loss: 0.6163 - val_accuracy: 0.0000e+00 - val_loss: 1.2067
Epoch 3/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.7138 - loss: 0.6042 - val_accuracy: 0.0000e+00 - val_loss: 1.2071
Epoch 4/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.7150 - loss: 0.6004 - val_accuracy: 0.0000e+00 - val_loss: 1.2166
Epoch 5/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.7153 - loss: 0.5988 - val_accuracy: 0.0000e+00 - val_loss: 1.2414
Epoch 6/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.7167 - loss: 0.5967 - val_accuracy: 0.0000e+00 - val_loss:

In [None]:
# Model 6 CNN with More Dense Layers
model6 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(2), Dropout(DROPOUT_RATE), Flatten(),
    Dense(128, activation='relu'), Dropout(DROPOUT_RATE),
    Dense(64, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model6.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model6.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model6.evaluate(X_test, y_test, verbose=0)
print("Model 6 Test Accuracy:", accuracy)

Epoch 1/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.5873 - loss: 0.7233 - val_accuracy: 0.0000e+00 - val_loss: 1.2352
Epoch 2/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.7031 - loss: 0.6179 - val_accuracy: 0.0000e+00 - val_loss: 1.2672
Epoch 3/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - accuracy: 0.7116 - loss: 0.6071 - val_accuracy: 0.0000e+00 - val_loss: 1.2660
Epoch 4/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.7108 - loss: 0.6064 - val_accuracy: 0.0000e+00 - val_loss: 1.3412
Epoch 5/50
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.7163 - loss: 0.6003

KeyboardInterrupt: 

In [None]:
# Model 7 LSTM with More Dense Layers
model7 = Sequential([
    Input(shape=input_shape),
    LSTM(LSTM_NEURONS), Dropout(DROPOUT_RATE),
    Dense(128, activation='relu'), Dropout(DROPOUT_RATE),
    Dense(64, activation='relu'), Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model7.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model7.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT, callbacks=[early_stopping], verbose=1)
loss, accuracy = model7.evaluate(X_test, y_test, verbose=0)
print("Model 7 Test Accuracy:", accuracy)

Epoch 1/50
[1m 49/308[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m20s[0m 79ms/step - accuracy: 0.5124 - loss: 0.8128

KeyboardInterrupt: 