In [1]:
 # Data Loading and Preprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # Added ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import l2

print("--- Data Loading and Preprocessing ---")

def one_hot_encode_sequence(sequence, max_len):
    mapping = {
        'A': [1, 0, 0, 0], 'a': [1, 0, 0, 0],
        'T': [0, 1, 0, 0], 't': [0, 1, 0, 0],
        'G': [0, 0, 1, 0], 'g': [0, 0, 1, 0],
        'C': [0, 0, 0, 1], 'c': [0, 0, 0, 1],
        'N': [0, 0, 0, 0], 'n': [0, 0, 0, 0]
    }
    encoded = [mapping.get(char, [0, 0, 0, 0]) for char in sequence]
    encoded = np.array(encoded, dtype=np.float32)
    if len(encoded) < max_len:
        pad = np.zeros((max_len - len(encoded), 4), dtype=np.float32)
        encoded = np.vstack((encoded, pad))
    return encoded[:max_len]

train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw, y_train = train_df[0].values, train_df[1].values
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw, y_test = test_df[0].values, test_df[1].values

sequence_length = len(X_train_raw[0])
X_train = np.array([one_hot_encode_sequence(seq, sequence_length) for seq in X_train_raw])
X_test = np.array([one_hot_encode_sequence(seq, sequence_length) for seq in X_test_raw])
print("Train shape:", X_train.shape, "Test shape:", X_test.shape) # Outputting shapes to verify tensor preparation [cite: 13]
input_shape = (sequence_length, 4)

--- Data Loading and Preprocessing ---
Train shape: (28140, 1000, 4) Test shape: (2000, 1000, 4)


In [3]:

# These values are adjusted based on typical performance characteristics for long sequences
# and common ranges suggested in the problem statement[cite: 22, 23, 24, 26, 27].

NUM_KERNELS_CNN = 96 # Increased from 64 to 96 (max suggested kernel number) for more capacity in CNNs 
KERNEL_LENGTH_CNN = 15 # Increased from 9 to 15 (max suggested kernel length) to capture longer motifs 
LSTM_NEURONS = 40 # Increased from 20 to 40 (max suggested LSTM neuron number) for more LSTM capacity 
DROPOUT_RATE = 0.3 # Reduced from 0.5 to 0.3. Less aggressive dropout if underfitting, or to find a better balance with L2. 
LEARNING_RATE = 0.0005 # Reduced from 0.001 to 0.0005. Can help with stability and fine-tuning. 
EPOCHS = 100 # Increased from 50 to 100 to give models more time to converge, relying on early stopping 
BATCH_SIZE = 64 # Keeping 64, good balance for CPU.
VALIDATION_SPLIT = 0.2 # Keeping 0.2 for consistency, but 0.3 is also fine. 

# --- Callbacks ---
# Patience increased for early stopping to allow more learning given initial low performance.
# Added ReduceLROnPlateau to dynamically adjust learning rate, which is crucial for CPU training stability and accuracy.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True) # Increased patience from 10 to 15
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001, verbose=1)


In [4]:
# Model 1 	Model 1 – Simple CNN
# Focus: Balance regularization (L2, Dropout) with capacity to avoid both underfitting and overfitting.
print("\n--- Model 1: Simple CNN ---")
model1 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu', kernel_regularizer=l2(0.00005)), # Smaller L2 for better balance
    MaxPooling1D(2), # All pooling layers are local max pooling layers [cite: 28]
    Dropout(DROPOUT_RATE), # Include dropout in CNN layers 
    Flatten(),
    Dense(64, activation='relu', kernel_regularizer=l2(0.00005)), # Smaller L2
    Dropout(DROPOUT_RATE), # Include dropout in Dense layers 
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model1.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT,
           callbacks=[early_stopping, reduce_lr], verbose=1) # Added reduce_lr
loss, accuracy = model1.evaluate(X_test, y_test, verbose=0)
print(f"Model 1 Test Accuracy: {accuracy:.4f}")


--- Model 1: Simple CNN ---
Epoch 1/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 33ms/step - accuracy: 0.5388 - loss: 0.7529 - val_accuracy: 0.0409 - val_loss: 0.9780 - learning_rate: 5.0000e-04
Epoch 2/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.6453 - loss: 0.6371 - val_accuracy: 0.4954 - val_loss: 0.7535 - learning_rate: 5.0000e-04
Epoch 3/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.7598 - loss: 0.5273 - val_accuracy: 0.5105 - val_loss: 0.8503 - learning_rate: 5.0000e-04
Epoch 4/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.8336 - loss: 0.4106 - val_accuracy: 0.3989 - val_loss: 1.1955 - learning_rate: 5.0000e-04
Epoch 5/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - accuracy: 0.8863 - loss: 0.3254 - val_accuracy: 0.8699 - val_loss: 0.3812 - learning_rate: 5.0000e-04
Epoch 

In [5]:
# Model 2 Model 2 – Deeper CNN
# Focus: Increased depth and capacity.
print("\n--- Model 2: Deeper CNN ---")
model2 = Sequential([
    Input(shape=input_shape),
    Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu', kernel_regularizer=l2(0.00005)),
    MaxPooling1D(2), Dropout(DROPOUT_RATE),
    Conv1D(NUM_KERNELS_CNN * 2, KERNEL_LENGTH_CNN, activation='relu', kernel_regularizer=l2(0.00005)), # Doubled kernels in second layer
    MaxPooling1D(2), Dropout(DROPOUT_RATE),
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.00005)), # Increased Dense neurons 
    Dropout(DROPOUT_RATE),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])
model2.compile(optimizer=Adam(LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy'])
model2.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT,
           callbacks=[early_stopping, reduce_lr], verbose=1) # Added reduce_lr
loss, accuracy = model2.evaluate(X_test, y_test, verbose=0)
print(f"Model 2 Test Accuracy: {accuracy:.4f}")


--- Model 2: Deeper CNN ---
Epoch 1/100
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 232ms/step - accuracy: 0.5433 - loss: 0.7660 - val_accuracy: 5.3305e-04 - val_loss: 1.2101 - learning_rate: 5.0000e-04
Epoch 2/100
[1m245/352[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m33s[0m 309ms/step - accuracy: 0.6410 - loss: 0.6573

KeyboardInterrupt: 