In [7]:
# Data Loading and Preprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import l2

# Banner line that marks the start of this cell's output in the notebook log.
print("--- Data Loading and Preprocessing ---")

def one_hot_encode_sequence(sequence, max_len):
    """One-hot encode a nucleotide sequence into a fixed-size float32 matrix.

    Each character becomes one row with four channels in the order
    A, T, G, C (upper- or lower-case). 'N'/'n' and any character that is
    not in the table are encoded as an all-zero row.

    Args:
        sequence: Iterable of single characters (normally a ``str``).
        max_len: Number of rows in the result. Shorter sequences are
            zero-padded at the end; longer sequences are truncated.

    Returns:
        ``numpy.ndarray`` of dtype float32 with four columns and ``max_len``
        rows (for non-negative ``max_len``).

    Note:
        An empty ``sequence`` yields an all-zero matrix. Building the array
        directly from an empty list would give a 1-D array of shape ``(0,)``
        that ``np.vstack`` cannot combine with the ``(max_len, 4)`` padding.
    """
    mapping = {
        'A': [1, 0, 0, 0], 'a': [1, 0, 0, 0],
        'T': [0, 1, 0, 0], 't': [0, 1, 0, 0],
        'G': [0, 0, 1, 0], 'g': [0, 0, 1, 0],
        'C': [0, 0, 0, 1], 'c': [0, 0, 0, 1],
        'N': [0, 0, 0, 0], 'n': [0, 0, 0, 0]
    }
    unknown = [0, 0, 0, 0]
    rows = [mapping.get(char, unknown) for char in sequence]

    # Allocate with an explicit (n, 4) shape so the array stays 2-D even when
    # there are no rows at all.
    encoded = np.zeros((len(rows), 4), dtype=np.float32)
    if rows:
        encoded[:] = rows

    if len(encoded) < max_len:
        pad = np.zeros((max_len - len(encoded), 4), dtype=np.float32)
        encoded = np.vstack((encoded, pad))
    return encoded[:max_len]

# Training split: header-less CSV; column 0 is fed to the one-hot encoder and
# column 1 is used as the target vector.
train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw = train_df[0].values
y_train = train_df[1].values

# Test split: same two-column layout as the training file.
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw = test_df[0].values
y_test = test_df[1].values

# The first training sequence sets the length every sequence is padded or
# truncated to; each position carries four one-hot channels.
sequence_length = len(X_train_raw[0])
input_shape = (sequence_length, 4)

X_train = np.array(
    [one_hot_encode_sequence(raw_seq, sequence_length) for raw_seq in X_train_raw]
)
X_test = np.array(
    [one_hot_encode_sequence(raw_seq, sequence_length) for raw_seq in X_test_raw]
)

# Print both tensor shapes as a sanity check before any model is built.
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


--- Data Loading and Preprocessing ---
Train shape: (28140, 1000, 4) Test shape: (2000, 1000, 4)


In [8]:
# --- Shared training configuration (tuned with CPU training in mind) ---
# Every model cell below reads these constants and reuses the two callbacks.

# Convolution settings used by the CNN models.
NUM_KERNELS_CNN = 96    # filters in the first Conv1D layer
KERNEL_LENGTH_CNN = 15  # Conv1D kernel width

# LSTM widths and dropout rates are not defined here; each model cell writes
# its own values inline.

# Optimisation settings.
# NOTE(review): an earlier comment on this line said "Keeping 0.0005", but the
# value in effect is 0.0001 - confirm which one was intended.
LEARNING_RATE = 1e-4
EPOCHS = 100            # upper bound; early stopping usually ends training sooner
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.2  # fraction of the training arrays held out for validation

# --- Callbacks ---
# Stop once val_loss has not improved for 15 epochs and roll back to the best
# weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)

print("\n--- Training Models ---")


--- Training Models ---


In [11]:
# Model 1 – Simple CNN
# One convolution + max-pooling stage, flattened into a small dense head with a
# sigmoid output for binary classification.
print("\n--- Model 1: Simple CNN ---")

model1 = Sequential()
model1.add(Input(shape=input_shape))
model1.add(Conv1D(filters=NUM_KERNELS_CNN, kernel_size=KERNEL_LENGTH_CNN,
                  activation='relu', kernel_regularizer=l2(5e-5)))
model1.add(MaxPooling1D(pool_size=2))  # local (non-global) max pooling
model1.add(Dropout(rate=0.3))  # rate is written inline; no DROPOUT_RATE constant is active
model1.add(Flatten())
model1.add(Dense(units=64, activation='relu', kernel_regularizer=l2(5e-5)))
model1.add(Dropout(rate=0.3))
model1.add(BatchNormalization())
model1.add(Dense(units=1, activation='sigmoid'))

model1.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train with the shared callbacks; a VALIDATION_SPLIT fraction of the training
# arrays is held out to compute val_loss.
model1.fit(
    x=X_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Report accuracy on the held-out test set.
loss, accuracy = model1.evaluate(X_test, y_test, verbose=0)
print(f"Model 1 Test Accuracy: {accuracy:.4f}")


--- Model 1: Simple CNN ---
Epoch 1/100
[1m 29/352[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 31ms/step - accuracy: 0.4938 - loss: 0.8497

KeyboardInterrupt: 

In [None]:
# Model 2 – Deeper CNN
# Two convolution + max-pooling stages (the second with twice as many filters),
# flattened into a 128-unit dense head with a sigmoid output.
print("\n--- Model 2: Deeper CNN ---")

model2 = Sequential()
model2.add(Input(shape=input_shape))

# Stage 1
model2.add(Conv1D(filters=NUM_KERNELS_CNN, kernel_size=KERNEL_LENGTH_CNN,
                  activation='relu', kernel_regularizer=l2(5e-5)))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(rate=0.3))

# Stage 2: doubles the filter count
model2.add(Conv1D(filters=NUM_KERNELS_CNN * 2, kernel_size=KERNEL_LENGTH_CNN,
                  activation='relu', kernel_regularizer=l2(5e-5)))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(rate=0.3))

# Classifier head
model2.add(Flatten())
model2.add(Dense(units=128, activation='relu', kernel_regularizer=l2(5e-5)))
model2.add(Dropout(rate=0.3))
model2.add(BatchNormalization())
model2.add(Dense(units=1, activation='sigmoid'))

model2.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train with the shared callbacks; a VALIDATION_SPLIT fraction of the training
# arrays is held out to compute val_loss.
model2.fit(
    x=X_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Report accuracy on the held-out test set.
loss, accuracy = model2.evaluate(X_test, y_test, verbose=0)
print(f"Model 2 Test Accuracy: {accuracy:.4f}")


--- Model 2: Deeper CNN ---


NameError: name 'KERNEL_LENGTH_CNN_BASE' is not defined

In [None]:
# Model 3 – Simple LSTM
# A single bidirectional LSTM (30 units per direction, kept small to limit
# compute) feeding a 64-unit dense head with a sigmoid output.
print("\n--- Model 3: Simple LSTM ---")

model3 = Sequential()
model3.add(Input(shape=input_shape))
model3.add(Bidirectional(LSTM(units=30)))
model3.add(Dropout(rate=0.3))
model3.add(Dense(units=64, activation='relu', kernel_regularizer=l2(5e-5)))
model3.add(Dropout(rate=0.3))
model3.add(BatchNormalization())
model3.add(Dense(units=1, activation='sigmoid'))

model3.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train with the shared callbacks; a VALIDATION_SPLIT fraction of the training
# arrays is held out to compute val_loss.
model3.fit(
    x=X_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Report accuracy on the held-out test set.
loss, accuracy = model3.evaluate(X_test, y_test, verbose=0)
print(f"Model 3 Test Accuracy: {accuracy:.4f}")
