In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input, Reshape, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# --- Data Loading and Preprocessing ---

print("--- Data Loading and Preprocessing ---")

# Function to one-hot encode a single DNA sequence
def one_hot_encode_sequence(sequence, max_len):
    """
    Convert a DNA sequence string into a one-hot encoded numpy array.

    Channel order is A, T, G, C (case-insensitive for those four letters):
    'A'/'a' -> [1,0,0,0], 'T'/'t' -> [0,1,0,0], 'G'/'g' -> [0,0,1,0],
    'C'/'c' -> [0,0,0,1]. 'N'/'n' and any other character encode as
    [0,0,0,0].

    Args:
        sequence: Iterable of single characters (normally a str).
        max_len: Number of rows in the result. Shorter sequences are
            zero-padded at the end; longer ones are truncated.

    Returns:
        A float32 array of shape (max_len, 4). An empty sequence yields an
        all-zero array rather than failing.
    """
    channel_of = {
        'A': 0, 'a': 0,
        'T': 1, 't': 1,
        'G': 2, 'g': 2,
        'C': 3, 'c': 3,
    }

    # Start from the fully padded matrix so that padding, truncation and the
    # empty-sequence case all fall out of the same code path.
    encoded_sequence = np.zeros((max_len, 4), dtype=np.float32)

    # zip() stops at whichever is shorter, which truncates long sequences and
    # leaves the trailing rows as zero padding for short ones.
    for position, char in zip(range(max_len), sequence):
        channel = channel_of.get(char)
        if channel is not None:  # 'N' and unknown symbols stay all-zero
            encoded_sequence[position, channel] = 1.0

    return encoded_sequence


# Training split: column 0 is fed to the encoder, column 1 is used as the target
train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw, y_train = (train_df.iloc[:, col].values for col in (0, 1))

# The first training sequence sets the length every sequence is padded/truncated to
sequence_length = len(X_train_raw[0])
print(f"Detected sequence length: {sequence_length}")

# Encode each training sequence and stack the results into one array
X_train = np.array(
    [one_hot_encode_sequence(seq, max_len=sequence_length) for seq in X_train_raw]
)
print(f"Shape of preprocessed training data (N, L, 4): {X_train.shape}")

# Test split: same two-column layout as the training file
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw, y_test = (test_df.iloc[:, col].values for col in (0, 1))

# Encode the test sequences to the length taken from the training data
X_test = np.array(
    [one_hot_encode_sequence(seq, max_len=sequence_length) for seq in X_test_raw]
)
print(f"Shape of preprocessed testing data (N, L, 4): {X_test.shape}")

# Per-sample input shape handed to every model builder
input_shape = (sequence_length, 4)

# --- Model Architectures ---

# Hyperparameters shared by the model builders and the training loop

# Layer sizing
NUM_KERNELS_CNN = 64      # filters in the first Conv1D layer
KERNEL_LENGTH_CNN = 9     # Conv1D kernel_size
LSTM_NEURONS = 30         # units per LSTM layer

# Regularisation
DROPOUT_RATE = 0.3

# Optimisation schedule
LEARNING_RATE = 1e-3      # Adam learning rate
BATCH_SIZE = 32
EPOCHS = 50               # upper bound; early stopping may end training sooner

# Model 1: Simple CNN Model
def build_simple_cnn(input_shape):
    """Return an uncompiled one-convolution CNN with a sigmoid output unit.

    Layout: Conv1D -> MaxPooling1D -> Dropout -> Flatten -> Dense(64)
    -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Convolutional feature extractor
    stack.append(Conv1D(filters=NUM_KERNELS_CNN,
                        kernel_size=KERNEL_LENGTH_CNN,
                        activation='relu'))
    stack.append(MaxPooling1D(pool_size=2))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(Flatten())

    # Fully connected classification head
    stack.append(Dense(64, activation='relu'))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

# Model 2: Deeper CNN Model (Two Conv1D blocks)
def build_deeper_cnn(input_shape):
    """Return an uncompiled CNN with two Conv1D/pool/dropout stages.

    The second convolution uses twice as many filters as the first. The
    flattened features feed Dense(128) -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Two convolutional stages; only the filter count differs between them
    for filter_count in (NUM_KERNELS_CNN, NUM_KERNELS_CNN * 2):
        stack.append(Conv1D(filters=filter_count,
                            kernel_size=KERNEL_LENGTH_CNN,
                            activation='relu'))
        stack.append(MaxPooling1D(pool_size=2))
        stack.append(Dropout(DROPOUT_RATE))

    # Classification head
    stack.append(Flatten())
    stack.append(Dense(128, activation='relu'))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

# Model 3: Simple LSTM Model
def build_simple_lstm(input_shape):
    """Return an uncompiled single-LSTM classifier.

    Layout: LSTM -> Dropout -> Dense(64) -> Dropout -> Dense(1, sigmoid).
    """
    recurrent_part = [
        LSTM(LSTM_NEURONS),
        Dropout(DROPOUT_RATE),
    ]
    head = [
        Dense(64, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential([Input(shape=input_shape)] + recurrent_part + head)
    return net

# Model 4: Deeper LSTM Model (Two Stacked LSTM layers)
def build_deeper_lstm(input_shape):
    """Return an uncompiled classifier with two stacked LSTM layers.

    The first LSTM returns its full output sequence so the second LSTM can
    consume it. The head is Dense(128) -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Stacked recurrent layers, each followed by dropout
    stack.append(LSTM(LSTM_NEURONS, return_sequences=True))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(LSTM(LSTM_NEURONS))
    stack.append(Dropout(DROPOUT_RATE))

    # Classification head
    stack.append(Dense(128, activation='relu'))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

# Model 5: CNN-LSTM Hybrid Model
def build_cnn_lstm_hybrid(input_shape):
    """Return an uncompiled model that feeds a Conv1D stage into an LSTM.

    Layout: Conv1D -> MaxPooling1D -> Dropout -> LSTM -> Dropout
    -> Dense(64) -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Convolutional front end (no Flatten: the LSTM consumes the sequence)
    stack.extend([
        Conv1D(filters=NUM_KERNELS_CNN,
               kernel_size=KERNEL_LENGTH_CNN,
               activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(DROPOUT_RATE),
    ])

    # Recurrent summariser
    stack.extend([
        LSTM(LSTM_NEURONS),
        Dropout(DROPOUT_RATE),
    ])

    # Classification head
    stack.extend([
        Dense(64, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(1, activation='sigmoid'),
    ])

    return Sequential(stack)

# Model 6: CNN with More Dense Layers
def build_cnn_more_dense(input_shape):
    """Return an uncompiled one-convolution CNN with two hidden Dense layers.

    Layout: Conv1D -> MaxPooling1D -> Dropout -> Flatten -> Dense(128)
    -> Dropout -> Dense(64) -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Convolutional feature extractor
    stack.append(Conv1D(filters=NUM_KERNELS_CNN,
                        kernel_size=KERNEL_LENGTH_CNN,
                        activation='relu'))
    stack.append(MaxPooling1D(pool_size=2))
    stack.append(Dropout(DROPOUT_RATE))
    stack.append(Flatten())

    # Two hidden Dense layers, each followed by dropout
    for width in (128, 64):
        stack.append(Dense(width, activation='relu'))
        stack.append(Dropout(DROPOUT_RATE))

    stack.append(Dense(1, activation='sigmoid'))
    return Sequential(stack)

# Model 7: LSTM with More Dense Layers
def build_lstm_more_dense(input_shape):
    """Return an uncompiled single-LSTM classifier with two hidden Dense layers.

    Layout: LSTM -> Dropout -> Dense(128) -> Dropout -> Dense(64)
    -> Dropout -> Dense(1, sigmoid).
    """
    stack = [Input(shape=input_shape)]

    # Recurrent encoder
    stack.append(LSTM(LSTM_NEURONS))
    stack.append(Dropout(DROPOUT_RATE))

    # Two hidden Dense layers, each followed by dropout
    for width in (128, 64):
        stack.append(Dense(width, activation='relu'))
        stack.append(Dropout(DROPOUT_RATE))

    stack.append(Dense(1, activation='sigmoid'))
    return Sequential(stack)

# --- Training and Evaluation Loop ---

print("\n--- Training and Evaluating Models ---")

# (display name, builder) pairs; each builder takes input_shape and returns
# an uncompiled Keras model.
models_to_train = [
    ("Simple CNN", build_simple_cnn),
    ("Deeper CNN", build_deeper_cnn),
    ("Simple LSTM", build_simple_lstm),
    ("Deeper LSTM", build_deeper_lstm),
    ("CNN-LSTM Hybrid", build_cnn_lstm_hybrid),
    ("CNN with More Dense Layers", build_cnn_more_dense),
    ("LSTM with More Dense Layers", build_lstm_more_dense),
]

# Maps model display name -> accuracy on the test set
results = {}

# Define Early Stopping callback (re-used for all models)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

for name, builder_func in models_to_train:
    print(f"\n--- Training {name} ---")

    # Release the Keras global state left behind by the previous model so
    # memory does not keep growing while several models are built in a loop.
    tf.keras.backend.clear_session()

    # Build the model
    model = builder_func(input_shape)
    model.summary() # Print model summary for each model

    # Compile the model with a fresh optimizer so no optimizer state is
    # shared between architectures
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        validation_split=0.2, # 20% of training data for validation
                        callbacks=[early_stopping],
                        verbose=0) # Set to 1 for progress bar

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"{name} Test Accuracy: {accuracy:.4f}")
    results[name] = accuracy

print("\n--- Summary of Model Accuracies on Test Set ---")
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.4f}")

# Find the best accuracy
# NOTE(review): the winner is chosen by test-set accuracy, so the reported
# best score is an optimistic estimate; consider selecting on validation data.
best_model_name = max(results, key=results.get)
best_accuracy = results[best_model_name]
print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")



--- Data Loading and Preprocessing ---
Detected sequence length: 1000
Shape of preprocessed training data (N, L, 4): (28140, 1000, 4)
Shape of preprocessed testing data (N, L, 4): (2000, 1000, 4)

--- Training and Evaluating Models ---

--- Training Simple CNN ---


KeyboardInterrupt: 