# Data Loading and Preprocessing

In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


# Banner that marks where this cell's output begins in the notebook.
print("--- Data Loading and Preprocessing ---")

# Function to one-hot encode a single DNA sequence
# (Keeping this as a helper function as it's a data transformation, not a model definition)
def one_hot_encode_sequence(sequence, max_len):
    """
    Convert a DNA sequence into a one-hot encoded numpy array.

    Channel order is A, T, G, C (case-insensitive):
    'A'/'a' -> [1,0,0,0], 'T'/'t' -> [0,1,0,0], 'G'/'g' -> [0,0,1,0],
    'C'/'c' -> [0,0,0,1]. 'N'/'n' and any other character -> [0,0,0,0].

    Sequences shorter than max_len are zero-padded at the end; longer
    sequences are truncated to their first max_len characters.

    Args:
        sequence: Iterable of single characters (normally a str). May be empty.
        max_len: Number of positions (rows) in the returned array.

    Returns:
        np.ndarray of dtype float32 and shape (max_len, 4).
    """
    channel_of = {
        'A': 0, 'a': 0,
        'T': 1, 't': 1,
        'G': 2, 'g': 2,
        'C': 3, 'c': 3,
    }

    # Start from an all-zero array of the final shape: padding rows and
    # unknown/'N' characters are then already correct, and an empty sequence
    # yields a valid (max_len, 4) array instead of failing while stacking.
    encoded_sequence = np.zeros((max_len, 4), dtype=np.float32)

    # zip() against range(max_len) stops at whichever is shorter, which
    # truncates over-long sequences without needing to slice the input.
    for position, char in zip(range(max_len), sequence):
        channel = channel_of.get(char)
        if channel is not None:
            encoded_sequence[position, channel] = 1.0

    return encoded_sequence


# Training set: headerless CSV with sequences in column 0 and labels in column 1.
train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw, y_train = train_df[0].values, train_df[1].values

# The first training sequence defines the fixed length used everywhere below;
# one_hot_encode_sequence pads or truncates any sequence that differs from it.
sequence_length = len(X_train_raw[0])
print("Detected sequence length:", sequence_length)

# Encode every training sequence and stack the results into one array.
X_train = np.array(
    [one_hot_encode_sequence(seq, sequence_length) for seq in X_train_raw]
)
print("Shape of preprocessed training data (N, L, 4):", X_train.shape)

# Test set: same file layout as the training CSV.
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw, y_test = test_df[0].values, test_df[1].values

# Encode the test sequences with the length detected from the training data.
X_test = np.array(
    [one_hot_encode_sequence(seq, sequence_length) for seq in X_test_raw]
)
print("Shape of preprocessed testing data (N, L, 4):", X_test.shape)

# Per-sample input shape handed to the Input layer of every model.
input_shape = (sequence_length, 4)

--- Data Loading and Preprocessing ---
Detected sequence length: 1000
Shape of preprocessed training data (N, L, 4): (28140, 1000, 4)
Shape of preprocessed testing data (N, L, 4): (2000, 1000, 4)


# Define hyperparameters

In [21]:
# Hyperparameters shared by every model cell below.
SEED = 42                # seed for Python, NumPy and TensorFlow RNGs
NUM_KERNELS_CNN = 64     # filters in the first Conv1D layer
KERNEL_LENGTH_CNN = 9    # Conv1D kernel size
LSTM_NEURONS = 30        # units per LSTM layer
DROPOUT_RATE = 0.3       # rate used by every Dropout layer
LEARNING_RATE = 0.001    # Adam learning rate
# NOTE(review): with EPOCHS = 1 the EarlyStopping callback below (patience=10)
# can never trigger; raise EPOCHS for a real training run.
EPOCHS = 1
BATCH_SIZE = 32

# Seed all RNGs once, before any model is built, so that weight initialisation,
# dropout masks and batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

results = {}  # maps model name -> test accuracy, filled in by each model cell
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# A fresh optimizer is created inside each model cell rather than shared here.

## Model 1: Simple CNN

In [22]:
print("\n--- Training Model 1: Simple CNN ---")

# One convolution/pooling stage followed by a small dense classifier head.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 1: Simple CNN Test Accuracy:", accuracy)
results["Model 1: Simple CNN"] = accuracy


--- Training Model 1: Simple CNN ---


Model 1: Simple CNN Test Accuracy: 0.5


## Model 2: Deeper CNN

In [23]:
print("\n--- Training Model 2: Deeper CNN ---")

# Two convolution/pooling stages (the second with twice the filters),
# followed by a dense classifier head.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Conv1D(NUM_KERNELS_CNN * 2, KERNEL_LENGTH_CNN, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 2: Deeper CNN Test Accuracy:", accuracy)
results["Model 2: Deeper CNN"] = accuracy


--- Training Model 2: Deeper CNN ---


KeyboardInterrupt: 

## Model 3: Simple LSTM

In [None]:
print("\n--- Training Model 3: Simple LSTM ---")

# A single LSTM layer whose final output feeds a small dense classifier head.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(LSTM(LSTM_NEURONS))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(64, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 3: Simple LSTM Test Accuracy:", accuracy)
results["Model 3: Simple LSTM"] = accuracy



--- Training Model 3: Simple LSTM ---


Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x1755ab390>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/opt/anaconda3/envs/ml_env/lib/python3.11/site-packages/keras/src/backend/tensorflow/rnn.py", line 418, in <genexpr>
    output_ta_t = tuple(  File "/opt/anaconda3/envs/ml_env/lib/python3.11/site-packages/tensorflow/python/util/tf_should_use.py", line 288, in wrapped


KeyboardInterrupt: 

## Model 4: Deeper LSTM

In [None]:
print("\n--- Training Model 4: Deeper LSTM ---")

# Two stacked LSTM layers; the first returns the full sequence so the second
# LSTM receives one input per time step.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(LSTM(LSTM_NEURONS, return_sequences=True))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(LSTM(LSTM_NEURONS))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 4: Deeper LSTM Test Accuracy:", accuracy)
results["Model 4: Deeper LSTM"] = accuracy


--- Training Model 4: Deeper LSTM ---


KeyboardInterrupt: 

## Model 5: CNN-LSTM Hybrid

In [None]:
print("\n--- Training Model 5: CNN-LSTM Hybrid ---")

# A convolution/pooling stage whose output sequence is fed to an LSTM,
# followed by a dense classifier head.
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Conv1D(NUM_KERNELS_CNN, KERNEL_LENGTH_CNN, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(LSTM(LSTM_NEURONS))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(64, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 5: CNN-LSTM Hybrid Test Accuracy:", accuracy)
results["Model 5: CNN-LSTM Hybrid"] = accuracy

## Model 6: CNN with More Dense Layers


In [None]:
# Banner added for consistency with every other model cell, so this model's
# summary output is labelled in the notebook.
print("\n--- Training Model 6: CNN with More Dense Layers ---")

# One convolution/pooling stage followed by two dense hidden layers (128 -> 64).
model = Sequential([
    Input(shape=input_shape),
    Conv1D(filters=NUM_KERNELS_CNN, kernel_size=KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(DROPOUT_RATE),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(64, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
])

model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(X_train, y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    callbacks=[early_stopping],
                    verbose=0)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Model 6: CNN with More Dense Layers Test Accuracy:", accuracy)
results["Model 6: CNN with More Dense Layers"] = accuracy

## Model 7: LSTM with More Dense Layers


In [None]:
print("\n--- Training Model 7: LSTM with More Dense Layers ---")

# A single LSTM layer followed by two dense hidden layers (128 -> 64).
model = Sequential()
model.add(Input(shape=input_shape))
model.add(LSTM(LSTM_NEURONS))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(64, activation='relu'))
model.add(Dropout(rate=DROPOUT_RATE))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the binary label
model.summary()

# Each model gets its own Adam instance.
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,
)

# Score on the held-out test set and record the accuracy for the summary cell.
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Model 7: LSTM with More Dense Layers Test Accuracy:", accuracy)
results["Model 7: LSTM with More Dense Layers"] = accuracy

## Summary of results


In [None]:
print("\n--- Summary of Model Accuracies on Test Set ---")

if results:
    # Distinct loop names so the `accuracy` left behind by the last model cell
    # is not overwritten by this report.
    for model_name, model_accuracy in results.items():
        print(model_name, "Accuracy:", model_accuracy)

    # Pick the entry with the highest recorded test accuracy.
    best_model_name = max(results, key=results.get)
    best_accuracy = results[best_model_name]
    print("\nBest Model:", best_model_name, "with Accuracy:", best_accuracy)
else:
    # max() raises ValueError on an empty dict, which happens when every
    # training cell was skipped or interrupted; report that instead of crashing.
    best_model_name, best_accuracy = None, None
    print("No models finished training; nothing to summarize.")