# Data Loading and Preprocessing

In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Explicitly enable eager execution
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

# Function to one-hot encode a single DNA sequence
# (Keeping this as a helper function as it's a data transformation, not a model definition)
def one_hot_encode_sequence(sequence, max_len):
    """
    Converts a DNA sequence string into a one-hot encoded numpy array.
    'A'/'a' -> [1,0,0,0], 'T'/'t' -> [0,1,0,0], 'G'/'g' -> [0,0,1,0],
    'C'/'c' -> [0,0,0,1], 'N'/'n' -> [0,0,0,0]
    Pads or truncates sequences to max_len.
    """
    mapping = {
        'A': [1, 0, 0, 0], 'a': [1, 0, 0, 0],
        'T': [0, 1, 0, 0], 't': [0, 1, 0, 0],
        'G': [0, 0, 1, 0], 'g': [0, 0, 1, 0],
        'C': [0, 0, 0, 1], 'c': [0, 0, 0, 1],
        'N': [0, 0, 0, 0], 'n': [0, 0, 0, 0]
    }
    encoded_sequence = []
    for char in sequence:
        encoded_sequence.append(mapping.get(char, [0, 0, 0, 0]))

    encoded_sequence = np.array(encoded_sequence, dtype=np.float32)

    if encoded_sequence.shape[0] < max_len:
        padding = np.zeros((max_len - encoded_sequence.shape[0], 4), dtype=np.float32)
        encoded_sequence = np.vstack((encoded_sequence, padding))
    elif encoded_sequence.shape[0] > max_len:
        encoded_sequence = encoded_sequence[:max_len, :]
    return encoded_sequence


# Load training data
train_df = pd.read_csv('dm3.kc167.tads.train.csv', header=None)
X_train_raw = train_df.iloc[:, 0].values
y_train = train_df.iloc[:, 1].values

# Determine max sequence length (assuming all sequences have the same length)
sequence_length = len(X_train_raw[0])
print(f"Detected sequence length: {sequence_length}")

# One-hot encode training sequences
X_train = np.array([one_hot_encode_sequence(s, max_len=sequence_length) for s in X_train_raw])
print(f"Shape of preprocessed training data (N, L, 4): {X_train.shape}")

# Load testing data
test_df = pd.read_csv('dm3.kc167.tads.test.csv', header=None)
X_test_raw = test_df.iloc[:, 0].values
y_test = test_df.iloc[:, 1].values

# One-hot encode testing sequences
X_test = np.array([one_hot_encode_sequence(s, max_len=sequence_length) for s in X_test_raw])
print(f"Shape of preprocessed testing data (N, L, 4): {X_test.shape}")

# Define input shape for models
input_shape = (sequence_length, 4)

Detected sequence length: 1000
Shape of preprocessed training data (N, L, 4): (28140, 1000, 4)
Shape of preprocessed testing data (N, L, 4): (2000, 1000, 4)


# Define hyperparameters

In [38]:
NUM_KERNELS_CNN = 64
KERNEL_LENGTH_CNN = 9
LSTM_NEURONS = 30
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.001
EPOCHS = 5
BATCH_SIZE = 32

results = {} # Dictionary to store results
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
optimizer = Adam(learning_rate=LEARNING_RATE)

# Model 1 Simple CNN

In [39]:
model = Sequential([
    Input(shape=input_shape),
    Conv1D(filters=NUM_KERNELS_CNN, kernel_size=KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(DROPOUT_RATE),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(1, activation='sigmoid')
])

model.summary()
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    callbacks=[early_stopping],
                    verbose=1)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Simple CNN Test Accuracy: {accuracy:.4f}")
results = {}
results["Simple CNN"] = accuracy

Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 34ms/step - accuracy: 0.6093 - loss: 0.7217 - val_accuracy: 0.0000e+00 - val_loss: 0.8957
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 34ms/step - accuracy: 0.6285 - loss: 0.6615 - val_accuracy: 0.0000e+00 - val_loss: 0.9601
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 33ms/step - accuracy: 0.6275 - loss: 0.6605 - val_accuracy: 0.0000e+00 - val_loss: 0.9742
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 33ms/step - accuracy: 0.6229 - loss: 0.6627 - val_accuracy: 0.0000e+00 - val_loss: 0.9801
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 34ms/step - accuracy: 0.6259 - loss: 0.6611 - val_accuracy: 0.0000e+00 - val_loss: 0.9772
Simple CNN Test Accuracy: 0.5000


## Model 2 Deeper CNN

In [40]:

model2 = Sequential([
    Input(shape=input_shape),
    Conv1D(filters=NUM_KERNELS_CNN, kernel_size=KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(DROPOUT_RATE),
    Conv1D(filters=NUM_KERNELS_CNN * 2, kernel_size=KERNEL_LENGTH_CNN, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(DROPOUT_RATE),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(1, activation='sigmoid')
])

model2.summary()
model2.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
history = model2.fit(X_train, y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    callbacks=[early_stopping],
                    verbose=0)
loss, accuracy = model2.evaluate(X_test, y_test, verbose=0)
print(f"Model 2 Deeper CNN Test Accuracy: {accuracy:.4f}")
results['Model 2 Deeper CNN'] = accuracy

ValueError: Unknown variable: <Variable path=sequential_10/conv1d_12/kernel, shape=(9, 4, 64), dtype=float32, value=[[[ 0.02930487  0.06765347  0.04364122 ...  0.05204264  0.0313888
    0.04771448]
  [-0.04004342 -0.05404161 -0.08734954 ... -0.03150646  0.02175698
   -0.0144399 ]
  [ 0.03961679  0.03129893 -0.07644109 ...  0.06704194 -0.05671378
   -0.04087752]
  [-0.0284283   0.01148797  0.06357811 ...  0.05444833 -0.00906373
    0.00918768]]

 [[-0.02554039 -0.04288711 -0.09333237 ...  0.05260085 -0.09563169
   -0.0482902 ]
  [ 0.06151987  0.01923488  0.03080621 ... -0.0241861   0.05105991
   -0.01861779]
  [-0.01072972 -0.03029837  0.06827071 ...  0.03118628 -0.06415212
    0.07097419]
  [-0.0088561   0.05302993  0.02542429 ... -0.0983734  -0.04191101
   -0.04379166]]

 [[ 0.02426148  0.01556555 -0.04473355 ... -0.0627194  -0.08867868
    0.08217988]
  [ 0.04691464  0.05121904  0.08814817 ...  0.06890411  0.02657964
    0.07030732]
  [-0.04337016  0.09235466  0.07281644 ...  0.06360243  0.04944836
   -0.094935  ]
  [-0.03496634 -0.09078733  0.02830517 ...  0.09813441 -0.07053088
    0.05041773]]

 ...

 [[ 0.06163812  0.09898566 -0.07453946 ... -0.02934113  0.00234356
    0.04949545]
  [ 0.0106191  -0.04534217  0.0327365  ...  0.0301319  -0.07059259
    0.05939836]
  [ 0.06817023 -0.09359405  0.0558229  ... -0.08891159  0.00145791
   -0.06053848]
  [-0.06317663  0.04324008 -0.09155998 ... -0.05467621 -0.00158402
   -0.09721676]]

 [[-0.01179994  0.08417415  0.04606888 ... -0.0532915   0.02695974
   -0.02569488]
  [ 0.09516897 -0.03078387  0.06011011 ... -0.0284008  -0.09093057
   -0.02031378]
  [-0.01363134  0.05886819 -0.00947998 ...  0.04236946  0.03182619
    0.03979538]
  [-0.04455239  0.01303959  0.08806207 ... -0.05585295  0.04365937
   -0.08489675]]

 [[ 0.08975445  0.06280338 -0.03389359 ...  0.02012932 -0.07175521
   -0.08695627]
  [ 0.08052912 -0.01423264 -0.02741504 ...  0.00631101 -0.02642927
   -0.0060787 ]
  [-0.09104474  0.08943825  0.01317332 ... -0.02597334  0.00545996
   -0.05600432]
  [ 0.09498631  0.0870079   0.08172018 ... -0.091635   -0.05600363
   -0.02097478]]]>. This optimizer can only be called for the variables it was originally built with. When working with a new set of variables, you should recreate a new optimizer instance.