You are tasked with developing a Neural Network (NN) model to predict the gcPBM TF-DNA
binding affinity for three TFs: Max, Mad, and Myc, using a given DNA sequence, similar to your
approach in Assignment 1. In this task, we will exclusively utilize 1-mer features. Implement
10-fold cross-validation to determine the average r-squared value. [2pt]

Hint: Your NN should include a minimum of two hidden layers equipped with an adequate
number of nodes. For the final layer, integrate a Dense(1) unit followed by a sigmoid activation
function. Opt for mean squared error (mse) as your loss function, utilize the Adam optimizer, and
apply the ‘R2Score’ metric to ascertain the r-squared value.

In [10]:
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Dense, Activation
from keras.optimizers.legacy import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold


In [17]:
def load_data(path, header=None):
    if header == None:
        df = pd.read_table(path, delimiter='\t', header = header, names = ['sequence', 'affinity'])
    else:
        df = pd.read_table(path, delimiter='\t').rename(columns={"SymmetrizedAffinity": "affinity", "Kmer":"sequence"})
        df = df.drop(columns=['idx'])
    df['affinity'] = df['affinity'].astype(float)
    return df
    
max_df = load_data('Max.txt')
mad_df = load_data('Mad.txt')
myc_df = load_data('Myc.txt')

In [27]:
def one_hot_encode_1mer(seq):
    temp = list(seq.replace('A', '1000').replace('C', '0100').replace('G', '0010').replace('T','0001'))
    return [int(x) for x in temp]

def one_hot_encode_2mer(seq):
    str = ''
    for i in range(len(seq)-1):
        str += seq[i:i+2].replace('AA', '1000000000000000').replace('AC', '0100000000000000').replace('AG', '0010000000000000').replace('AT', '0001000000000000').replace('CA', '0000100000000000').replace('CC', '0000010000000000').replace('CG', '0000001000000000').replace('CT', '0000000100000000').replace('GA', '0000000010000000').replace('GC', '0000000001000000').replace('GG', '0000000000100000').replace('GT', '0000000000010000').replace('TA', '0000000000001000').replace('TC', '0000000000000100').replace('TG', '0000000000000010').replace('TT', '0000000000000001')
    return [int(x) for x in list(str)]

In [28]:
def preprocess_data(df, encoding='1mer', normalize=True):
    if normalize == True:
        min_val = min(df['affinity'])
        max_val = max(df['affinity'])
        df['affinity'] = df['affinity'].apply(lambda x: (x - min_val)/(max_val - min_val))
    sequences = df['sequence'].to_list()
    affinities = df['affinity'].to_list()
    if encoding == '1mer':
        X = np.array([one_hot_encode_1mer(x) for x in sequences])
    else: 
        X = np.array([one_hot_encode_2mer(x) for x in sequences])
    y = np.array(affinities)
    cv = KFold(n_splits=10, shuffle=True, random_state=1)
    return X, y, cv

max_X, max_y, max_cv = preprocess_data(max_df, '1mer')
mad_X, mad_y, mad_cv = preprocess_data(mad_df, '1mer')
myc_X, myc_y, myc_cv = preprocess_data(myc_df, '1mer')

In [29]:
def train_nn_model(X, y, cv, input_dimension, learning_rate):
    r_squared_values = []
    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = None
        model = Sequential()
        model.add(Input(shape =(input_dimension)))
        model.add(Dense(500))
        model.add(Activation('sigmoid'))
        model.add(Dense(500))
        model.add(Activation('sigmoid'))
        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.compile(loss='MeanSquaredError',
             optimizer=Adam(learning_rate=0.001),
             metrics=['R2Score'])
        model.fit(X_train, y_train, batch_size=100, epochs=20, verbose=None)
        score = model.evaluate(X_test, y_test)
#         print('Total loss on testing data:', score[0])
#         print('Accuracy of testing data:', score[1])
        r_squared_values.append(score[1])
    return np.mean(r_squared_values)

with tf.device('/CPU:0'):
    print(f"Average r-squared for Max: {train_nn_model(max_X, max_y, max_cv, 144, 0.001)}")
    print(f"Average r-squared for Mad: {train_nn_model(mad_X, mad_y, mad_cv, 144, 0.001)}")
    print(f"Average r-squared for Myc: {train_nn_model(myc_X, myc_y, myc_cv, 144, 0.001)}")

Average r-squared for Max: 0.8343277990818023
Average r-squared for Mad: 0.8950340747833252
Average r-squared for Myc: 0.8558035135269165


Compare the performance of the neural network using 1-mer features against the linear
regression models that utilize both 1-mer and 2-mer encodings for Max, Mad, and Myc. Discuss
your observations on their performance. Specifically, analyze why the neural network model
with 1-mer data yields satisfactory outcomes, offering an explanation for this observation. [1pt]

Create a function capable of encoding 1-mer and 2-mer sequences, including sequences with
5-methylcytosine, denoted as ‘M’. [2pt]

Hint: For 1-mer encoding, use the following representations: A as 10000, C as 01000, G as 00100,
T as 00010, and M as 00001. For 2-mer encoding, start with AA represented as
1000000000000000000000000, proceed through combinations like AC as
0100000000000000000000000, and conclude with MM as 0000000000000000000000001.

In [46]:
def one_hot_encode_1mer(seq):
    temp = list(seq.replace('A', '10000').replace('C', '01000').replace('G', '00100').replace('T','00010').replace('M', '00001'))
    return [int(x) for x in temp]

def one_hot_encode_2mer(seq):
    encoding = ''
    encoding_map_2mer = {
    'AA': '1000000000000000000000000', 'AC': '0100000000000000000000000', 'AG': '0010000000000000000000000', 
    'AT': '0001000000000000000000000', 'AM': '0000100000000000000000000', 'CA': '0000010000000000000000000', 
    'CC': '0000001000000000000000000', 'CG': '0000000100000000000000000', 'CT': '0000000010000000000000000',
    'CM': '0000000001000000000000000', 'GA': '0000000000100000000000000', 'GC': '0000000000010000000000000', 
    'GG': '0000000000001000000000000', 'GT': '0000000000000100000000000', 'GM': '0000000000000010000000000',
    'TA': '0000000000000001000000000', 'TC': '0000000000000000100000000', 'TG': '0000000000000000010000000', 
    'TT': '0000000000000000001000000', 'TM': '0000000000000000000100000', 'MA': '0000000000000000000010000',
    'MC': '0000000000000000000001000', 'MG': '0000000000000000000000100', 'MT': '0000000000000000000000010',
    'MM': '0000000000000000000000001'
    }
    for i in range(len(seq) - 1):
        dinucleotide = seq[i:i + 2]
        encoding += encoding_map_2mer[dinucleotide]
    return [int(x) for x in list(encoding)]

Load and encode the EpiSelex-seq data for the TFs Atf4 (Atf4.txt) and Cebpb (Cebpb.txt). Apply
the neural network model you developed in Question 1 and linear regression models using
1-mer and 2-mer features to predict their binding affinity to both methylated and unmethylated
DNA sequences. Note that the binding data is unaligned. [2pt]

In [47]:
atf_df = load_data('Atf4.txt', header=1)
atf_X, atf_y, atf_cv = preprocess_data(atf_df, '1mer', normalize=False)

with tf.device('/CPU:0'):
    print(f"Average Neural Net r-squared for Atf4 1-mer: {train_nn_model(atf_X, atf_y, atf_cv, 225, 0.01)}")

ValueError: in user code:

    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/hiradh/miniconda3/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_95" is incompatible with the layer: expected shape=(None, 225), found shape=(None, 50)


In [43]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [48]:
def train_linreg_model(X, y, cv):
    # results
    mse_scores = []; r_squared_scores = []
    
    # Initialize Linear Regression model
    #model = LinearRegression()
    #model = Lasso(alpha=0.001)
    #model = Ridge(alpha=0.001)
    model = ElasticNet(alpha=0.001, l1_ratio=0.5)
    
    # Loop over each fold
    for train_index, valid_index in cv.split(X):
        # Split data
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predict on valid set
        predictions = model.predict(X_valid)
        
        # Evaluate the model
        mse = mean_squared_error(y_valid, predictions)
        mse_scores.append(mse)
        
        r_squared = model.score(X_valid, y_valid)
        r_squared_scores.append(r_squared)
        
    average_mse = np.mean(mse_scores)
    average_r_squared = np.mean(r_squared_scores)
    return average_r_squared

print("Average Linear Regression r-squared for Atf4 1-mer: " + str(train_linreg_model(atf_X, atf_y, atf_cv)))

Average Linear Regression r-squared for Atf4 1-mer: -0.00013149165587644784
