In [1]:
import numpy as np
import tensorflow as tf
import keras
from keras import models
import keras.layers as kl
from keras.layers.convolutional import Conv1D
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers import Convolution2D, MaxPooling1D
from keras.metrics import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping
from keras.callbacks import History
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc 

2023-04-11 23:21:03.145087: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
def one_hot_encoding(dna_sequence):
    """
    Converts a DNA sequence to one-hot-encoding
    :param dna_sequence: nucleotide sequence
    :type dna_sequence: str
    :return: lst of encoding for each nucleotide in the original DNA string
    :rtype: lst of lst of ints
    :author: Sydney Ballard
    """
    # Define a dictionary for 1-hot encoding
    nucleotide_dict = {'A': [1, 0, 0, 0], 
                       'C': [0, 1, 0, 0], 
                       'G': [0, 0, 1, 0], 
                       'T': [0, 0, 0, 1]}
    
    # Convert the DNA sequence to 1-hot encoding
    one_hot_encoding = [nucleotide_dict[base] for base in dna_sequence]  

    # Return list of one hot encodings
    return one_hot_encoding

def parse_sequence_activity_mpra(file_name):
    # read the contents of the text file into a list of strings
    with open(file_name, 'r') as f:
        lines = f.readlines()

    # extract each column as a list
    Dev_log2_enrichment = []
    Hk_log2_enrichment = []
    Dev_log2_enrichment_scaled = []
    Hk_log2_enrichment_scaled = []
    Dev_log2_enrichment_quantile_normalized = []
    Hk_log2_enrichment_quantile_normalized = []

    for line in lines[1:]:
        # split the line on whitespace and convert each value to a float
        values = [float(val) for val in line.split()]

        # append the values to the appropriate list
        Dev_log2_enrichment.append(values[0])
        Hk_log2_enrichment.append(values[1])
        Dev_log2_enrichment_scaled.append(values[2])
        Hk_log2_enrichment_scaled.append(values[3])
        Dev_log2_enrichment_quantile_normalized.append(values[4])
        Hk_log2_enrichment_quantile_normalized.append(values[5])

    print(len(Dev_log2_enrichment))
    print(len(Hk_log2_enrichment))
    print(len(Dev_log2_enrichment_scaled))
    print(len(Hk_log2_enrichment_scaled))
    print(len(Dev_log2_enrichment_quantile_normalized))
    print(len(Hk_log2_enrichment_quantile_normalized))


    return Dev_log2_enrichment, Hk_log2_enrichment


def parse_all(file_name, dev_values, hk_values):
    """
    Converts a FASTA file to a dictionary.
    :param file_name: The name of the input FASTA file.
    :type file_name: str
    :return: A dictionary of each key with a list of two elements: 
        the key is the original nucleotide sequence
        the first element of the list contains the one-hot-encoding of the sequence (reference one_hot_encoding function above)
        and the second element of the list contains the corresponding class names/enhancer identifier (0 or 1)
    :rtype: dict
    :author: Sydney Ballard
    :acknowledgements: Adapted from parse_fasta_file, which Kush Gulati wrote for previous assignments
    """
    # Initialize dictionary 
    # {seq: [one-hot-encoded sequence, Dev, Hk]}
    sequence_data = {} # dict will hold 0/1 for class and sequence

    sequences = []
    one_hot_encodings = []

    with open(file_name) as file:
        seq = ""

        new_line_count = 0
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                new_line_count += 1
                if seq != "":
                    sequences.append(seq)
                    one_hot_encodings.append(one_hot_encoding(seq.strip("N")))
                seq = ""
            else:
                seq += line
        sequences.append(seq)
        one_hot_encodings.append(one_hot_encoding(seq.strip("N")))


    # for idx in range(len(sequences)):
    #     sequence_data[sequences[idx]] = [one_hot_encodings[idx],
    #                                      dev_values[idx],
    #                                      hk_values[idx]]

    # return sequence_data
    return [sequences, one_hot_encodings, dev_values, hk_values]

In [8]:
dev, hk = parse_sequence_activity_mpra("/Users/sydneyballard/Desktop/Desktop - Sydney’s MacBook Pro/CS 561/cs561 repository COLLABORATIVE W KUSH/CS561/Assignment3/MPRA/Sequences_activity_Train.txt")
metadata = parse_all("/Users/sydneyballard/Desktop/Desktop - Sydney’s MacBook Pro/CS 561/cs561 repository COLLABORATIVE W KUSH/CS561/Assignment3/MPRA/Sequences_Train.fa", dev, hk)

285552
285552
285552
285552
285552
285552


In [9]:
# extract the input sequences and the output values
sequences = metadata[1]
dev_activation = metadata[2]
hk_activation = metadata[3]

# define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(4,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

# compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# fit the model to the data
model.fit(sequences, {'dev_output': dev_activation, 'hk_output': hk_activation}, epochs=100, verbose=0)

# evaluate the model on the Dev data
dev_loss = model.evaluate(sequences, dev_activation, verbose=0)
print('Dev MSE:', dev_loss)

# evaluate the model on the Hk data
hk_loss = model.evaluate(sequences, hk_activation, verbose=0)
print('Hk MSE:', hk_loss)

2023-04-11 23:29:37.662533: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {\'(<class \\\'list\\\'> containing values of types {"<class \\\'int\\\'>"})\'})'}), (<class 'dict'> containing {"<class 'str'>"} keys and {'(<class \'list\'> containing values of types {"<class \'float\'>"})'} values)