# Transcription factor classifier

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from itertools import product
from nn import nn, io, preprocess

## Data

In [2]:
rap1 = io.read_text_file("./data/rap1-lieb-positives.txt")
yeast = io.read_fasta_file("./data/yeast-upstream-1k-negative.fa")

print("Length of Rap1 positives: " + str(len(rap1)))
print("Length of Yeast negatives: " + str(len(yeast)))

Length of Rap1 positives: 137
Length of Yeast negatives: 3163


In [3]:
pos_seq = rap1

# Break up yeast_neg into sizes match the length of rap1 sequences
seq_len = len(rap1[0])
neg_seq = []

for seq in yeast:
    seq_sub = [seq[i:i+seq_len] for i in range(0, len(seq), seq_len)]
    # Keep only sequences that are exactly rap length long
    seq_sub = [x for x in seq_sub if len(x) == seq_len]
    neg_seq += seq_sub

# Combine all sequences and get labes
seqs = pos_seq + neg_seq
labels = [True] * len(pos_seq) + [False] * len(neg_seq)

print("Length of positives: " + str(len(pos_seq)))
print("Length of negatives: " + str(len(neg_seq)))
print("Total sequences: " + str(len(pos_seq) + len(neg_seq)))

Length of positives: 137
Length of negatives: 183297
Total sequences: 183434


In [4]:
# Up sample the positive class
seqs2, labels2 = preprocess.sample_seqs(seqs, labels)
print("Length of positives: " + str(sum(labels2)))
print("Length of negatives: " + str(len(seqs2) - sum(labels2)))
print("Total sequences: " + str(len(seqs2)))

Length of positives: 183297
Length of negatives: 183297
Total sequences: 366594


In [5]:
# Encode sequences and create a training and testing split
X = preprocess.one_hot_encode_seqs(seqs2)
y = np.array(labels2)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 1)
print("Training data " + str(X_train.shape))
print("Testing data " + str(X_val.shape))

Training data (256615, 68)
Testing data (109979, 68)


In [7]:
# Create network
layers = [{"input_dim": 68, "output_dim": 40, "activation": "sigmoid"},
          {"input_dim": 40, "output_dim": 20, "activation": "sigmoid"},
          {"input_dim": 20, "output_dim": 1, "activation": "sigmoid"}]
net = nn.NeuralNetwork(layers, lr = 0.0001, seed = 1, batch_size = 20, epochs = 1, loss_function = "bce")

train_loss, val_loss = net.fit(X_train, y_train, X_val, y_val)

Forward:
Layer index: 1
Shape _W :(40, 68)
Shape _b :(40, 1)
Shape _A_prev: (20, 68)
Shape Z_curr :(20, 40)
Shape A_curr: (20, 40)
Layer index: 2
Shape _W :(20, 40)
Shape _b :(20, 1)
Shape _A_prev: (20, 40)
Shape Z_curr :(20, 20)
Shape A_curr: (20, 20)
Layer index: 3
Shape _W :(1, 20)
Shape _b :(1, 1)
Shape _A_prev: (20, 20)
Shape Z_curr :(20, 1)
Shape A_curr: (20, 1)
Backprop
Shape _y_batch (y): (20,)
Shape output (y_hat): (20, 1)
_BCE_BP dA shape: (20, 20)
Layer index: 3
Shape _W_curr :(1, 20)
Shape _b_curr :(1, 1)
Shape _Z_curr: (20, 1)
Shape _A_prev:(20, 20)
Shape _dA: (20, 20)
Shape bp: (20, 20)


ValueError: shapes (20,20) and (1,20) not aligned: 20 (dim 1) != 1 (dim 0)

In [10]:
for idx in range(0,4):
    try:
        print("A" + str(idx) + str(net.cache["A"+str(idx)].shape))
        print("Z" + str(idx) +str(net.cache["Z"+str(idx)].shape))
    except: pass

A0(20, 68)
A1(20, 40)
Z1(20, 40)
A2(20, 20)
Z2(20, 20)
A3(20, 1)
Z3(20, 1)


In [11]:
for idx in range(0,4):
    try:
        print("W" + str(idx) + str(net._param_dict["W"+str(idx)].shape))
        print("b" + str(idx) +str(net._param_dict["b"+str(idx)].shape))
    except: pass

W1(40, 68)
b1(40, 1)
W2(20, 40)
b2(20, 1)
W3(1, 20)
b3(1, 1)
