In [12]:
# Machine learning pipeline for Doench Data, supplementary table 7
import pandas as pd

supp_7_path = "/Users/daveistanto/Dropbox/UIUCGraduateSchool/Researches/CROPSR_head_dir/data_files/supp_table_7.csv"
supp_7_df = pd.read_csv(supp_7_path)

In [13]:
# Extract features

import Feature_Extraction as fe

feat_vec = supp_7_df["Expanded Sequence"].apply(fe.ext_sgRNA_feat)
xy_df = supp_7_df.drop(['Sequence', 'Expanded Sequence', 'Position', "Type", 'Gene', "Transcript", "Strand"], axis = 1)
xy_df["X"] = feat_vec

In [14]:
# Split dataset to 80% Training 20% Test
from sklearn.model_selection import train_test_split 
import numpy as np

vec_train, vec_test, label_train, label_test = train_test_split(xy_df["X"], xy_df["Gene % Rank"], test_size=0.2, random_state=0)

# Convert nested np.array to 2d np.array
vec_train = np.array(list(vec_train))
vec_test = np.array(list(vec_test))

In [15]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

linear_reg = LinearRegression().fit(vec_train, label_train)
pred_train = linear_reg.predict(vec_train)
pred_test = linear_reg.predict(vec_test)

train_MSE = mean_squared_error(label_train, pred_train)
test_MSE = mean_squared_error(label_test, pred_test)

print("Train RMSE Linear Regression:", train_MSE**(1/2))
print("Test RMSE Linear Regression:", test_MSE**(1/2))

Train RMSE Linear Regression: 0.19863267830742606
Test RMSE Linear Regression: 0.2558073996529773


In [16]:
# Data loaders, training and test loader
import torch
from torch.utils.data import Dataset, DataLoader
class mouse_dataset(Dataset):
    
    # Initialize data
    def __init__(self, input_x, input_y):
        self.len = len(input_x)
        self.x_data = torch.Tensor(input_x)
        self.y_data = torch.Tensor(input_y)
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
mouse_train_dataset = mouse_dataset(vec_train, label_train.values)
mouse_test_dataset = mouse_dataset(vec_test, label_test.values)

In [23]:
# Code for training test nn

def train_test_nn(model, train_loader, test_loader, loss_func, opt, epochs=10, writer=None):

    for epoch_index in range(epochs):
        
        # Train nn
        train_epoch_total_loss = 0
        train_total_batches = 0
            
        for i, data in enumerate(train_loader, 0):
            # Get training data, predict, calculate loss
            train_input, train_labels = data 
            train_pred_labels = model(train_input)
            train_batch_loss = loss_func(train_pred_labels, train_labels)
            
            # Back propagation and taking steps for parameters
            opt.zero_grad()
            train_batch_loss.backward()
            opt.step()

            # Metrics purposes
            train_epoch_total_loss += train_batch_loss.item()
            train_total_batches += 1
            
        
        # Test nn
        
        test_epoch_total_loss = 0
        test_total_batches = 0
        
        
        for i, data in enumerate(test_loader, 0):
            
            # Get test data, predict, calculate loss
            test_input, test_labels = data
            test_pred_labels = model(test_input)
            test_batch_loss = loss_func(test_pred_labels, test_labels)
            
            # Metrics purposes
            test_epoch_total_loss += test_batch_loss.item()
            test_total_batches += 1
        
    
        # Write out metrics
        
        if epoch_index % 50 == 0 or epoch_index == epochs - 1:
            print("Epoch", epoch_index, "Training Loss (RMSE):", (train_epoch_total_loss / train_total_batches)**(1/2))
            print("Epoch", epoch_index, "Test Loss (RMSE):", (test_epoch_total_loss / test_total_batches) **(1/2))
            
        writer.add_scalar("Epoch Training Batch Loss Average(RMSE):", (train_epoch_total_loss / train_total_batches)**(1/2), epoch_index)
        writer.add_scalar("Epoch Test Batch Loss Average(RMSE):", (test_epoch_total_loss / test_total_batches)**(1/2), epoch_index)



In [24]:
# Simple NN (Like logistic regression)

from torch import nn

# NN Model
class simpleNN(nn.Module):
    def __init__(self):
        super(simpleNN, self).__init__()
        
        self.lin_1 = nn.Linear(590, 1)
        self.sig = nn.Sigmoid()
        
        self.seq = nn.Sequential(
            self.lin_1,
            self.sig
        )

    def forward(self, x):
        return self.seq(x)

In [25]:
# Dataset loaders and loss function
from torch import optim
from tensorboardX import SummaryWriter

mouse_train_loader = DataLoader(dataset=mouse_train_dataset, batch_size = 32, shuffle = True, num_workers = 2)
mouse_test_loader = DataLoader(dataset=mouse_test_dataset, batch_size = 32, shuffle = True, num_workers = 2)
loss_func = nn.MSELoss()


In [26]:
# Use simple NN
simpleModel = simpleNN()
simpleModelOptimizer = optim.SGD(simpleModel.parameters(), lr = 0.001, weight_decay = 0.00001)
# simpleModelWriter = SummaryWriter("/Users/daveistanto/Dropbox/UIUCGraduateSchool/Researches/CROPSR_head_dir/CROPSR/dev_data_analysis/scripts/runs/simpleNNModel")
simpleModelWriter = SummaryWriter()

train_test_nn(simpleModel, mouse_train_loader, mouse_test_loader, loss_func, simpleModelOptimizer, epochs = 150, writer = simpleModelWriter)

Epoch 0 Training Loss (RMSE): 0.2940901394408326
Epoch 0 Test Loss (RMSE): 0.2957096410308054
Epoch 50 Training Loss (RMSE): 0.2874756686191938
Epoch 50 Test Loss (RMSE): 0.2942950363952383
Epoch 100 Training Loss (RMSE): 0.2869011274074037
Epoch 100 Test Loss (RMSE): 0.2958185688244289
Epoch 149 Training Loss (RMSE): 0.2857156073273778
Epoch 149 Test Loss (RMSE): 0.2926896397582136


In [27]:
# Complex NN

from torch import nn

## Complex NN model

class complexNN(nn.Module):
    def __init__(self):
        super(complexNN, self).__init__()
        self.lin_1 = nn.Linear(590, 300)
        self.lin_2 = nn.Linear(300, 100)
        self.lin_3 = nn.Linear(100,1)
        
        self.seq = nn.Sequential(
            self.lin_1,
            nn.ReLU(),
            self.lin_2,
            nn.ReLU(),
            nn.Dropout(p=0.3),
            self.lin_3,
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.seq(x)
        

In [28]:
# Use complex NN
complexModel = complexNN()
complexModelOptimizer = optim.SGD(complexModel.parameters(), lr = 0.001, weight_decay = 0.00001)
# complexModelWriter = SummaryWriter("/Users/daveistanto/Dropbox/UIUCGraduateSchool/Researches/CROPSR_head_dir/CROPSR/dev_data_analysis/scripts/runs/complexModelWriterExp")
complexModelWriter = SummaryWriter()

train_test_nn(complexModel, mouse_train_loader, mouse_test_loader, loss_func, complexModelOptimizer, epochs = 150, writer = complexModelWriter)

Epoch 0 Training Loss (RMSE): 0.28698250715747264
Epoch 0 Test Loss (RMSE): 0.2970339031927069
Epoch 50 Training Loss (RMSE): 0.2829659645681264
Epoch 50 Test Loss (RMSE): 0.29288827850576754
Epoch 100 Training Loss (RMSE): 0.2825786154656626
Epoch 100 Test Loss (RMSE): 0.2926971458721453
Epoch 149 Training Loss (RMSE): 0.2824152583531393
Epoch 149 Test Loss (RMSE): 0.29565691927086263
