In [1]:
import sys
sys.executable

'C:\\ProgramData\\Anaconda3\\python.exe'

In [1]:
import pandas as pd

In [2]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from collections import defaultdict


# Models

In [3]:
class BasicModule(nn.Module):
    def __init__(self):
        super(BasicModule, self).__init__()
        self.model_name = str(type(self))

    def load(self, path):
        self.load_state_dict(torch.load(path))

    def save(self, path):
        torch.save(self.state_dict(), path)

In [4]:
class SimpleCNN(BasicModule):
    def __init__(self,args):
        super(SimpleCNN, self).__init__()
        # output size of fc layers
        out1 = args['fc1_out']
        out2 = args['fc2_out']
        out3 = args['fc3_out'] # output layer
        self.params = args
        self.k = args['k_max']
        
        # self.k = args['k'] # for k max pooling
        self.kernel_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])
 
        
        # convolution layers
        filter_sizes = [3, 5, 7, 9]
        self.conv1 = nn.Conv2d(1, self.kernel_num[0], (3, 4), bias=True, padding=0)
        self.conv2 = nn.Conv2d(1, self.kernel_num[1], (5, 4), bias=True, padding=0)
        self.conv3 = nn.Conv2d(1, self.kernel_num[2], (7, 4), bias=True, padding=0)
        self.conv4 = nn.Conv2d(1, self.kernel_num[3], (9, 4), bias=True, padding=0)
#         self.conv5 = nn.Conv2d(1, self.kernel_num[4], (11, 4), bias=True, padding=0)

        
        # fc layers
        self.fc1 = nn.Linear((sum([self.kernel_num[i] * (31-filter_sizes[i]) // 2 for i in range(len(filter_sizes))])), out1, bias=True)
        self.fc2 = nn.Linear(out1, out2, bias=True)
        self.fc3 = nn.Linear(out2, out3, bias=False)
        

    def _conv_and_pool(self, x, conv):
        # x: (batch, 1, size, 4)
        x = conv(x)
        # x: (batch, kernel_num, H_out, 1)
        x = F.relu(x)
        # x: (batch, kernel_num, H_out)
        x = F.max_pool2d(x, kernel_size=(2,1), stride=2)
        x = x.view(x.size(0), x.size(1) * x.size(2))
        #  (batch, kernel_num * k)
        return x
    
    def setDropout(self, dropout):
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, size)
        # x: (batch, sentence_length, embed_dim)
        x = x.view(x.size(0),1, 30, 4)
        x1 = self._conv_and_pool(x, self.conv1) 
        x2 = self._conv_and_pool(x, self.conv2)  
        x3 = self._conv_and_pool(x, self.conv3)  
        x4 = self._conv_and_pool(x, self.conv4)  

        
        x = torch.cat((x1, x2, x3, x4), 1) 
        x = self.dropout(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        return self.fc3(x)

In [5]:
class DNADataset(Dataset):
    
    def __init__(self, data, size):
        super(DNADataset, self).__init__()
        self.size = size
        self.x = np.zeros((len(data),size, 4))
        self.y = []

        for I in range(len(data)):
            self.y.append(data[I][1])
            if type(data[0][0])==str:
                for i in range(size):
                    seq = data[I][0].lower()
                    # one hot encoding
                    pos = "acgt".find(seq[i])
                    if pos >= 0:
                        self.x[I][i][pos] = 1
            else:
                self.x[I] = data[I][0]
        self.x = torch.FloatTensor(self.x)
        self.y = torch.FloatTensor(self.y)

        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
        
    # return a subset of dataset of given range
    def get_subset(self, start, end):
        
        return DNADataset([(self.x[i],self.y[i]) for i in range(start, end)], self.size)

In [6]:
#
def load_data(file):
    data = []
    with open(file,"r") as f:
        for line in f:
            line = line.split()
            data.append((line[0], float(line[1])))
    return data

# Training


In [7]:
#
train_set = DNADataset(load_data("../dataset/train_txt"),30)
test_set = DNADataset(load_data("../dataset/test_txt"),30)
endo = load_data("../dataset/table3")
endo_dataset = DNADataset(endo,30)

In [8]:
test_1a_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[1])[1]
test_1a = DNADataset([(test_1a_df[test_1a_df.columns[0]].values[i], float(test_1a_df[test_1a_df.columns[1]].values[i])) for i in range(len(test_1a_df)) ], 30)

In [9]:
test_1b_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[2])[2]
test_1b = DNADataset([(test_1b_df[test_1b_df.columns[0]].values[i], float(test_1b_df[test_1b_df.columns[1]].values[i])) for i in range(len(test_1b_df)) ], 30)

In [20]:
def train(model, train_dataset, epochs, batch_size, val_set, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    subset = train_dataset.get_subset(start, end)
    n_training_samples = len(subset) * (1-cv)
    n_val_samples = len(subset) * cv
    train_loader =torch.utils.data.DataLoader(subset, batch_size=batch_size,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, dtype=np.int64)
                                              ),
                                              num_workers=0)
    val_loader =torch.utils.data.DataLoader(subset, batch_size=100,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, n_training_samples + n_val_samples, dtype=np.int64)
                                              ), num_workers=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()


    print("Train %s samples."%n_training_samples)
    
    for e in range(epochs):
        # train loss
        epoch_train_loss = 0.0
        model.setDropout(model.params['dropout'])
        for inp, scores in train_loader:
            inp = inp.unsqueeze(1)
            inp = Variable(inp)
            out = model(inp).view((-1))

            scores = Variable(torch.FloatTensor(scores))
            loss = criterion(out, scores)
            epoch_train_loss += loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
                
        # validation loss
#         epoch_val_loss = 0.0
#         for inp, labels in val_loader:
#             inp = inp.unsqueeze(1)
#             out = model(inp)        
#             loss = criterion(out, labels)
#             epoch_val_loss += loss


#         print(str(epoch_train_loss) + ' ')
#         print(str(epoch_val_loss.tolist()) + '\n')
#         model.setDropout(0.0)
#         print(e)
#         tscores = test_scores(model, val_set, 400)
#         corr = pd.DataFrame([[val_set[i][0], float(val_set[i][1]), tscores[i]] for i in range(len(tscores))], columns=['seq', 'endogenous', 'integrated'])\
#     [['endogenous', 'integrated']].corr(method='spearman')['endogenous']['integrated']
#         print(corr)
#         if corr >= 0.68:
#             print("Saving.")
#             model.save("../model/cnn_3579_90-60-40-110_80-60_90_%s.model"%(round(corr, 3)))

In [21]:
args = {
    'kernel_num': [90, 60, 40, 110],
    'dropout': 0.15,
    'fc1_out': 80,
    'fc2_out': 40,
    'fc3_out': 1,
    'input_max_length':30,
    'k_max' : 1
}
model = SimpleCNN(args)
# model.float()
print(model.fc1.weight.shape)
train(model, train_set, 5, 70, endo_dataset, cv=0.0, learning_rate=0.003, start=0, end=12832)

torch.Size([80, 3730])
Train 12832.0 samples.


In [22]:
model(train_set.x[:120].view(120,30,4))

tensor([[10.8422],
        [69.2310],
        [46.4673],
        [51.6845],
        [36.2052],
        [13.7545],
        [31.0061],
        [25.9110],
        [55.5216],
        [38.4801],
        [24.2505],
        [18.2054],
        [26.4022],
        [55.7918],
        [26.8448],
        [24.4465],
        [41.9331],
        [40.7746],
        [52.3995],
        [40.0110],
        [40.5997],
        [44.8359],
        [24.8150],
        [24.3695],
        [29.9257],
        [56.5598],
        [28.7523],
        [ 0.4242],
        [49.2482],
        [29.2895],
        [ 7.4175],
        [15.9620],
        [42.8520],
        [58.3679],
        [32.2380],
        [41.6571],
        [33.4925],
        [26.6476],
        [25.4766],
        [41.6155],
        [29.5063],
        [15.1500],
        [36.5635],
        [49.4341],
        [66.8345],
        [28.3805],
        [19.9329],
        [61.4743],
        [55.1722],
        [ 0.5174],
        [38.9662],
        [28.2399],
        [29.

In [88]:
train(model, train_set, 10, 90, endo_dataset, cv=0.0, start=0, end=len(train_set))

Train 12832.0 samples.
0
0.6574728560188827
1
0.6520283241542093
2
0.6569819040125885
3
0.6614885916601102
4
0.6619417781274587
5
0.6430653029110937
6
0.6542061369000787
7
0.6616207710464201
8
0.668053501180173
9
0.6567049567269866


# Correlation

In [12]:
def endo_scores(dataset):
    escores = []
    for inp, scores in DataLoader(dataset, batch_size=100):
        inp = inp.unsqueeze(1)
        inp = Variable(inp)
        out = model(inp).view((-1))
        for v in out.detach().numpy():
            escores.append(v)
    return escores

In [49]:
escores = endo_scores(test_set)

NameError: name 'endo_scores' is not defined

In [50]:
combined_df = pd.DataFrame([[endo[i][0], endo[i][1], escores[i]] for i in range(len(endo))], columns=['seq', 'endogenous', 'integrated'] )

In [51]:
combined_df[['endogenous', 'integrated']].corr(method='spearman')

Unnamed: 0,endogenous,integrated
endogenous,1.0,0.655792
integrated,0.655792,1.0


## Test

In [13]:
#
def test_scores(model, test_set, batch_size):
    tscores = []
    for inp, scores in DataLoader(test_set, batch_size=batch_size):
        inp = inp.unsqueeze(1)
        inp = Variable(inp)
        out = model(inp).view((-1))
        for v in out.detach().numpy():
            tscores.append(v)
    return tscores

In [14]:
tscores = test_scores(model, test_set, 320)
combined_df = pd.DataFrame([[test_set[i][0], float(test_set[i][1]), tscores[i]] for i in range(len(tscores))], columns=['seq', 'endogenous', 'integrated'] )

In [15]:
combined_df [['endogenous', 'integrated']].corr(method="spearman")

Unnamed: 0,endogenous,integrated
endogenous,1.0,0.662335
integrated,0.662335,1.0


In [138]:
tscores = test_scores(model, test_set, 320)
combined_df = pd.DataFrame([[test_set[i][0], float(test_set[i][1]), tscores[i]] for i in range(len(tscores))], columns=['seq', 'endogenous', 'integrated'] )

In [139]:
combined_df [['endogenous', 'integrated']].corr(method='spearman')

Unnamed: 0,endogenous,integrated
endogenous,1.0,0.770316
integrated,0.770316,1.0
