In [2]:
# Display the path of the Python interpreter backing this kernel.
import sys
sys.executable

'/usr/bin/python3'

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.utils.data.sampler import SubsetRandomSampler


In [2]:
class BasicModule(nn.Module):
    """``nn.Module`` base class that adds state-dict save/load helpers."""

    def __init__(self):
        super().__init__()
        # String form of the concrete class of this instance.
        self.model_name = str(self.__class__)

    def save(self, path):
        """Serialize this module's state dict to ``path``."""
        state = self.state_dict()
        torch.save(state, path)

    def load(self, path):
        """Restore this module's parameters from a state dict stored at ``path``."""
        state = torch.load(path)
        self.load_state_dict(state)

In [65]:
# -*- coding: utf-8 -*-


class SimpleCNN(BasicModule):
    """Multi-width CNN regressor over 4-channel (one-hot) sequences.

    Four parallel ``Conv2d`` branches with kernel heights 3, 5, 7 and 9 (each
    spanning all 4 columns) are applied to the input. Every branch is reduced
    with k-max pooling, the branch features are concatenated, and the result
    goes through three fully connected layers with ReLU and dropout in between.

    Keys read from ``args``:
        fc1_out, fc2_out, fc3_out: output sizes of the three linear layers
            (``fc3_out`` is the size of the final output).
        k_max: number of activations kept per filter by k-max pooling.
        kernel_num: filter count per branch. Entries 0-3 size the four conv
            branches while ``fc1`` is sized from the sum of *all* entries, so
            the list must contain exactly four values.
        dropout: dropout probability.
    """

    def __init__(self,args):
        super(SimpleCNN, self).__init__()
        # output sizes of the fc layers (out3 is the output layer)
        out1 = args['fc1_out']
        out2 = args['fc2_out']
        out3 = args['fc3_out']
        self.params = args
        self.k = args['k_max']  # k for k-max pooling
        self.kernel_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])

        # convolution branches: one input channel, kernels cover all 4 columns
        self.conv1 = nn.Conv2d(1, self.kernel_num[0], (3, 4), bias=True)
        self.conv2 = nn.Conv2d(1, self.kernel_num[1], (5, 4), bias=True)
        self.conv3 = nn.Conv2d(1, self.kernel_num[2], (7, 4), bias=True)
        self.conv4 = nn.Conv2d(1, self.kernel_num[3], (9, 4), bias=True)

        # fc layers; fc1 consumes k values per filter from every branch
        self.fc1 = nn.Linear((sum(self.kernel_num))*self.k, out1, bias=True)
        self.fc2 = nn.Linear(out1, out2, bias=True)
        self.fc3 = nn.Linear(out2, out3, bias=True)

    def _conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU + k-max pooling and flatten per sample.

        Args:
            x: tensor of shape (batch, 1, length, 4).
            conv: one of the ``Conv2d`` branches.

        Returns:
            Tensor of shape (batch, out_channels * k).
        """
        # x: (batch, 1, length, 4)
        x = conv(x)
        # x: (batch, kernel_num, H_out, 1) -- the kernel width equals the input width
        x = F.relu(x.squeeze(3))
        # x: (batch, kernel_num, H_out)
        x = self.kmax_pooling(x, 2, k=self.k)
        # x: (batch, kernel_num, k)
        x = x.view(x.size(0), x.size(1) * x.size(2))
        # x: (batch, kernel_num * k)
        return x

    def setDropout(self, dropout):
        """Replace the dropout layer with a new one using probability ``dropout``."""
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute the network output for a batch.

        Args:
            x: tensor with ``batch * length * 4`` elements, e.g. of shape
                (batch, length, 4) or (batch, 1, length, 4).

        Returns:
            Tensor of shape (batch, fc3_out).
        """
        # Infer the sequence length from the input instead of hard-coding 30,
        # so the same model definition works for any length the convolutions
        # and k-max pooling can handle. reshape also accepts non-contiguous input.
        x = x.reshape(x.size(0), 1, -1, 4)
        x1 = self._conv_and_pool(x, self.conv1)  # (batch, kernel_num[0] * k)
        x2 = self._conv_and_pool(x, self.conv2)  # (batch, kernel_num[1] * k)
        x3 = self._conv_and_pool(x, self.conv3)  # (batch, kernel_num[2] * k)
        x4 = self._conv_and_pool(x, self.conv4)  # (batch, kernel_num[3] * k)

        x = torch.cat((x1, x2, x3, x4), 1)  # (batch, sum(kernel_num) * k)
        x = self.dropout(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.dropout(x)
        return self.fc3(x)

    def kmax_pooling(self, x, dim, k):
        """Keep the ``k`` largest values along ``dim``, preserving their original order."""
        # topk gives the positions of the k largest values; sorting those
        # positions restores the order in which they occur along ``dim``.
        index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
        return x.gather(dim, index)

In [22]:
class DNADataset(Dataset):
    """In-memory dataset of one-hot encoded sequences with float targets.

    Attributes:
        size: number of positions encoded per sequence.
        x: float32 tensor of shape (len(data), size, 4); the last axis is the
            one-hot channel in the order a, c, g, t.
        y: float tensor built from the per-item targets.
    """

    # Position of each base in the one-hot channel axis.
    _BASES = "acgt"

    def __init__(self, data, size):
        """Encode ``data`` into tensors.

        Args:
            data: sequence of items where ``item[0]`` is either a string
                (encoded case-insensitively, one row per character) or an
                array-like of shape (size, 4) that is copied as-is, and
                ``item[1]`` is the target value.
            size: number of positions to encode for string inputs.

        Characters outside ``acgt`` are encoded as an all-zero row. Strings
        shorter than ``size`` are zero-padded at the end in the same way
        instead of raising ``IndexError``.
        """
        super(DNADataset, self).__init__()
        self.size = size
        encoded = np.zeros((len(data), size, 4), dtype=np.float32)
        targets = []

        for row in range(len(data)):
            features = data[row][0]
            targets.append(data[row][1])
            # Decide per item (not from the first item only) how to encode it.
            if isinstance(features, str):
                seq = features.lower()  # hoisted: identical for every position
                for pos in range(min(size, len(seq))):
                    # one hot encoding
                    channel = self._BASES.find(seq[pos])
                    if channel >= 0:
                        encoded[row, pos, channel] = 1
            else:
                encoded[row] = features

        self.x = torch.from_numpy(encoded)
        self.y = torch.FloatTensor(targets)

    def __len__(self):
        """Number of items in the dataset."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(encoded_sequence, target)`` pair at ``index``."""
        return self.x[index], self.y[index]

    def get_subset(self, start, end):
        """Return a new ``DNADataset`` holding items ``start`` .. ``end - 1``."""
        # Items are passed already encoded, so the constructor copies them
        # through its non-string branch.
        return DNADataset([(self.x[i],self.y[i]) for i in range(start, end)], self.size)

In [5]:
#
def load_data(file):
    """Read whitespace-separated ``<sequence> <score>`` records from a text file.

    Args:
        file: path of the file to read.

    Returns:
        List of ``(sequence, score)`` tuples in file order, where ``sequence``
        is the first field as a string and ``score`` is the second field
        converted to ``float``. Any further fields on a line are ignored.

    Blank or whitespace-only lines (e.g. a trailing newline at the end of the
    file) are skipped instead of raising ``IndexError``.
    """
    data = []
    with open(file,"r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            data.append((fields[0], float(fields[1])))
    return data

In [23]:
#
# Load the train/test text files and one-hot encode 30 positions per sequence.
train_set = DNADataset(data=load_data(file="../dataset/train_txt"), size=30)
test_set = DNADataset(data=load_data(file="../dataset/test_txt"), size=30)


In [54]:
def train(model, train_dataset, epochs, batch_size, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    """Fit ``model`` with Adam/MSE on a slice of ``train_dataset``.

    After every epoch dropout is switched off, the notebook-level ``endo`` /
    ``endo_dataset`` data are scored through ``endo_scores`` and the Spearman
    correlation between the reference values and the predictions is printed.
    When that correlation reaches 0.65 the model's state dict is saved to
    ``../model/cnn_<correlation rounded to 3 decimals>.t`` and the model
    parameters dict is printed.

    Args:
        model: network exposing ``setDropout``, ``params`` and ``save``.
        train_dataset: dataset exposing ``get_subset(start, end)``.
        epochs: number of passes over the training samples.
        batch_size: mini-batch size.
        cv: fraction of the slice held out at its end. The held-out samples
            are excluded from training but not evaluated yet
            (TODO: add a validation pass).
        learning_rate: Adam learning rate.
        start: first index of the slice of ``train_dataset`` to use.
        end: exclusive end index of the slice; ``0`` (the default) means
            "through the end of ``train_dataset``" rather than an empty slice.

    Note:
        Relies on the notebook globals ``endo``, ``endo_dataset`` and
        ``endo_scores``. As written in this notebook, ``endo_scores`` scores
        the global ``model``, so the reported correlation only describes the
        network being trained when that is the same object.
    """
    if not end:
        end = len(train_dataset)
    subset = train_dataset.get_subset(start, end)

    # Whole-sample counts: the first n_training_samples items are trained on,
    # the remaining n_val_samples items are held out.
    n_val_samples = int(len(subset) * cv)
    n_training_samples = len(subset) - n_val_samples
    train_loader =torch.utils.data.DataLoader(subset, batch_size=batch_size,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, dtype=np.int64)
                                              ),
                                              num_workers=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    model.float()

    print("Train %s samples."%n_training_samples)

    for e in range(epochs):
        # Re-enable dropout for the optimisation pass.
        model.setDropout(model.params['dropout'])
        for inp, scores in train_loader:
            # (batch, size, 4) -> (batch, 1, size, 4)
            inp = inp.unsqueeze(1)
            out = model(inp).view((-1))
            loss = criterion(out, scores)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate with dropout disabled and without building autograd graphs.
        model.setDropout(0.0)
        print(e)
        with torch.no_grad():
            # Score the evaluation set once per epoch (not once per row).
            predicted = endo_scores(endo_dataset)
        corr = pd.DataFrame(
            [[endo[i][0], endo[i][1], predicted[i]] for i in range(len(endo))],
            columns=['seq', 'endogenous', 'integrated'],
        )[['endogenous', 'integrated']].corr(method='spearman')['endogenous']['integrated']
        print(corr)
        if corr >= 0.65:
            # Fill in the placeholder so each checkpoint is named after its score.
            model.save("../model/cnn_%s.t" % round(float(corr), 3))
            print(model.params)

In [43]:
# Scratch check: round() with 3 decimal places.
a = 0.333234
round(a,3)

0.333

In [71]:
# Hyperparameters for SimpleCNN. 'input_max_length' is not among the keys
# that SimpleCNN.__init__ reads in this notebook.
args = dict(
    kernel_num=[100, 60, 60, 100],
    dropout=0.5,
    fc1_out=80,
    fc2_out=60,
    fc3_out=1,
    input_max_length=30,
    k_max=5,
)
model = SimpleCNN(args)
# 30 epochs, batch size 120, nothing held out, on items 0..12832 of train_set.
train(model, train_set, epochs=30, batch_size=120, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
0.44866089693155
1
0.42365381589299766
2
0.46391817466561763
3
0.5055609756097561
4
0.5460896931549961
5
0.5954807238394965
6
0.612059795436664
7
0.6328623131392604
8
0.6376774193548387
9
0.6235216365066877
10
0.6368717545239968
11
0.6381306058221873
12
0.6450983477576712
13
0.6383802334866022
14
0.6468040912667191
15
0.6475719905586153
16
0.63960346184107
17
0.65160448718332
{'kernel_num': [100, 60, 60, 100], 'dropout': 0.5, 'fc1_out': 80, 'fc2_out': 60, 'fc3_out': 1, 'input_max_length': 30, 'k_max': 5}
18
0.6360198835883304
19
0.643648534459545
20
0.6293375295043273
21
0.6320167301608614
22
0.637639653815893
23
0.6381725226955544
24
0.6398971516878916
25
0.6203282774646192
26
0.6209976396538159
27
0.6253973249409913
28
0.6348512981904013
29
0.6245161290322581


In [72]:
# Continue training the same `model` object for 15 more epochs with batch size 160.
train(model, train_set, 15, 160, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
0.6412651455546814
1
0.6378158929976396
2
0.6358457907159717
3
0.6421715184893785
4
0.6295263571990558
5
0.6285192761605035
6
0.6233516915814319
7
0.6124626278520849
8
0.6139669551534225
9
0.6206766325727774
10
0.6261589299763965
11
0.6256428009441385
12
0.6198898505114083
13
0.6127584579071598
14
0.6140802517702596


In [496]:
# NOTE(review): execution count 496 is out of sequence, and the stored output
# prints a full correlation matrix per epoch, so it appears to come from an
# earlier version of train(). Re-run after a kernel restart to refresh it.
train(model, train_set, 10, 100, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous      1.0000      0.6534
integrated      0.6534      1.0000
1
            endogenous  integrated
endogenous    1.000000    0.648365
integrated    0.648365    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.646414
integrated    0.646414    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.664277
integrated    0.664277    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.669432
integrated    0.669432    1.000000
5
            endogenous  integrated
endogenous     1.00000     0.67837
integrated     0.67837     1.00000
6
            endogenous  integrated
endogenous    1.000000    0.656076
integrated    0.656076    1.000000
7
            endogenous  integrated
endogenous     1.00000     0.65595
integrated     0.65595     1.00000
8
            endogenous  integrated
endogenous    1.000000    0.655068
integrated    0.655068    1.000000
9
            

In [480]:
# NOTE(review): execution count 480 is out of sequence, and the stored output
# prints a full correlation matrix per epoch, so it appears to come from an
# earlier version of train(). Re-run after a kernel restart to refresh it.
train(model, train_set, 10, 100, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.612601
integrated    0.612601    1.000000
1
            endogenous  integrated
endogenous    1.000000    0.612393
integrated    0.612393    1.000000
2
            endogenous  integrated
endogenous     1.00000     0.60788
integrated     0.60788     1.00000
3
            endogenous  integrated
endogenous    1.000000    0.609152
integrated    0.609152    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.614974
integrated    0.614974    1.000000
5
            endogenous  integrated
endogenous    1.000000    0.618713
integrated    0.618713    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.615899
integrated    0.615899    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.603946
integrated    0.603946    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.611355
integrated    0.611355    1.000000
9
            

# Correlation

In [25]:
import pandas as pd

In [26]:
# (sequence, score) pairs used for the correlation check below.
endo = load_data("../dataset/table3")



In [27]:
# One-hot encode 30 positions of every sequence in `endo`.
endo_dataset = DNADataset(endo,30)


In [28]:
def endo_scores(dataset, net=None):
    """Run a network over ``dataset`` and collect its outputs in dataset order.

    Args:
        dataset: dataset yielding ``(input, target)`` pairs; targets are ignored.
        net: callable applied to each input batch after a channel axis is
            inserted at position 1. Defaults to the notebook-level ``model``.

    Returns:
        Flat list with one NumPy scalar per output element.

    Inference runs in batches of 100 under ``torch.no_grad()``. Dropout is not
    changed here, so predictions reflect whatever dropout state the network is
    currently in.
    """
    if net is None:
        net = model  # fall back to the notebook's global model
    escores = []
    with torch.no_grad():
        for inp, _ in DataLoader(dataset, batch_size=100):
            out = net(inp.unsqueeze(1)).view((-1))
            escores.extend(out.numpy())
    return escores

In [29]:
# Model outputs for every item of endo_dataset, in dataset order.
escores = endo_scores(endo_dataset)

In [45]:
# One row per entry of `endo`: its sequence, its reference score and the model output.
combined_df = pd.DataFrame(
    [[endo[idx][0], endo[idx][1], escores[idx]] for idx, _ in enumerate(endo)],
    columns=['seq', 'endogenous', 'integrated'],
)

In [52]:
# Spearman rank correlation between the reference scores and the model outputs.
score_columns = combined_df[['endogenous', 'integrated']]
score_columns.corr(method='spearman').loc['integrated', 'endogenous']

0.4102470495672699