In [1]:
import sys
# Display the path of the Python interpreter that backs this kernel.
sys.executable

'/usr/bin/python3'

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.utils.data.sampler import SubsetRandomSampler


In [2]:


class BasicModule(nn.Module):
    """``nn.Module`` base class that records its class name and adds
    state-dict ``save``/``load`` helpers."""

    def __init__(self):
        super().__init__()
        # String form of the concrete class, e.g. "<class '__main__.DPCNN'>".
        self.model_name = str(self.__class__)

    def load(self, path):
        """Read a state dict from ``path`` and copy it into this module."""
        state = torch.load(path)
        self.load_state_dict(state)

    def save(self, path):
        """Serialize this module's state dict to ``path``."""
        state = self.state_dict()
        torch.save(state, path)

In [42]:
class DPCNN(BasicModule):
    """Convolutional regressor over one-hot sequences with stride-2 downsampling blocks.

    The input is reshaped to ``(batch, 1, input_max_length, 4)``, passed through
    a (3, 4) region convolution and two (3, 1) convolutions, then downsampled by
    ``_block`` until no more than ``self.nl`` rows remain. The flattened
    features go through dropout and a single linear layer.

    ``args`` is a dict; the keys read here are ``'input_max_length'``,
    ``'dropout'``, ``'kernel_num'`` (an int: number of channels) and
    ``'fc3_out'`` (size of the output). The dict itself is kept in
    ``self.params``.

    Note: the final ``view`` needs exactly ``self.nl`` rows after downsampling.
    ``input_max_length=30`` satisfies this (28 -> 14 -> 7 -> 3); other lengths
    may stop below ``self.nl`` and fail in ``forward``.
    """

    def __init__(self,args):
        super(DPCNN, self).__init__()
        self.max_len = args['input_max_length']
        self.params = args

        self.kn_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])
        # Number of rows the feature map is reduced to before the linear layer.
        self.nl = 3

        out3 = args['fc3_out'] # output layer

        # Region embedding: each filter spans 3 positions and all 4 one-hot channels.
        self.conv_region_embedding = nn.Conv2d(1, self.kn_num, (3, 4), stride=1, bias=True)
        # Shared (3, 1) convolution, reused for every convolution after the embedding.
        self.conv3 = nn.Conv2d(self.kn_num, self.kn_num, (3, 1), stride=1, bias=True)
        self.pooling = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        # One zero row above and below, so conv3 keeps the row count unchanged.
        self.padding_conv = nn.ZeroPad2d((0, 0, 1, 1))
        # One zero row below, applied before pooling.
        self.padding_pool = nn.ZeroPad2d((0, 0, 0, 1))
        # Single linear head mapping the flattened features straight to the output.
        # (forward() previously also called fc2/fc3, which are not defined.)
        self.fc1 = nn.Linear(self.nl*self.kn_num, out3, bias=True)

    def forward(self, x):
        """Return predictions of shape ``(batch, fc3_out)``.

        ``x`` must hold ``input_max_length * 4`` values per sample.
        """
        batch = x.size(0)
        x = x.view(batch, 1, self.max_len, 4)
        # Region embedding
        x = self.conv_region_embedding(x)
        # [batch_size, channel_size, text_length-2, 1]
        x = self.padding_conv(x)
        # [batch_size, channel_size, text_length, 1]
        x = F.relu(x)
        x = self.conv3(x)
        # [batch_size, channel_size, text_length-2, 1]
        x = self.padding_conv(x)
        # [batch_size, channel_size, text_length, 1]
        x = F.relu(x)
        x = self.conv3(x)
        # [batch_size, channel_size, text_length-2, 1]

        # Downsample until no more than self.nl rows are left.
        while x.size()[-2] > self.nl:
            x = self._block(x)

        x = x.view(batch, self.nl*self.kn_num)
        x = self.dropout(x)
        # fc1 is the only linear layer defined in __init__, so it is the output layer.
        return self.fc1(x)

    def setDropout(self, dropout):
        """Replace the dropout layer with a new one using probability ``dropout``."""
        self.dropout = nn.Dropout(dropout)

    def _block(self, x):
        """Downsample the rows with stride 2, apply two convolutions, add the shortcut."""
        # Pooling: pad one row at the bottom, then max-pool (kernel 3, stride 2).
        x = self.padding_pool(x)
        px = self.pooling(x)

        # Two pad -> ReLU -> conv3 steps; each keeps the row count of px.
        x = self.padding_conv(px)
        x = F.relu(x)
        x = self.conv3(x)

        x = self.padding_conv(x)
        x = F.relu(x)
        x = self.conv3(x)

        # Shortcut connection from the pooled input.
        x = x + px

        return x



In [60]:
# Input tensor of the third training sample; not referenced by the other cells shown here.
ts = train_set[2][0]
# Hyper-parameter dict handed to the DPCNN constructor.
args = {
    'kernel_num': 75,
    'dropout': 0.5,
    'fc1_out': 80,
    'fc2_out': 60,
    'fc3_out': 1,
    'input_max_length': 30,
    'k_max': 5,
}
model = DPCNN(args)

In [61]:
# Train `model` for 20 epochs with batch size 160 on samples 0..12832 of train_set,
# holding nothing out (cv=0.0).
train(model, train_set, 20, 160, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.360623
integrated    0.360623    1.000000
1
            endogenous  integrated
endogenous    1.000000    0.370656
integrated    0.370656    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.374936
integrated    0.374936    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.375799
integrated    0.375799    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.389923
integrated    0.389923    1.000000
5
            endogenous  integrated
endogenous    1.000000    0.428217
integrated    0.428217    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.434348
integrated    0.434348    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.448774
integrated    0.448774    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.471748
integrated    0.471748    1.000000
9
            

In [59]:
# Run 10 epochs of training on `model` with batch size 160 on samples 0..12832 of
# train_set, holding nothing out (cv=0.0).
train(model, train_set, 10, 160, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.525205
integrated    0.525205    1.000000
1
            endogenous  integrated
endogenous     1.00000     0.54259
integrated     0.54259     1.00000
2
            endogenous  integrated
endogenous    1.000000    0.548696
integrated    0.548696    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.559937
integrated    0.559937    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.550848
integrated    0.550848    1.000000
5
            endogenous  integrated
endogenous    1.000000    0.550716
integrated    0.550716    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.547651
integrated    0.547651    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.552516
integrated    0.552516    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.552818
integrated    0.552818    1.000000
9
            

KeyboardInterrupt: 

In [4]:

# class SimpleCNN(BasicModule):
#     def __init__(self,args):
#         super(SimpleCNN, self).__init__()
#         # output size of fc layers
#         out1 = args['fc1_out']
#         out2 = args['fc2_out']
#         out3 = args['fc3_out'] # output layer
#         self.params = args
#         self.k = args['k_max']
        
#         # self.k = args['k'] # for k max pooling
#         self.kernel_num = args['kernel_num']
#         self.dropout = nn.Dropout(args['dropout'])
 
        
#         # convolution layers
#         self.conv1 = nn.Conv2d(1, self.kernel_num[0], (3, 4), bias=True)
#         self.conv2 = nn.Conv2d(1, self.kernel_num[1], (5, 4), bias=True)
#         self.conv3 = nn.Conv2d(1, self.kernel_num[2], (7, 4), bias=True)
#         self.conv4 = nn.Conv2d(1, self.kernel_num[3], (9, 4), bias=True)
#         self.conv5 = nn.Conv2d(1, 100, (11, 4), bias=True)
        
#         # fc layers
#         self.fc1 = nn.Linear((sum(self.kernel_num)+100)*self.k, out1, bias=True)
#         self.fc2 = nn.Linear(out1, out2, bias=True)
#         self.fc3 = nn.Linear(out2, out3, bias=True)
        
#         # init
# #         init.normal_(self.conv1.weight)
# #         init.normal_(self.conv2.weight)
# #         init.normal_(self.conv3.weight)
# #         init.normal_(self.conv1.bias)
# #         init.normal_(self.conv2.bias)
# #         init.normal_(self.conv3.bias)
        
# #         init.normal_(self.fc1.weight)
# #         init.normal_(self.fc2.weight)
# #         init.normal_(self.fc3.weight)

#     def _conv_and_pool(self, x, conv):
#         # x: (batch, 1, size, 4)
#         x = conv(x)
#         # x: (batch, kernel_num, H_out, 1)
#         x = F.relu(x.squeeze(3))
#         # x: (batch, kernel_num, H_out)
# #         x = F.max_pool1d(x, x.size(2)).squeeze(2)
#         x = self.kmax_pooling(x, 2, k=self.k)
#         x = x.view(x.size(0), x.size(1) * x.size(2))
#         #  (batch, kernel_num * k)
#         return x
    
#     def setDropout(self, dropout):
#         self.dropout = nn.Dropout(dropout)


#     def forward(self, x):
#         # x: (batch, size)
#         # x: (batch, sentence_length, embed_dim)
#         x = x.view(x.size(0),1, 30, 4)
#         x1 = self._conv_and_pool(x, self.conv1)  # (batch, kernel_num * k)
#         x2 = self._conv_and_pool(x, self.conv2)  # (batch, kernel_num * k)
#         x3 = self._conv_and_pool(x, self.conv3)  # (batch, kernel_num * k)
#         x4 = self._conv_and_pool(x, self.conv4)  # (batch, kernel_num * k)
#         x5 = self._conv_and_pool(x, self.conv5)  # (batch, kernel_num * k)

        
#         x = torch.cat((x1, x2, x3, x4, x5), 1)  # (batch, 4 * kernel_num * k)
#         x = self.dropout(x)
#         x = self.fc1(x)
#         x = F.relu(x)
#         x = self.dropout(x)
#         x = self.fc2(x)
#         x = F.relu(x)
#         x = self.dropout(x)
# #         x = 
# #         logit = F.log_softmax(x, dim=1)
#         return self.fc3(x)
    
#     def kmax_pooling(self, x, dim, k):
#         index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
#         return x.gather(dim, index)

In [5]:
class DNADataset(Dataset):
    """Dataset of fixed-length sequences with one target value per sample.

    ``data`` is a sequence of items where ``item[0]`` is the input and
    ``item[1]`` is the target. A string input is lower-cased and one-hot
    encoded over the alphabet "acgt" into a ``(size, 4)`` matrix. Characters
    outside the alphabet, and positions beyond the end of a string shorter than
    ``size``, stay all-zero; characters beyond ``size`` are ignored. A
    non-string input is assumed to be an already encoded ``(size, 4)`` array or
    tensor and is copied as-is.

    Attributes:
        size: number of positions per sample.
        x: ``torch.float64`` tensor of shape ``(len(data), size, 4)``.
        y: list of targets in the same order as ``x``.
    """

    def __init__(self, data, size):
        super(DNADataset, self).__init__()
        self.size = size
        encoded = np.zeros((len(data), size, 4))
        self.y = []

        for row, item in enumerate(data):
            sample = item[0]
            self.y.append(item[1])
            # Decide per item, so every item is handled according to its own type.
            if isinstance(sample, str):
                seq = sample.lower()
                # min() guards against sequences shorter than `size`.
                for col in range(min(size, len(seq))):
                    # one hot encoding; find() returns -1 for unknown characters
                    channel = "acgt".find(seq[col])
                    if channel >= 0:
                        encoded[row, col, channel] = 1
            else:
                encoded[row] = np.asarray(sample)
        self.x = torch.DoubleTensor(encoded)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    # return a subset of dataset of given range
    def get_subset(self, start, end):
        """Return a new DNADataset holding samples ``start`` .. ``end - 1``."""
        pairs = [(self.x[i], self.y[i]) for i in range(start, end)]
        return DNADataset(pairs, self.size)

In [6]:
#
def load_data(file):
    """Read whitespace-separated ``<sequence> <score>`` records from a text file.

    Args:
        file: path of the file to read.

    Returns:
        A list of ``(sequence, score)`` tuples in file order, where ``sequence``
        is the first field as a string and ``score`` is the second field
        converted to ``float``. Any further fields on a line are ignored.

    Blank or whitespace-only lines are skipped. A line with a single field
    raises ``IndexError``; a non-numeric score raises ``ValueError``.
    """
    data = []
    with open(file,"r") as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Tolerate empty lines, e.g. a trailing newline at end of file.
                continue
            data.append((fields[0], float(fields[1])))
    return data

In [7]:
# Load the train and test files and encode every sequence over 30 positions.
train_set = DNADataset(load_data("../dataset/train_txt"),30)
test_set = DNADataset(load_data("../dataset/test_txt"),30)


In [37]:
def train(model, train_dataset, epochs, batch_size, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    """Train ``model`` with Adam and MSE loss, printing a Spearman correlation per epoch.

    Args:
        model: module exposing ``params['dropout']`` and ``setDropout(p)``; it is
            converted to double precision in place.
        train_dataset: dataset providing ``get_subset(start, end)``.
        epochs: number of passes over the training indices.
        batch_size: mini-batch size for training.
        cv: fraction of the selected range held out from training. The held-out
            tail is only excluded; it is not evaluated here.
        learning_rate: Adam learning rate.
        start, end: range of samples taken from ``train_dataset`` (``end`` is
            exclusive). With the default ``end=0`` the range is empty.

    After each epoch, dropout is set to 0.0 and the notebook-level ``endo``,
    ``endo_dataset`` and ``endo_scores`` are used to print the Spearman
    correlation between the measured and predicted scores. ``endo_scores`` is
    called with the dataset only, so it scores whatever module it resolves on
    its own (the notebook-level ``model``); pass that same object here.
    Dropout is left at 0.0 when the function returns.
    """
    subset = train_dataset.get_subset(start, end)
    n_total = len(subset)
    # Integer split sizes: these are used as index bounds, not just for display.
    n_val_samples = int(round(n_total * cv))
    n_training_samples = n_total - n_val_samples
    train_loader = torch.utils.data.DataLoader(
        subset,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(np.arange(n_training_samples, dtype=np.int64)),
        num_workers=3,
    )

    # Convert to float64 before the optimizer captures the parameters.
    model.double()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    print("Train %s samples."%n_training_samples)

    for e in range(epochs):
        # Re-enable dropout, which the evaluation step below switches off.
        model.setDropout(model.params['dropout'])
        for inp, scores in train_loader:
            inp = inp.unsqueeze(1)
            out = model(inp).view(-1)
            loss = criterion(out, scores.double())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate with dropout disabled.
        model.setDropout(0.0)
        print(e)
        # Score the endogenous set once per epoch rather than once per row.
        predicted = endo_scores(endo_dataset)
        comparison = pd.DataFrame(
            [[endo[i][0], endo[i][1], predicted[i]] for i in range(len(endo))],
            columns=['seq', 'endogenous', 'integrated'],
        )
        print(comparison[['endogenous', 'integrated']].corr(method='spearman'))

In [494]:
# Hyper-parameters for SimpleCNN, followed by 20 epochs of training with batch size 160.
# NOTE(review): SimpleCNN only appears as commented-out code in this notebook, so this
# cell needs that definition restored before it can run on a fresh kernel.
args = {
    'kernel_num': [100,70, 60, 60],
    'dropout': 0.5,
    'fc1_out': 80,
    'fc2_out': 60,
    'fc3_out': 1,
    'input_max_length':30,
    'k_max' : 5
}
model = SimpleCNN(args)
train(model, train_set, 20, 160, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.468079
integrated    0.468079    1.000000
1
            endogenous  integrated
endogenous    1.000000    0.434543
integrated    0.434543    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.453665
integrated    0.453665    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.501382
integrated    0.501382    1.000000
4
            endogenous  integrated
endogenous     1.00000     0.56433
integrated     0.56433     1.00000
5
            endogenous  integrated
endogenous    1.000000    0.598452
integrated    0.598452    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.613061
integrated    0.613061    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.630376
integrated    0.630376    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.620186
integrated    0.620186    1.000000
9
            

In [495]:
# Run 5 more epochs on `model` with batch size 100 on samples 0..12832 of train_set.
train(model, train_set, 5, 100, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.637797
integrated    0.637797    1.000000
1
            endogenous  integrated
endogenous    1.000000    0.644123
integrated    0.644123    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.637111
integrated    0.637111    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.634417
integrated    0.634417    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.652041
integrated    0.652041    1.000000


In [496]:
# Run 10 epochs on `model` with batch size 100 on samples 0..12832 of train_set.
train(model, train_set, 10, 100, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous      1.0000      0.6534
integrated      0.6534      1.0000
1
            endogenous  integrated
endogenous    1.000000    0.648365
integrated    0.648365    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.646414
integrated    0.646414    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.664277
integrated    0.664277    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.669432
integrated    0.669432    1.000000
5
            endogenous  integrated
endogenous     1.00000     0.67837
integrated     0.67837     1.00000
6
            endogenous  integrated
endogenous    1.000000    0.656076
integrated    0.656076    1.000000
7
            endogenous  integrated
endogenous     1.00000     0.65595
integrated     0.65595     1.00000
8
            endogenous  integrated
endogenous    1.000000    0.655068
integrated    0.655068    1.000000
9
            

In [480]:
# Run 10 epochs on `model` with batch size 100 on samples 0..12832 of train_set.
# NOTE(review): the execution count (480) is lower than that of the cells above, so
# this output comes from an earlier run.
train(model, train_set, 10, 100, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous    1.000000    0.612601
integrated    0.612601    1.000000
1
            endogenous  integrated
endogenous    1.000000    0.612393
integrated    0.612393    1.000000
2
            endogenous  integrated
endogenous     1.00000     0.60788
integrated     0.60788     1.00000
3
            endogenous  integrated
endogenous    1.000000    0.609152
integrated    0.609152    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.614974
integrated    0.614974    1.000000
5
            endogenous  integrated
endogenous    1.000000    0.618713
integrated    0.618713    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.615899
integrated    0.615899    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.603946
integrated    0.603946    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.611355
integrated    0.611355    1.000000
9
            

# Correlation

In [45]:
import pandas as pd

In [46]:
# Load (sequence, score) pairs from the table3 file; the score is labelled
# 'endogenous' in the correlation tables.
endo = load_data("../dataset/table3")



In [47]:
# Encode the table3 sequences over 30 positions, as for the training data.
endo_dataset = DNADataset(endo,30)


In [48]:
def endo_scores(dataset, net=None):
    """Return the model's predictions for every sample in ``dataset``.

    Args:
        dataset: dataset yielding ``(input, target)`` pairs; the targets are
            ignored.
        net: module to evaluate. Defaults to the notebook-level ``model``, which
            is what the original single-argument form always used.

    Returns:
        A flat list with one predicted value per sample, in dataset order.

    The module is put in eval mode for the duration of the call, so dropout
    does not affect the scores, and its previous mode is restored afterwards.
    No gradients are recorded.
    """
    if net is None:
        net = model
    escores = []
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for inp, _ in DataLoader(dataset, batch_size=100):
                # Add the channel axis the model expects, then flatten the predictions.
                out = net(inp.unsqueeze(1)).view(-1)
                escores.extend(out.detach().cpu().numpy())
    finally:
        net.train(was_training)
    return escores

In [49]:
# Predicted score for every table3 sequence, using the current `model`.
escores = endo_scores(endo_dataset)

In [451]:
# One row per table3 sequence: the score from the file ('endogenous') next to the
# model prediction ('integrated').
combined_df = pd.DataFrame([[endo[i][0], endo[i][1], escores[i]] for i in range(len(endo))], columns=['seq', 'endogenous', 'integrated'] )

In [452]:
# Spearman rank correlation between the file scores and the model predictions.
combined_df[['endogenous', 'integrated']].corr(method='spearman')

Unnamed: 0,endogenous,integrated
endogenous,1.0,0.421199
integrated,0.421199,1.0
