In [1]:
import sys
# Display the path of the Python interpreter this kernel is running on.
sys.executable

'C:\\ProgramData\\Anaconda3\\python.exe'

In [2]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from collections import defaultdict


# Models

In [3]:
class BasicModule(nn.Module):
    """Base ``nn.Module`` that records its class name and can save/load its state dict."""

    def __init__(self):
        super().__init__()
        # String form of the concrete class of this instance.
        self.model_name = str(self.__class__)

    def save(self, path):
        """Serialize this module's state dict to ``path`` with ``torch.save``."""
        state = self.state_dict()
        torch.save(state, path)

    def load(self, path):
        """Read a state dict from ``path`` with ``torch.load`` and apply it to this module."""
        state = torch.load(path)
        self.load_state_dict(state)

In [4]:
# -*- coding: utf-8 -*-


class SimpleCNN(BasicModule):
    """Multi-width CNN over ``(seq_len, 4)`` inputs followed by three linear layers.

    Five parallel convolutions with kernel heights 3, 5, 7, 9 and 11 (each
    spanning the full input width of 4) are applied to the same input. Every
    feature map goes through ReLU and k-max pooling; the pooled features are
    concatenated and passed through ``fc1 -> fc2 -> fc3`` with dropout before
    each of ``fc1``, ``fc2`` and ``fc3``.

    Args:
        args: mapping with the keys
            ``'fc1_out'``, ``'fc2_out'``, ``'fc3_out'``: output sizes of the
                three linear layers (``fc3`` is the output layer);
            ``'k_max'``: number of values kept per channel by k-max pooling;
            ``'kernel_num'``: exactly four ints, the output channels of
                ``conv1`` .. ``conv4`` (``conv5`` always has 100 channels);
            ``'dropout'``: dropout probability;
            ``'input_max_length'`` (optional, default 30): number of rows
                ``forward`` reshapes each sample to.

    Raises:
        ValueError: if ``k_max`` exceeds the height of the shortest feature
            map (``input_max_length - 11 + 1``), which would otherwise only
            fail inside ``topk`` during ``forward``.
    """

    def __init__(self,args):
        super(SimpleCNN, self).__init__()
        # output size of fc layers
        out1 = args['fc1_out']
        out2 = args['fc2_out']
        out3 = args['fc3_out'] # output layer
        self.params = args
        self.k = args['k_max']  # for k max pooling
        # Rows per sample; 30 is the value that used to be hard-coded in forward().
        self.seq_len = args['input_max_length'] if 'input_max_length' in args else 30

        self.kernel_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])

        # The tallest kernel (height 11) yields the shortest feature map;
        # k-max pooling needs at least k values along that axis.
        shortest_map = self.seq_len - 11 + 1
        if self.k > shortest_map:
            raise ValueError(
                "k_max=%s is larger than the shortest feature map (%s rows)"
                % (self.k, shortest_map)
            )

        # convolution layers: one input channel, kernels span the full width (4)
        self.conv1 = nn.Conv2d(1, self.kernel_num[0], (3, 4), bias=True)
        self.conv2 = nn.Conv2d(1, self.kernel_num[1], (5, 4), bias=True)
        self.conv3 = nn.Conv2d(1, self.kernel_num[2], (7, 4), bias=True)
        self.conv4 = nn.Conv2d(1, self.kernel_num[3], (9, 4), bias=True)
        self.conv5 = nn.Conv2d(1, 100, (11, 4), bias=True)

        # fc layers; fc1 consumes k values for every channel of all five convolutions
        self.fc1 = nn.Linear((sum(self.kernel_num)+100)*self.k, out1, bias=True)
        self.fc2 = nn.Linear(out1, out2, bias=True)
        self.fc3 = nn.Linear(out2, out3, bias=True)

    def _conv_and_pool(self, x, conv):
        """Apply ``conv``, ReLU and k-max pooling; return ``(batch, channels * k)``."""
        # x: (batch, 1, seq_len, 4)
        x = conv(x)
        # x: (batch, channels, H_out, 1) -> drop the width axis
        x = F.relu(x.squeeze(3))
        # x: (batch, channels, H_out) -> keep the k largest values per channel
        x = self.kmax_pooling(x, 2, k=self.k)
        # flatten to (batch, channels * k)
        return x.view(x.size(0), x.size(1) * x.size(2))

    def setDropout(self, dropout):
        """Set the dropout probability used by ``forward``.

        The existing ``nn.Dropout`` module is updated in place so that its
        train/eval mode is preserved; building a new module would always
        start in training mode.

        Raises:
            ValueError: if ``dropout`` is outside ``[0, 1]`` (the same check
                ``nn.Dropout`` performs on construction).
        """
        if dropout < 0 or dropout > 1:
            raise ValueError(
                "dropout probability has to be between 0 and 1, but got {}".format(dropout)
            )
        self.dropout.p = dropout

    def forward(self, x):
        """Return the ``fc3`` output of shape ``(batch, fc3_out)``.

        ``x`` may have any shape holding ``seq_len * 4`` values per sample;
        it is reshaped to ``(batch, 1, seq_len, 4)``.
        """
        x = x.reshape(x.size(0), 1, self.seq_len, 4)
        pooled = [
            self._conv_and_pool(x, conv)
            for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5)
        ]

        # (batch, (sum(kernel_num) + 100) * k)
        x = torch.cat(pooled, 1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        return self.fc3(x)

    def kmax_pooling(self, x, dim, k):
        """Keep the ``k`` largest values along ``dim``, in their original order."""
        # topk returns the positions of the largest values; sorting those
        # positions restores the order in which the values appear in x.
        index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
        return x.gather(dim, index)

In [5]:
class DNADataset(Dataset):
    """In-memory dataset of ``(size, 4)`` encoded sequences with one label each.

    Args:
        data: indexable collection of ``(sequence, label)`` entries. A ``str``
            sequence is one-hot encoded over the alphabet ``"acgt"``
            (case-insensitive); characters outside the alphabet, and
            positions past the end of a sequence shorter than ``size``, are
            left as all-zero rows. Any non-string sequence is copied into the
            sample's ``(size, 4)`` slot unchanged.
        size: number of positions encoded per sequence; longer strings are
            truncated to this length.

    Attributes:
        x: ``torch.DoubleTensor`` of shape ``(len(data), size, 4)``.
        y: list of labels, in the order of ``data``.
    """

    _ALPHABET = "acgt"

    def __init__(self, data, size):
        super(DNADataset, self).__init__()
        self.size = size
        self.x = np.zeros((len(data), size, 4))
        self.y = []

        for row in range(len(data)):
            seq = data[row][0]
            self.y.append(data[row][1])
            # Decide per entry (not from the first entry only) how to encode it.
            if isinstance(seq, str):
                # Lower-case once per sequence rather than once per position.
                seq = seq.lower()
                # one hot encoding
                for pos, base in enumerate(seq[:size]):
                    col = self._ALPHABET.find(base)
                    if col >= 0:
                        self.x[row][pos][col] = 1
            else:
                # Already encoded (e.g. a row taken from another DNADataset).
                self.x[row] = seq
        self.x = torch.DoubleTensor(self.x)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def get_subset(self, start, end):
        """Return a new DNADataset holding the samples ``start`` .. ``end - 1``."""
        return DNADataset([(self.x[i],self.y[i]) for i in range(start, end)], self.size)

In [6]:
#
def load_data(file):
    """Read a whitespace-separated text file into ``(str, float)`` pairs.

    Each non-blank line contributes one pair: its first field as a string
    and its second field converted with ``float``. Further fields are
    ignored and blank lines are skipped.

    Args:
        file: path of the text file to read.

    Returns:
        List of ``(first_field, float(second_field))`` tuples in file order.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not a valid float.
    """
    data = []
    with open(file,"r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            data.append((fields[0], float(fields[1])))
    return data

In [7]:
#
# Training and test datasets; every sequence is encoded over 30 positions.
train_set = DNADataset(data=load_data(file="../dataset/train_txt"), size=30)
test_set = DNADataset(data=load_data(file="../dataset/test_txt"), size=30)


In [15]:
def train(model, train_dataset, epochs, batch_size, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    """Train ``model`` with Adam and MSE loss on a slice of ``train_dataset``.

    The samples ``start`` .. ``end - 1`` are taken with
    ``train_dataset.get_subset``. The last ``int(len(subset) * cv)`` of them
    are held out (currently not evaluated); the rest are visited in random
    order every epoch. The model is converted to double precision.

    After every epoch dropout is set to 0, the epoch index is printed, and
    the Spearman correlation between the module-level ``endo`` scores and
    ``endo_scores(endo_dataset)`` is printed. Dropout is left at 0 when the
    function returns.

    NOTE: the per-epoch report depends on the module-level names ``endo``,
    ``endo_dataset``, ``endo_scores`` and ``pd``. ``endo_scores`` is called
    without a model argument, so it evaluates the module-level ``model``,
    which is not necessarily the ``model`` passed to this function.

    Args:
        model: network exposing ``setDropout(p)`` and ``params['dropout']``.
        train_dataset: dataset exposing ``get_subset(start, end)``.
        epochs: number of passes over the training samples.
        batch_size: mini-batch size for training.
        cv: fraction of the subset held out from training, in ``[0, 1]``.
        learning_rate: Adam learning rate.
        start: index of the first sample of the subset.
        end: index one past the last sample; ``0`` (the default) means "up to
            the end of ``train_dataset``" instead of an empty subset.

    Raises:
        ValueError: if ``cv`` is outside ``[0, 1]``.
    """
    if not 0.0 <= cv <= 1.0:
        raise ValueError("cv must be between 0 and 1, but got %s" % cv)
    if not end:
        end = len(train_dataset)

    subset = train_dataset.get_subset(start, end)
    # Integer sample counts: float bounds passed to np.arange made the
    # training and held-out index ranges overlap by one sample whenever
    # len(subset) * cv was not a whole number.
    n_val_samples = int(len(subset) * cv)
    n_training_samples = len(subset) - n_val_samples
    train_loader =torch.utils.data.DataLoader(subset, batch_size=batch_size,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, dtype=np.int64)
                                              ),
                                              num_workers=0)

    # Convert to double before the optimizer is built (inputs are DoubleTensors).
    model.double()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    print("Train %s samples."%n_training_samples)

    for e in range(epochs):
        # Re-enable the configured dropout for the training pass.
        model.setDropout(model.params['dropout'])
        for inp, scores in train_loader:
            # (batch, size, 4) -> (batch, 1, size, 4)
            inp = inp.unsqueeze(1)
            out = model(inp).view(-1)

            scores = torch.as_tensor(scores, dtype=torch.float64)
            loss = criterion(out, scores)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Disable dropout for the per-epoch evaluation.
        model.setDropout(0.0)
        print(e)
        # Run inference once per epoch rather than once per row of the table.
        predicted = endo_scores(endo_dataset)
        print(pd.DataFrame([[endo[i][0], endo[i][1], predicted[i]] for i in range(len(endo))], columns=['seq', 'endogenous', 'integrated'])\
    [['endogenous', 'integrated']].corr(method='spearman'))

# Correlation

In [9]:
import pandas as pd

In [16]:
# (sequence, score) rows from table3; the scores form the 'endogenous' column of the correlation tables.
endo = load_data(file="../dataset/table3")



In [17]:
def endo_scores(dataset, net=None, batch_size=100):
    """Return the network's prediction for every sample of ``dataset``.

    Inference runs in eval mode (dropout inactive) and without gradient
    tracking; the network's previous train/eval mode is restored afterwards.

    Args:
        dataset: dataset yielding ``(input, label)`` pairs; labels are ignored.
        net: network to evaluate. Defaults to the module-level ``model``,
            which is what this function always used before.
        batch_size: number of samples per forward pass.

    Returns:
        List with one predicted value per sample, in dataset order.
    """
    if net is None:
        net = model  # fall back to the notebook's global network

    was_training = net.training
    net.eval()
    escores = []
    try:
        with torch.no_grad():
            for inp, _ in DataLoader(dataset, batch_size=batch_size):
                # Insert the channel axis: (batch, ...) -> (batch, 1, ...)
                out = net(inp.unsqueeze(1)).reshape(-1)
                escores.extend(out.cpu().numpy())
    finally:
        # Leave the network in the mode the caller had it in.
        net.train(was_training)
    return escores

In [18]:
# Encode the table3 rows over 30 positions so the network can score them.
endo_dataset = DNADataset(data=endo, size=30)


# Training


In [23]:
# Hyper-parameters read by SimpleCNN.
args = {
    'kernel_num': [100,70, 60, 60],  # output channels of conv1..conv4
    'dropout': 0.5,                  # dropout probability during training
    'fc1_out': 80,                   # output size of the first linear layer
    'fc2_out': 60,                   # output size of the second linear layer
    'fc3_out': 1,                    # output layer size
    'input_max_length':30,           # sequence length, as used when building the datasets
    'k_max' : 5                      # values kept per channel by k-max pooling
}
# Create the network only if none exists yet: on a fresh kernel the cell
# would otherwise fail with NameError, while re-running it keeps training
# the existing model.
if 'model' not in globals():
    model = SimpleCNN(args)
train(model, train_set, epochs=20, batch_size=120, cv=0.0, start=0, end=12833)

Train 12833.0 samples.
0
            endogenous  integrated
endogenous     1.00000     0.59054
integrated     0.59054     1.00000
1
            endogenous  integrated
endogenous    1.000000    0.576734
integrated    0.576734    1.000000
2
            endogenous  integrated
endogenous    1.000000    0.598766
integrated    0.598766    1.000000
3
            endogenous  integrated
endogenous    1.000000    0.581077
integrated    0.581077    1.000000
4
            endogenous  integrated
endogenous    1.000000    0.591507
integrated    0.591507    1.000000
5
            endogenous  integrated
endogenous    1.000000    0.576218
integrated    0.576218    1.000000
6
            endogenous  integrated
endogenous    1.000000    0.606386
integrated    0.606386    1.000000
7
            endogenous  integrated
endogenous    1.000000    0.575941
integrated    0.575941    1.000000
8
            endogenous  integrated
endogenous    1.000000    0.593917
integrated    0.593917    1.000000
9
            

In [450]:
# Model predictions for every table3 sequence, in the order of `endo`.
escores = endo_scores(dataset=endo_dataset)

In [451]:
# One row per table3 entry: the sequence, its score from `endo`, and the model's score.
combined_df = pd.DataFrame({
    'seq': [entry[0] for entry in endo],
    'endogenous': [entry[1] for entry in endo],
    'integrated': [escores[idx] for idx in range(len(endo))],
})

In [452]:
# Spearman rank correlation between the table3 scores and the model's scores.
combined_df.loc[:, ['endogenous', 'integrated']].corr(method='spearman')

Unnamed: 0,endogenous,integrated
endogenous,1.0,0.421199
integrated,0.421199,1.0
