In [1]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from collections import defaultdict
import pandas as pd
import tensorwatch

In [2]:
# from tqdm import tqdm

In [3]:
# with tqdm(total=int(len())) as pbar:

In [2]:
class DNADataset(Dataset):
    """Dataset of fixed-length, one-hot encoded DNA sequences with a float target each.

    ``data`` is a sequence of records whose element 0 is either a nucleotide
    string or an already-encoded ``(size, 4)`` array/tensor, and whose element 1
    is the numeric target.  Strings are encoded case-insensitively with the
    column order ``a, c, g, t``; any other character leaves its row all-zero.

    Attributes:
        size: number of sequence positions that are encoded.
        x: ``FloatTensor`` of shape ``(len(data), size, 4)``.
        y: ``FloatTensor`` of shape ``(len(data),)``.
    """

    # Column of the one-hot vector used for each (lower-cased) base.
    _BASE_INDEX = {"a": 0, "c": 1, "g": 2, "t": 3}

    def __init__(self, data, size):
        super(DNADataset, self).__init__()
        self.size = size
        encoded = np.zeros((len(data), size, 4))
        targets = []

        for row, record in enumerate(data):
            seq = record[0]
            targets.append(record[1])
            # Decide per record (not from the first record only) whether the
            # sequence still needs encoding.
            if isinstance(seq, str):
                # Lower-case once; strings longer than `size` are truncated and
                # shorter ones are left zero-padded instead of raising.
                for col, base in enumerate(seq.lower()[:size]):
                    pos = self._BASE_INDEX.get(base)
                    if pos is not None:
                        encoded[row, col, pos] = 1
            else:
                # Already encoded: copy it in as-is.
                encoded[row] = seq

        self.x = torch.FloatTensor(encoded)
        self.y = torch.FloatTensor(targets)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def get_subset(self, start, end):
        """Return a new DNADataset holding samples ``start`` (inclusive) to ``end`` (exclusive)."""
        # Targets are handed over as plain floats so the constructor never has
        # to build a tensor out of a list of 0-d tensors.
        return DNADataset(
            [(self.x[i], float(self.y[i])) for i in range(start, end)],
            self.size,
        )

In [3]:
# Read (sequence, score) records from a whitespace-delimited text file.
def load_data(file):
    """Read ``(sequence, score)`` pairs from a whitespace-delimited text file.

    Each non-blank line must contain at least two fields: the first is kept as
    a string, the second is converted with ``float``.  Extra fields are ignored.

    Args:
        file: path of the file to read.

    Returns:
        list of ``(str, float)`` tuples in file order.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: the second field is not a valid float.
    """
    data = []
    with open(file, "r") as f:
        for line in f:
            fields = line.split()
            # Blank / whitespace-only lines (e.g. a trailing newline at the end
            # of the file) carry no record; skip them instead of crashing.
            if not fields:
                continue
            data.append((fields[0], float(fields[1])))
    return data

In [4]:
# Build the datasets used below; every sequence is encoded to 30 positions.
# Text-file datasets: each is parsed by load_data and one-hot encoded to 30 positions.
train_set = DNADataset(load_data("../dataset/train_txt"), 30)
test_set = DNADataset(load_data("../dataset/test_txt"), 30)
endo = load_data("../dataset/table3")
endo_dataset = DNADataset(endo, 30)

# Excel datasets: sheets 1 and 2 of the workbook; the first column holds the
# sequence and the second column the score.
test_1a_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[1])[1]
test_1a = DNADataset(
    [
        (seq, float(score))
        for seq, score in zip(
            test_1a_df[test_1a_df.columns[0]].values,
            test_1a_df[test_1a_df.columns[1]].values,
        )
    ],
    30,
)
test_1b_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[2])[2]
test_1b = DNADataset(
    [
        (seq, float(score))
        for seq, score in zip(
            test_1b_df[test_1b_df.columns[0]].values,
            test_1b_df[test_1b_df.columns[1]].values,
        )
    ],
    30,
)

In [5]:
class BasicModule(nn.Module):
    """nn.Module base class that records its class name and adds save/load helpers."""

    def __init__(self):
        super().__init__()
        # String form of the concrete class, e.g. "<class '...SimpleCNN'>".
        self.model_name = str(self.__class__)

    def load(self, path):
        """Restore this module's parameters from the state dict stored at ``path``."""
        state = torch.load(path)
        self.load_state_dict(state)

    def save(self, path):
        """Write this module's state dict to ``path``."""
        state = self.state_dict()
        torch.save(state, path)

In [45]:
class SimpleCNN(BasicModule):
    """Multi-branch CNN regressor over one-hot encoded sequences.

    Four parallel ``Conv2d`` branches with kernel heights 3, 5, 7 and 9 (each
    spanning all 4 one-hot columns) are followed by ReLU and a stride-2 max
    pool along the sequence axis.  The flattened branch outputs are
    concatenated and passed through three fully connected layers with dropout.

    ``args`` keys read here:
        kernel_num: output channels of each conv branch (at least 4 entries).
        dropout: dropout probability.
        fc1_out, fc2_out, fc3_out: output widths of the three linear layers.
        k_max: stored as ``self.k``; not used by the forward pass.
        input_max_length: optional sequence length, defaults to 30.
    """

    def __init__(self, args):
        super(SimpleCNN, self).__init__()
        # output size of fc layers
        out1 = args['fc1_out']
        out2 = args['fc2_out']
        out3 = args['fc3_out']  # output layer
        self.params = args
        self.k = args['k_max']
        self.model_type = "CNN"
        # Sequence length drives both the input reshape and the fc1 width;
        # 30 is the value that used to be hard-coded.
        self.seq_len = args.get('input_max_length', 30)

        self.kernel_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])

        # convolution layers (attribute names kept so saved state dicts still load)
        filter_sizes = [3, 5, 7, 9]
        self.conv1 = nn.Conv2d(1, self.kernel_num[0], (filter_sizes[0], 4), bias=True, padding=0)
        self.conv2 = nn.Conv2d(1, self.kernel_num[1], (filter_sizes[1], 4), bias=True, padding=0)
        self.conv3 = nn.Conv2d(1, self.kernel_num[2], (filter_sizes[2], 4), bias=True, padding=0)
        self.conv4 = nn.Conv2d(1, self.kernel_num[3], (filter_sizes[3], 4), bias=True, padding=0)

        # fc layers
        # An unpadded conv of height h leaves seq_len - h + 1 rows, and the
        # stride-2 pool halves that (floor).  The floor division is applied to
        # the row count *before* multiplying by the channel count, so the width
        # is also right when the row count is odd.
        pooled_rows = [(self.seq_len - h + 1) // 2 for h in filter_sizes]
        fc1_in = sum(channels * rows for channels, rows in zip(self.kernel_num, pooled_rows))
        self.fc1 = nn.Linear(fc1_in, out1, bias=True)
        self.fc2 = nn.Linear(out1, out2, bias=True)
        self.fc3 = nn.Linear(out2, out3, bias=False)

    def _conv_and_pool(self, x, conv):
        """Apply ``conv`` -> ReLU -> max pool (2, 1) and flatten to (batch, features)."""
        # x: (batch, 1, seq_len, 4)
        x = F.relu(conv(x))
        # x: (batch, channels, H_out, 1)
        x = F.max_pool2d(x, kernel_size=(2, 1), stride=(2, 1))
        # x: (batch, channels, H_out // 2, 1) -> (batch, channels * (H_out // 2))
        return x.view(x.size(0), x.size(1) * x.size(2))

    def setDropout(self, dropout):
        """Set the dropout probability used by the forward pass.

        The existing ``nn.Dropout`` is updated in place (as SimpleLSTM does)
        rather than replaced, so the module keeps its current train/eval mode.

        Raises:
            ValueError: ``dropout`` is outside ``[0, 1]``.
        """
        if dropout < 0 or dropout > 1:
            raise ValueError(
                "dropout probability has to be between 0 and 1, but got {}".format(dropout))
        self.dropout.p = dropout

    def forward(self, x):
        """Return predictions of shape (batch, fc3_out) for one-hot input ``x``.

        ``x`` may have any shape that reshapes to ``(batch, 1, seq_len, 4)``.
        """
        x = x.view(x.size(0), 1, self.seq_len, 4)
        branches = [
            self._conv_and_pool(x, conv)
            for conv in (self.conv1, self.conv2, self.conv3, self.conv4)
        ]

        x = torch.cat(branches, 1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)

        return self.fc3(x)

In [183]:
def train(model, train_dataset, epochs, batch_size, val_set, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    """Train ``model`` with Adam + MSE loss and report a Spearman correlation each epoch.

    Args:
        model: module exposing ``model_type``, ``params['dropout']`` and
            ``setDropout``; models whose ``model_type`` is ``"LSTM"`` must
            also provide ``init_hidden(batch_size)``.
        train_dataset: dataset providing ``get_subset(start, end)``.
        epochs: number of passes over the training split.
        batch_size: mini-batch size.
        val_set: dataset scored with ``test_scores`` after every epoch.
        cv: fraction at the tail of the subset that is excluded from training.
            The excluded samples are only held out; no validation loss is
            computed on them.
        learning_rate: Adam learning rate.
        start, end: half-open index range of ``train_dataset`` to use.  An
            ``end`` of 0 or less (the default) means "up to the end of the
            dataset" rather than an empty range.

    Prints the training sample count, then per epoch the epoch index and the
    Spearman correlation between ``val_set`` targets and model predictions.
    """
    if end <= 0:
        end = len(train_dataset)
    subset = train_dataset.get_subset(start, end)
    # An integer count, so the sampler indices are exact rather than the result
    # of np.arange over a float bound.
    n_training_samples = int(round(len(subset) * (1 - cv)))
    train_loader = torch.utils.data.DataLoader(
        subset,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(np.arange(n_training_samples, dtype=np.int64)),
        num_workers=0,
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    print("Train %s samples."%n_training_samples)

    for e in range(epochs):
        model.train()
        model.setDropout(model.params['dropout'])
        for inp, scores in train_loader:
            if model.model_type == "LSTM":
                # Size the hidden state from the actual batch: the last batch
                # of an epoch can be smaller than batch_size.
                model.init_hidden(inp.size(0))
            out = model(inp.unsqueeze(1)).view(-1)
            loss = criterion(out, scores.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The loss tensor is deliberately not accumulated: summing it kept
            # every batch's autograd graph alive for the whole epoch.

        # Per-epoch evaluation with dropout disabled.
        model.setDropout(0.0)
        print(e)
        tscores = test_scores(model, val_set, 400)
        corr_frame = pd.DataFrame({
            'endogenous': [float(val_set[i][1]) for i in range(len(tscores))],
            'integrated': [float(score) for score in tscores],
        })
        corr = corr_frame.corr(method='spearman')['endogenous']['integrated']
        print(corr)
        if corr >= 0.68:
            print("Saving.")
            # Checkpointing is currently disabled; to enable it, e.g.:
            # model.save("../model/%s_%s.model" % (model.model_type, round(corr, 3)))

In [46]:
# Hyper-parameters handed to SimpleCNN.
args = dict(
    kernel_num=[90, 60, 40, 110],  # output channels of the four conv branches
    dropout=0.15,
    fc1_out=80,
    fc2_out=40,
    fc3_out=1,  # width of the output layer
    input_max_length=30,
    k_max=1,
)
model = SimpleCNN(args)

In [47]:
# Train the CNN for 5 epochs on the first 12832 training samples (no hold-out).
train(
    model,
    train_set,
    epochs=5,
    batch_size=120,
    val_set=endo_dataset,
    cv=0.0,
    learning_rate=0.003,
    start=0,
    end=12832,
)

Train 12832.0 samples.
0
0.5747159716758458
1
0.6221117230527144
2
0.6281730920535011
3
0.6455137686860739
4
0.6514177812745869


In [10]:
# model(train_set.x[:120].view(120,30,4))

In [635]:
class SimpleLSTM(BasicModule):
    """LSTM regressor: LSTM -> k-max pooling over time -> dropout -> FC -> BN -> ReLU -> FC.

    ``args`` keys read here:
        dropout: probability for the dropout layer (and for LSTM inter-layer
            dropout when ``num_layers > 1``).
        num_features: width of each time step of the input.
        hidden_size, num_layers, bidirectional: passed to ``nn.LSTM``.
        k: number of time steps kept per channel by k-max pooling.
        out1, out2: output widths of the two linear layers.
    """

    def __init__(self, args):
        super(SimpleLSTM, self).__init__()
        self.dropout = nn.Dropout(args['dropout'])
        self.params = args
        self.model_type = "LSTM"
        num_layers = args['num_layers']
        self.lstm = nn.LSTM(input_size=args['num_features'],
                            hidden_size=args['hidden_size'],
                            num_layers=num_layers,
                            batch_first=True,
                            bias=True,
                            # nn.LSTM applies dropout only between stacked
                            # layers; passing a non-zero value for one layer
                            # has no effect and only triggers a warning.
                            dropout=args['dropout'] if num_layers > 1 else 0.0,
                            bidirectional=args['bidirectional'])
        directions = 2 if self.params['bidirectional'] else 1
        # After k-max pooling the features are (hidden_size * directions) channels x k steps.
        self.fc1 = nn.Linear(args['hidden_size'] * directions * args['k'], args['out1'], bias=True)
        self.fc2 = nn.Linear(args['out1'], args['out2'], bias=True)
        self.bn = nn.BatchNorm1d(args['out1'])
        self.relu = nn.ReLU(inplace=True)
        self.hidden = None

    def init_hidden(self, batch_size):
        """Reset the stored (h, c) state to zeros for a batch of ``batch_size``."""
        directions = 2 if self.params['bidirectional'] else 1
        shape = (self.params['num_layers'] * directions, batch_size, self.params['hidden_size'])
        # Allocate with the dtype/device of the model's own parameters.
        reference = next(self.parameters())
        self.hidden = (reference.new_zeros(shape), reference.new_zeros(shape))

    def forward(self, x):
        """Return predictions of shape (batch, out2).

        ``x`` may have any shape that reshapes to ``(batch, steps, num_features)``.
        """
        batch_size = x.size(0)
        x = x.view(batch_size, -1, self.params['num_features'])

        hidden = self.hidden
        # A state left over from a batch of a different size cannot be fed to
        # the LSTM; fall back to the default zero state in that case.
        if hidden is not None and hidden[0].size(1) != batch_size:
            hidden = None
        x, hidden = self.lstm(x, hidden)
        # Keep the state without its autograd history.
        self.hidden = (hidden[0].detach(), hidden[1].detach())

        # (batch, steps, channels) -> (batch, channels, steps), then keep the k
        # largest activations of every channel in their original time order.
        out = torch.transpose(x, 1, 2)
        out = self.kmax_pooling(out, 2, self.params['k'])
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.bn(out)
        out = self.relu(out)
        out = self.fc2(out)

        return out

    def setDropout(self, dropout):
        """Set the probability of the dropout layer applied before ``fc1``.

        Raises:
            ValueError: ``dropout`` is outside ``[0, 1]``.
        """
        if dropout < 0 or dropout > 1:
            raise ValueError(
                "dropout probability has to be between 0 and 1, but got {}".format(dropout))
        self.dropout.p = dropout

    def kmax_pooling(self, x, dim, k):
        """Keep the ``k`` largest values along ``dim``, preserving their original order."""
        # topk gives the positions of the k largest values; sorting those
        # positions restores the order in which they occur in x.
        index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
        return x.gather(dim, index)

In [636]:
# class SimpleLSTMCNN(BasicModule):
#     def __init__(self,args):
#         super(SimpleLSTMCNN, self).__init__()
#         self.dropout = nn.Dropout(args['dropout'])
#         self.params = args
#         self.model_type = "LSTM"
#         self.conv1 = nn.Conv1d(args['hidden_size']*2, args['out_channels'], 1, padding=0, bias=True)
# #         self.conv2 = nn.Conv2d(args['out_channels'], args['out_channels']//2, (3, 1), padding=(1,0), bias=True)
#         self.lstm = nn.LSTM(input_size=args['num_features'], 
#                             hidden_size=args['hidden_size'], 
#                             num_layers=args['num_layers'],
#                             batch_first=True, 
#                             bias=True,
#                             dropout=args['dropout'], 
#                             bidirectional=args['bidirectional'])
#         N = 2 if self.params['bidirectional'] else 1
#         self.fc1 = nn.Linear(2500, args['out1'], bias=True)
#         self.fc2 = nn.Linear(args['out1'], args['out2'], bias=True)
#         self.bn = nn.BatchNorm1d(args['out1'])
#         self.relu = nn.ReLU(inplace=True)
#         self.hidden = None
        
#     def init_hidden(self, batch_size):
#         N = 2 if self.params['bidirectional'] else 1
#         hidden = (Variable(torch.zeros(self.params['num_layers']*N, batch_size, self.params['hidden_size'])),
#                   Variable(torch.zeros(self.params['num_layers']*N, batch_size, self.params['hidden_size'])))
#         self.hidden = hidden
        
    
#     def forward(self, x):
#         batch_size = x.size(0)
#         x = x.view(batch_size, 30, 4)
# #         x = self.conv1(x)
# #         x = self.relu(x)
# #         x = self.conv2(x).squeeze(3)
# #         x = self.relu(x)
# #         x = torch.transpose(x, 1, 2)
# #         
#         x, self.hidden = self.lstm(x, self.hidden)
# #         print(x.shape)
#         self.hidden = (Variable(self.hidden[0]), Variable(self.hidden[1]))
# #         print(lstm_out.shape)
# # #         masker = torch.Tensor([[[0]*29+[1]]*30] * batch_size)
# #         # stack up lstm outputs
# # #         lstm_out = lstm_out[:, -1:, :].contiguous().view(batch_size, -1)
# #         lstm_out = torch.tanh(lstm_out)
# #         lstm_out = torch.transpose(lstm_out, 1, 2)
# #         
# # #         print(lstm_out.shape)
# #         out = self.kmax_pooling(x, 1, self.params['k'])
#         out = torch.transpose(x, 1, 2)
# #         print(out.shape)
#         out = self.kmax_pooling(out, 2, self.params['k'])
# #         print(out.shape)
# #         out = F.max_pool1d(out,2, stride=2)
# #         out = torch.transpose(out, 1, 2)
# #         print(out.shape)
#         out = self.conv1(out)
# #         out = self.bn(out)
#         out = self.relu(out)
# #         print(out.shape)
#         out = out.view(out.size(0), -1)
#         out = self.dropout(out)
#         out = self.fc1(out)
#         out = self.bn(out)
#         out = self.relu(out)
#         out = self.fc2(out)

        
#         return out
    
#     def setDropout(self, dropout):
#         self.dropout.p = dropout
        
#     def kmax_pooling(self, x, dim, k):
#         index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
#         return x.gather(dim, index)

In [637]:
# Hyper-parameters handed to SimpleLSTM.
lstm_args = dict(
    seq_length=30,
    out_channels=100,
    bidirectional=True,
    num_features=4,  # width of one time step (one-hot base)
    num_layers=1,
    hidden_size=25,
    dropout=0.0,
    out1=80,
    out2=1,  # width of the output layer
    k=25,  # time steps kept by k-max pooling
)
lstm_model = SimpleLSTM(lstm_args)


In [638]:
# Forward pass on the first 120 training samples, reshaped to (120, 1, 30, 4).
lstm_out = lstm_model(
    train_set.x[:120].view(120, 1, 30, 4)
)

In [639]:
# h = model.init_hidden(3)
# model.init_hidden(120)
# out = model(train_set.x[:120].view(120,30,4))

In [640]:
# Train the LSTM for 20 epochs on the first 12832 training samples (no hold-out).
train(
    lstm_model,
    train_set,
    epochs=20,
    batch_size=401,
    val_set=endo_dataset,
    cv=0.0,
    learning_rate=0.003,
    start=0,
    end=12832,
)

Train 12832.0 samples.
0
0.4178505114083399
1
0.5606294256490952
2
0.5444846577498033
3
0.5322171518489378
4
0.5285098347757671
5
0.5438048780487805
6
0.5656333595594021
7
0.556456333595594
8
0.54360346184107
9
0.5635184893784422
10
0.5748796223446105
11
0.5922077104642014
12
0.5970857592446892
13
0.5779952793076318
14
0.5640472069236822
15
0.6070055074744296
16
0.5852777340676633
17
0.57533910306845
18
0.6107380015735642
19
0.5871156569630213


In [145]:
# Run a model over a dataset and collect its predictions.
def test_scores(model, test_set, batch_size):
    tscores = []
    for inp, scores in DataLoader(test_set, batch_size=batch_size):
        model.zero_grad()
        if model.model_type == "LSTM":
#                 print("INIT LSTM")
            model.init_hidden(len(inp))
        inp = inp.unsqueeze(1)
        inp = Variable(inp)
        out = model(inp).squeeze()
        for v in out.detach().numpy():
            tscores.append(v)
    return tscores

In [406]:
def check_corr(model, test_set):
    """Print the Spearman correlation matrix of true scores vs. model predictions.

    Dropout is set to 0.0 on ``model`` before scoring ``test_set`` in batches of 400.
    """
    model.setDropout(0.0)
    predicted = test_scores(model, test_set, 400)

    # One row per scored sample: encoded sequence, true score, predicted score.
    rows = []
    for idx, prediction in enumerate(predicted):
        sample = test_set[idx]
        rows.append([sample[0], float(sample[1]), prediction])

    frame = pd.DataFrame(rows, columns=['seq', 'endogenous', 'integrated'])
    corr = frame[['endogenous', 'integrated']].corr(method='spearman')
    print(corr)

In [491]:
# Spearman correlation of the trained LSTM on the test set.
check_corr(model=lstm_model, test_set=test_set)

            endogenous  integrated
endogenous    1.000000    0.767798
integrated    0.767798    1.000000
