In [1]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from collections import defaultdict
import pandas as pd


In [2]:
from tqdm import tqdm

In [3]:
# with tqdm(total=int(len())) as pbar:

In [4]:
class SeqDataset(Dataset):
    """Dataset of nucleotide sequences encoded as integer token indices.

    Each item is ``(tokens, score)`` where ``tokens`` is one row of a
    LongTensor holding a per-base index and ``score`` is the raw label
    divided by 100.
    """

    # Token index per base; 'X' and 'N' both map to index 0.
    _BASE_TO_INDEX = {
        'X': 0,
        'N': 0,
        'A': 1,
        'C': 2,
        'G': 3,
        'T': 4
    }

    def __init__(self, data, size):
        """Encode ``data`` into tensors.

        Args:
            data: sequence of records whose element 0 is the base string and
                element 1 is the numeric score. All base strings must have
                the same length so they can be stacked into one tensor.
            size: nominal sequence length; stored on the instance but not
                used for encoding.

        Raises:
            KeyError: if a base is not one of X, N, A, C, G, T
                (case-insensitive).
        """
        super(SeqDataset, self).__init__()
        self.size = size
        match = self._BASE_TO_INDEX

        # Upper-case each base so lower-case input is accepted, as DNADataset does.
        encoded = [[match[base.upper()] for base in record[0]] for record in data]
        labels = [record[1] for record in data]

        self.x = torch.LongTensor(encoded)
        self.y = torch.FloatTensor(labels)/100

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    # return a subset of dataset of given range
    def get_subset(self, start, end):
        """Return a new SeqDataset holding items ``start`` (inclusive) to ``end`` (exclusive).

        The stored tensors are sliced directly instead of going back through
        ``__init__``: the rows are already encoded (re-encoding would fail on
        the base lookup) and the labels are already divided by 100.
        """
        subset = SeqDataset.__new__(SeqDataset)
        subset.size = self.size
        # clone() so the subset does not share storage with the parent dataset.
        subset.x = self.x[start:end].clone()
        subset.y = self.y[start:end].clone()
        return subset

In [89]:
class DNADataset(Dataset):
    """Dataset of nucleotide sequences as one-hot ``(size, 4)`` float matrices.

    Channels follow the order a, c, g, t. Any other character leaves its
    row all-zero. Labels are stored divided by 100.
    """

    def __init__(self, data, size):
        """Encode ``data`` into tensors.

        Args:
            data: sequence of records whose element 0 is either a base string
                or an already-encoded array-like assignable to a ``(size, 4)``
                slot, and whose element 1 is the numeric score.
            size: number of positions encoded per sequence. Longer strings
                are truncated; shorter ones are zero-padded at the end.
        """
        super(DNADataset, self).__init__()
        self.size = size
        self.x = np.zeros((len(data), size, 4))
        self.y = []

        for row, record in enumerate(data):
            sequence = record[0]
            self.y.append(record[1])
            # Decide per record, not from the first record only.
            if isinstance(sequence, str):
                # one hot encoding; lower-case once per sequence
                for position, base in enumerate(sequence[:size].lower()):
                    channel = "acgt".find(base)
                    if channel >= 0:
                        self.x[row][position][channel] = 1
            else:
                self.x[row] = sequence
        self.x = torch.FloatTensor(self.x)
        self.y = torch.FloatTensor(self.y) / 100

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    # return a subset of dataset of given range
    def get_subset(self, start, end):
        """Return a new DNADataset holding items ``start`` (inclusive) to ``end`` (exclusive).

        The stored tensors are sliced directly. Passing them back through
        ``__init__`` would divide the already-scaled labels by 100 again.
        """
        subset = DNADataset.__new__(DNADataset)
        subset.size = self.size
        # clone() so the subset does not share storage with the parent dataset.
        subset.x = self.x[start:end].clone()
        subset.y = self.y[start:end].clone()
        return subset

In [6]:
#
def load_data(file):
    """Read whitespace-separated ``sequence score`` records from a text file.

    Args:
        file: path of the file to read. Each non-blank line must have at
            least two whitespace-separated fields; fields after the second
            are ignored.

    Returns:
        list of ``(sequence, score)`` tuples, where ``sequence`` is the first
        field as a string and ``score`` is the second field as a float.

    Raises:
        IndexError: if a non-blank line has only one field.
        ValueError: if the second field is not a valid float.
    """
    data = []
    with open(file,"r") as f:
        for line in f:
            fields = line.split()
            # Skip blank lines (e.g. a trailing newline at end of file).
            if not fields:
                continue
            data.append((fields[0], float(fields[1])))
    return data

In [108]:
# Build the integer-token (SeqDataset) versions of the train / test / endogenous sets.
# The second constructor argument (30) is stored as the dataset's `size`.
# NOTE(review): the following cell rebinds train_set, test_set, endo and endo_dataset
# to DNADataset objects, so whichever of the two cells was executed last decides
# which encoding the later cells see.
train_set = SeqDataset(load_data("../dataset/train_txt"),30)
test_set = SeqDataset(load_data("../dataset/test_txt"),30)
endo = load_data("../dataset/table3")
endo_dataset = SeqDataset(endo,30)

In [90]:
# Build the one-hot (DNADataset) versions of the train / test / endogenous sets,
# encoding 30 positions per sequence.
# NOTE(review): these are the same names the SeqDataset cell above assigns;
# running this cell replaces those objects.
train_set = DNADataset(load_data("../dataset/train_txt"),30)
test_set = DNADataset(load_data("../dataset/test_txt"),30)
endo = load_data("../dataset/table3")
endo_dataset = DNADataset(endo,30)

# #
# test_1a_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[1])[1]
# test_1a = DNADataset([(test_1a_df[test_1a_df.columns[0]].values[i], float(test_1a_df[test_1a_df.columns[1]].values[i])) for i in range(len(test_1a_df)) ], 30)
# test_1b_df = pd.read_excel("../dataset/aax9249_table_s3.xlsx", sheet_name=[2])[2]
# test_1b = DNADataset([(test_1b_df[test_1b_df.columns[0]].values[i], float(test_1b_df[test_1b_df.columns[1]].values[i])) for i in range(len(test_1b_df)) ], 30)

In [8]:
class BasicModule(nn.Module):
    """nn.Module base class adding state-dict save/load helpers.

    Attributes:
        model_name: ``str(type(self))`` of the concrete class.
    """

    def __init__(self):
        super(BasicModule, self).__init__()
        self.model_name = str(type(self))

    def load(self, path):
        """Load parameters from a state dict previously written by ``save``.

        The checkpoint is deserialised onto the CPU first, so a file saved
        from a CUDA model can be loaded on a machine without a GPU.
        ``load_state_dict`` then copies the values into this module's
        existing parameters on whatever device they live on.
        """
        self.load_state_dict(torch.load(path, map_location="cpu"))

    def save(self, path):
        """Write this module's ``state_dict()`` to ``path``."""
        torch.save(self.state_dict(), path)

In [9]:
class SimpleCNN(BasicModule):
    """CNN regressor over one-hot sequences using four parallel conv branches.

    Each branch applies a ``(filter_size, 4)`` convolution, ReLU and a
    ``(2, 1)`` max-pool. The flattened branch outputs are concatenated and
    passed through three fully connected layers, with dropout before each
    of them.

    Expected keys in ``args``:
        kernel_num: output channels per branch (first four entries are used).
        dropout: dropout probability.
        fc1_out, fc2_out, fc3_out: sizes of the three linear layers.
        k_max: stored as ``self.k``; not used by ``forward``.
        input_max_length: sequence length (optional, defaults to 30).
    """

    def __init__(self,args):
        super(SimpleCNN, self).__init__()
        # output size of fc layers
        out1 = args['fc1_out']
        out2 = args['fc2_out']
        out3 = args['fc3_out'] # output layer
        self.params = args
        self.k = args['k_max']
        self.model_type = "CNN"
        # Sequence length used to reshape the input and to size fc1.
        self.seq_len = args.get('input_max_length', 30)

        self.kernel_num = args['kernel_num']
        self.dropout = nn.Dropout(args['dropout'])

        # convolution layers; attribute names are kept so existing checkpoints still load
        filter_sizes = [3, 5, 7, 9]
        self.conv1 = nn.Conv2d(1, self.kernel_num[0], (3, 4), bias=True, padding=0)
        self.conv2 = nn.Conv2d(1, self.kernel_num[1], (5, 4), bias=True, padding=0)
        self.conv3 = nn.Conv2d(1, self.kernel_num[2], (7, 4), bias=True, padding=0)
        self.conv4 = nn.Conv2d(1, self.kernel_num[3], (9, 4), bias=True, padding=0)

        # fc layers
        # A branch yields kernel_num channels of height (seq_len - filter + 1),
        # halved (floor) by the stride-2 pooling.
        flat_features = sum(
            channels * ((self.seq_len - width + 1) // 2)
            for channels, width in zip(self.kernel_num, filter_sizes)
        )
        self.fc1 = nn.Linear(flat_features, out1, bias=True)
        self.fc2 = nn.Linear(out1, out2, bias=True)
        self.fc3 = nn.Linear(out2, out3, bias=False)

    def _conv_and_pool(self, x, conv):
        """Apply ``conv`` -> ReLU -> (2, 1) max-pool and flatten per sample."""
        # x: (batch, 1, seq_len, 4)
        x = conv(x)
        # x: (batch, kernel_num, H_out, 1)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=(2,1), stride=(2,1))
        # flatten to (batch, kernel_num * pooled_height)
        x = x.view(x.size(0), x.size(1) * x.size(2))
        return x

    def setDropout(self, dropout):
        """Set the dropout probability used by ``forward``.

        The existing module's ``p`` is updated in place (as the LSTM models in
        this notebook do), so the layer keeps following the model's
        train/eval mode.
        """
        self.dropout.p = dropout

    def forward(self, x):
        """Return predictions of shape ``(batch, fc3_out)``.

        ``x`` must hold ``seq_len * 4`` values per sample; it is reshaped to
        ``(batch, 1, seq_len, 4)``.
        """
        x = x.view(x.size(0), 1, self.seq_len, 4)
        x1 = self._conv_and_pool(x, self.conv1)
        x2 = self._conv_and_pool(x, self.conv2)
        x3 = self._conv_and_pool(x, self.conv3)
        x4 = self._conv_and_pool(x, self.conv4)

        x = torch.cat((x1, x2, x3, x4), 1)
        x = self.dropout(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.dropout(x)

        return self.fc3(x)

    def params_to_str(self):
        """Describe the configuration as a file-name prefix.

        ``train`` calls this on the model when saving a checkpoint. The
        configured dropout from ``params`` is used rather than the live
        value, which ``setDropout`` may have changed.
        """
        return "cnn_kernels%s_drpout%s_fc%s-%s_" % (
            '-'.join(str(n) for n in self.kernel_num),
            self.params['dropout'],
            self.params['fc1_out'],
            self.params['fc2_out'])

In [42]:
def test_scores(model, test_set, batch_size):
    tscores = []
    for inp, scores in DataLoader(test_set, batch_size=batch_size):
        model.zero_grad()
        # inp = inp.unsqueeze(1)
        inp = Variable(inp)
        if torch.cuda.is_available():
            inp = inp.cuda()
        out = model(inp).squeeze()
        for v in out.cpu().detach().numpy():
            tscores.append(v)
    return tscores

def check_corr(model, test_set):
    """Spearman rank correlation between dataset labels and model predictions.

    Predictions are obtained with ``test_scores`` using a batch size of 400.
    Side effect: the model's dropout is set to 0.0 through ``setDropout`` and
    is not restored.
    """
    model.setDropout(0.0)
    predictions = test_scores(model, test_set, 400)

    rows = []
    for idx, predicted in enumerate(predictions):
        sample = test_set[idx]
        rows.append([sample[0], float(sample[1]), predicted])

    frame = pd.DataFrame(rows, columns=['seq', 'endogenous', 'integrated'])
    rank_matrix = frame[['endogenous', 'integrated']].corr(method='spearman')
    corr = rank_matrix['endogenous']['integrated']
    return corr

In [73]:
def train(model, train_dataset, epochs, batch_size, val_set, cv=0.1, learning_rate=0.001, start = 0, end = 0,
          test_dataset=None, endo_threshold=0.68, test_threshold=0.77):
    """Train ``model`` with Adam / MSE and stop early once both correlations are high enough.

    After every epoch the Spearman correlation is computed (via ``check_corr``)
    on ``val_set`` and on ``test_dataset``. When both reach their thresholds
    the model is saved and training stops.

    Args:
        model: module exposing ``params`` (with a ``'dropout'`` entry),
            ``setDropout``, ``params_to_str`` and ``save``.
        train_dataset: dataset yielding ``(input, score)`` pairs.
        epochs: maximum number of epochs.
        batch_size: training batch size.
        val_set: dataset for the first correlation check.
        cv: fraction of ``train_dataset`` (taken from the end) excluded from training.
        learning_rate: Adam learning rate.
        start, end: accepted for backward compatibility; not used.
        test_dataset: dataset for the second correlation check. Defaults to
            the notebook-level ``test_set``.
        endo_threshold: minimum ``val_set`` correlation required to save.
        test_threshold: minimum ``test_dataset`` correlation required to save.
    """
    if test_dataset is None:
        # Backward-compatible default: the notebook's global held-out set.
        test_dataset = test_set

    # Integer sample count, so the index range does not depend on float rounding.
    n_total = len(train_dataset)
    n_training_samples = n_total - int(round(n_total * cv))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=SubsetRandomSampler(np.arange(n_training_samples, dtype=np.int64)),
        num_workers=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # Follow the model's device instead of assuming CUDA is present.
    device = next(model.parameters()).device

    for e in range(epochs):
        model.train()
        model.setDropout(model.params['dropout'])
        for inp, scores in tqdm(train_loader):
            inp = inp.to(device)
            scores = scores.to(device)
            optimizer.zero_grad()
            # Match the label shape explicitly; a plain squeeze() would turn
            # a one-sample batch into a 0-dim tensor.
            out = model(inp).reshape(scores.shape)
            loss = criterion(out, scores)
            loss.backward()
            optimizer.step()

        model.setDropout(0.0)
        print(e)
        endo_score, test_score = (check_corr(model, val_set), check_corr(model, test_dataset))
        print(endo_score, test_score)
        if endo_score >= endo_threshold and test_score >= test_threshold:
            print("Saving.")
            # Encode the score with three decimals as an integer (0.688 -> 688).
            # Rounding after scaling avoids float truncation such as
            # int(0.69 * 100) == 68.
            model.save("%s_%s.model"%(model.params_to_str(), int(round(endo_score * 1000))))
            break

In [12]:
# Hyper-parameters for SimpleCNN: one kernel_num entry per conv branch.
args = {
    'kernel_num': [90, 60, 40, 110],
    'dropout': 0.15,
    'fc1_out': 80,
    'fc2_out': 40,
    'fc3_out': 1,
    'input_max_length':30,
    'k_max' : 1
}
# Use the GPU when one is present; otherwise keep the model on the CPU so the
# cell still runs.
model = SimpleCNN(args).to("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# train(model, train_set, 10 , 100, endo_dataset, cv=0.0, learning_rate=0.002, start=0, end=12832)

In [14]:
# model(train_set.x[:120].view(120,30,4))

In [84]:
class SimpleLSTM(BasicModule):
    """(Bi)LSTM regressor over per-position feature vectors (e.g. one-hot bases).

    Pipeline: LSTM -> k-max pooling along the sequence axis -> dropout ->
    fc1 -> BatchNorm -> ReLU -> fc2.

    Expected keys in ``args``:
        num_features: size of each per-position input vector.
        hidden_size, num_lstm_layers, bidirectional: LSTM configuration.
        dropout: probability for both the LSTM's ``dropout`` argument and
            the dropout layer before fc1.
        k: number of time steps kept per channel by k-max pooling (must not
            exceed the sequence length).
        out1, out2: output sizes of fc1 and fc2.
    """

    def __init__(self,args):
        super(SimpleLSTM, self).__init__()
        self.dropout = nn.Dropout(args['dropout'])
        self.params = args
        self.model_type = "LSTM"
        self.lstm = nn.LSTM(input_size=args['num_features'],
                            hidden_size=args['hidden_size'],
                            num_layers=args['num_lstm_layers'],
                            batch_first=True,
                            bias=True,
                            dropout=args['dropout'],
                            bidirectional=args['bidirectional'])
        # A bidirectional LSTM concatenates both directions along the feature axis.
        N = 2 if self.params['bidirectional'] else 1
        self.fc1 = nn.Linear(args['hidden_size'] * args['k']  * N, args['out1'], bias=True)
        self.fc2 = nn.Linear(args['out1'], args['out2'], bias=True)
        self.bn = nn.BatchNorm1d(args['out1'])
        self.relu = nn.ReLU(inplace=True)
        self.hidden = None

    def forward(self, x):
        """Return predictions of shape ``(batch, out2)``.

        ``x`` is reshaped to ``(batch, seq_len, num_features)``, with the
        sequence length inferred from the number of elements per sample.
        """
        batch_size = x.size(0)
        x = x.view(batch_size, -1, self.params['num_features'])
        out, _ = self.lstm(x)
        # out: (batch, seq_len, hidden_size * directions) -> (batch, channels, seq_len)
        out = torch.transpose(out, 1, 2)
        out = self.kmax_pooling(out, 2, self.params['k'])
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.bn(out)
        out = self.relu(out)
        out = self.fc2(out)

        return out

    def setDropout(self, dropout):
        """Set the probability on both the dropout layer and the LSTM's ``dropout`` attribute."""
        self.dropout.p = dropout
        self.lstm.dropout = dropout

    def kmax_pooling(self, x, dim, k):
        """Keep the ``k`` largest values along ``dim``, preserving their original order."""
        # Sort the top-k indices so the kept values stay in sequence order.
        index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
        return x.gather(dim, index)

    def params_to_str(self):
        """Describe the configuration as a file-name prefix.

        The dropout figure is taken from the configured ``params['dropout']``.
        ``self.dropout.p`` cannot be used because ``setDropout(0.0)`` is
        called before evaluation, and therefore before a checkpoint is saved.
        """
        return "%s_hidden%s_layer%s_drpout%s_fc%s_k%s_"%('bilstm' if self.params['bidirectional'] else 'lstm',
                                                  self.params['hidden_size'],
                                                  self.params['num_lstm_layers'],
                                                  int(self.params['dropout'] * 10),
                                                  self.params['out1'],
                                                  self.params['k'])

In [132]:
class UnsupLSTM(BasicModule):
    """(Bi)LSTM regressor over integer base tokens with a learned embedding.

    Pipeline: Embedding (5 tokens) -> LSTM -> k-max pooling along the sequence
    axis -> dropout -> fc1 -> BatchNorm -> ReLU -> fc2.

    Expected keys in ``args``:
        embed_dim: embedding width (optional, defaults to 2).
        hidden_size, num_lstm_layers, bidirectional: LSTM configuration.
        dropout: probability for both the LSTM's ``dropout`` argument and
            the dropout layer before fc1.
        k: number of time steps kept per channel by k-max pooling (must not
            exceed the sequence length).
        out1, out2: output sizes of fc1 and fc2.
    """

    def __init__(self,args):
        super(UnsupLSTM, self).__init__()
        self.dropout = nn.Dropout(args['dropout'])
        self.params = args
        self.model_type = "LSTM"
        # A bidirectional LSTM concatenates both directions along the feature axis.
        N = 2 if self.params['bidirectional'] else 1
        self.fc1 = nn.Linear(args['hidden_size'] * args['k']  * N, args['out1'], bias=True)
        self.fc2 = nn.Linear(args['out1'], args['out2'], bias=True)
        self.bn = nn.BatchNorm1d(args['out1'])
        self.relu = nn.ReLU(inplace=True)
        # Embedding weights are trainable by default.
        self.embed = nn.Embedding(5, args.get('embed_dim', 2))
        self.lstm = nn.LSTM(input_size=self.embed.weight.shape[1],
                            hidden_size=args['hidden_size'],
                            num_layers=args['num_lstm_layers'],
                            batch_first=True,
                            bias=True,
                            dropout=args['dropout'],
                            bidirectional=args['bidirectional'])

    def forward(self, x):
        """Return predictions of shape ``(batch, out2)``.

        ``x`` must contain integer token indices in ``[0, 5)``, since it is
        fed straight into the embedding layer.
        """
        x = self.embed(x)
        out, _ = self.lstm(x)
        # out: (batch, seq_len, hidden_size * directions) -> (batch, channels, seq_len)
        out = torch.transpose(out, 1, 2)
        out = self.kmax_pooling(out, 2, self.params['k'])
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.bn(out)
        out = self.relu(out)
        out = self.fc2(out)

        return out

    def setDropout(self, dropout):
        """Set the probability on both the dropout layer and the LSTM's ``dropout`` attribute."""
        self.dropout.p = dropout
        self.lstm.dropout = dropout

    def kmax_pooling(self, x, dim, k):
        """Keep the ``k`` largest values along ``dim``, preserving their original order."""
        # Sort the top-k indices so the kept values stay in sequence order.
        index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
        return x.gather(dim, index)

    def params_to_str(self):
        """Describe the configuration as a file-name prefix.

        The dropout figure is taken from the configured ``params['dropout']``.
        ``self.dropout.p`` cannot be used because ``setDropout(0.0)`` is
        called before evaluation, and therefore before a checkpoint is saved.
        """
        return "%s_hidden%s_layer%s_drpout%s_fc%s_k%s_"%('bilstm' if self.params['bidirectional'] else 'lstm',
                                                  self.params['hidden_size'],
                                                  self.params['num_lstm_layers'],
                                                  int(self.params['dropout'] * 10),
                                                  self.params['out1'],
                                                  self.params['k'])

In [140]:
# Inspect the embedding table shape: (5 tokens, embed_dim).
# NOTE(review): within this view `lstm_args` is first assigned in the following
# cell, so this cell depends on that one having been run beforehand.
UnsupLSTM(lstm_args).embed.weight.shape

torch.Size([5, 4])

In [139]:
# Hyper-parameters shared by SimpleLSTM and UnsupLSTM.
lstm_args = {
    'seq_length': 30,
    'embed_dim': 4,
    'bidirectional': True,
    'num_features': 4,
    'num_lstm_layers': 2,
    'hidden_size': 16,
    'dropout': 0.5,
    'out1': 80,
    'out2': 1,
    'k': 30,
}
# lstm_model = SimpleLSTM(lstm_args).cuda()
# lstm_model(train_set.x[:5].cuda())
# Use the GPU when one is present; otherwise keep the model on the CPU so the
# cell still runs.
ulstm_model = UnsupLSTM(lstm_args).to("cuda" if torch.cuda.is_available() else "cpu")
# Smoke test on five samples. UnsupLSTM embeds its input, so train_set must be
# the integer-token (SeqDataset) version here.
ulstm_model(train_set.x[:5].to(next(ulstm_model.parameters()).device))

tensor([[ 0.0491],
        [-0.3558],
        [ 0.0557],
        [-0.4663],
        [-0.2980]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [123]:
# ulstm_model.save("../model/bilstm_layer2_drpout5_fc80_k30_688.model")

In [143]:
# Train for up to 25 epochs with batch size 64; endo_dataset is passed as val_set.
# cv=0.0 keeps every training sample. `start` / `end` are accepted by train()
# but its body never reads them.
train(ulstm_model, train_set, 25, 64, endo_dataset, cv=0.0, learning_rate=0.001, start=0, end=12832)

201it [00:01, 101.87it/s]
10it [00:00, 92.40it/s]

0
0.6514870180959874 0.7726939379666363


201it [00:02, 97.51it/s]
9it [00:00, 87.57it/s]

1
0.648383949645948 0.7736318127675104


201it [00:02, 94.89it/s]
9it [00:00, 85.13it/s]

2
0.6420645161290323 0.7762588269156351


201it [00:02, 95.76it/s]
9it [00:00, 86.53it/s]

3
0.6466467348544453 0.7765575842051453


201it [00:02, 95.65it/s]
9it [00:00, 86.91it/s]

4
0.6425491738788356 0.7670576971823114


201it [00:02, 94.71it/s]
1it [00:00,  9.25it/s]

5
0.6407301337529504 0.7711141570520493


201it [00:02, 90.00it/s]
9it [00:00, 86.05it/s]

6
0.6356947285601888 0.7761051523159097


201it [00:02, 93.60it/s]
9it [00:00, 86.49it/s]

7
0.6386026750590087 0.7686307182410883


201it [00:02, 91.49it/s]
9it [00:00, 84.83it/s]

8
0.6454697088906373 0.7686479020609693


201it [00:02, 90.42it/s]
9it [00:00, 81.34it/s]

9
0.627795436664044 0.7678744794310618


201it [00:02, 91.19it/s]
9it [00:00, 87.62it/s]

10
0.6163587726199843 0.7630289436951415


201it [00:02, 93.61it/s]
1it [00:00,  8.63it/s]

11
0.6258882769472855 0.7706887067747323


201it [00:02, 89.18it/s]
9it [00:00, 83.73it/s]

12
0.6378788355625492 0.7747196691541879


201it [00:02, 91.03it/s]
9it [00:00, 83.61it/s]

13
0.6260015735641228 0.7678057441515378


201it [00:02, 91.41it/s]
9it [00:00, 81.54it/s]

14
0.636324154209284 0.7680929701934962


201it [00:02, 92.33it/s]
9it [00:00, 86.17it/s]

15
0.6306530291109362 0.7614557951320876


201it [00:02, 92.96it/s]
9it [00:00, 84.96it/s]

16
0.6359150275373722 0.7676389555840085


201it [00:02, 92.97it/s]
9it [00:00, 82.65it/s]

17
0.6281667977970102 0.7649730517382584


201it [00:02, 92.91it/s]
1it [00:00,  8.91it/s]

18
0.6303383162863887 0.7603805251397966


201it [00:02, 88.01it/s]
9it [00:00, 82.03it/s]

19
0.629243115656963 0.7596843543317225


201it [00:02, 91.08it/s]
9it [00:00, 83.63it/s]

20
0.6287710464201416 0.7700559955120083


201it [00:02, 90.71it/s]
9it [00:00, 84.49it/s]

21
0.6255169158143195 0.7676260677190978


201it [00:02, 89.92it/s]
9it [00:00, 82.39it/s]

22
0.6485161290322581 0.7710027636933471


201it [00:02, 88.18it/s]
9it [00:00, 87.38it/s]

23
0.6381180173092054 0.7676404629366297


201it [00:02, 91.50it/s]

24
0.6303446105428796 0.7592017753900643





In [144]:
# Second run on the same ulstm_model object: training continues from the
# weights left by the previous call, while train() builds a new Adam optimizer.
train(ulstm_model, train_set, 25, 64, endo_dataset, cv=0.0, learning_rate=0.001, start=0, end=12832)

201it [00:02, 94.01it/s]
0it [00:00, ?it/s]

0
0.6329756097560976 0.7611645746056831


201it [00:02, 94.69it/s]
9it [00:00, 82.24it/s]

1
0.6362863886703383 0.7671437148803326


201it [00:02, 92.31it/s]
9it [00:00, 84.98it/s]

2
0.6265491738788356 0.7564083495125655


201it [00:02, 93.28it/s]
9it [00:00, 84.91it/s]

3
0.62479307631786 0.7601850215048346


201it [00:02, 93.24it/s]
9it [00:00, 85.78it/s]

4
0.6318237608182533 0.7604296648352458


201it [00:02, 93.43it/s]
9it [00:00, 87.78it/s]

5
0.6435121951219512 0.7635109950633825


201it [00:02, 93.63it/s]
1it [00:00,  8.98it/s]

6
0.6242140047206923 0.7574096084911585


201it [00:02, 87.80it/s]
9it [00:00, 84.64it/s]

7
0.6274807238394965 0.7635948038691179


201it [00:02, 90.49it/s]
9it [00:00, 82.43it/s]

8
0.635014948859166 0.7571855405240259


201it [00:02, 90.64it/s]
9it [00:00, 84.04it/s]

9
0.6353044846577498 0.7590399610861848


201it [00:02, 90.54it/s]
9it [00:00, 88.27it/s]

10
0.6354429583005508 0.7603487953671215


201it [00:02, 98.19it/s]
10it [00:00, 91.46it/s]

11
0.6197324940991346 0.7505742165600698


201it [00:02, 100.45it/s]
10it [00:00, 91.71it/s]

12
0.6289976396538159 0.7635351880729518


201it [00:02, 99.55it/s]
2it [00:00, 19.60it/s]

13
0.6242077104642014 0.7610087897122882


201it [00:02, 96.44it/s]
9it [00:00, 87.87it/s]

14
0.6204311565696302 0.7508036356290075


201it [00:02, 100.34it/s]
10it [00:00, 92.64it/s]

15
0.6281605035405192 0.7480255847482444


201it [00:02, 100.00it/s]
10it [00:00, 91.32it/s]

16
0.6165601888276947 0.7452096239490597


201it [00:02, 99.88it/s] 
10it [00:00, 92.93it/s]

17
0.6271093627065303 0.75348182439862


201it [00:02, 99.85it/s] 
9it [00:00, 82.47it/s]

18
0.6267883556254917 0.7581436892176542


201it [00:02, 99.68it/s] 
1it [00:00,  8.71it/s]

19
0.6355940204563336 0.7515520362054041


201it [00:02, 94.70it/s]
10it [00:00, 91.99it/s]

20
0.6242140047206923 0.7565761178592985


201it [00:02, 98.15it/s] 
9it [00:00, 84.71it/s]

21
0.6186750590086546 0.7533546038373957


201it [00:02, 97.06it/s]
10it [00:00, 93.41it/s]

22
0.6442675059008655 0.757152680236885


201it [00:02, 98.65it/s]
9it [00:00, 84.00it/s]

23
0.6326986624704957 0.753876750785359


201it [00:02, 97.89it/s]

24
0.6261148701809599 0.7528584587221471





In [114]:
# train(lstm_model, train_set, 25, 64, endo_dataset, cv=0.0, learning_rate=0.0005, start=0, end=12832)

In [115]:
# train(lstm_model, train_set, 25, 64, endo_dataset, cv=0.0, learning_rate=0.0005, start=0, end=12832)

In [145]:
#
def test_scores(model, test_set, batch_size):
    tscores = []
    for inp, scores in DataLoader(test_set, batch_size=batch_size):
        model.zero_grad()
        if model.model_type == "LSTM":
#                 print("INIT LSTM")
            model.init_hidden(len(inp))
        inp = inp.unsqueeze(1)
        inp = Variable(inp)
        out = model(inp).squeeze()
        for v in out.detach().numpy():
            tscores.append(v)
    return tscores

In [299]:
def check_corr(model, test_set):
    """Spearman rank correlation between dataset labels and model predictions.

    Predictions come from ``test_scores`` with a batch size of 500. As a side
    effect, the model's dropout is set to 0.0 via ``setDropout`` and left
    that way.
    """
    model.setDropout(0.0)
    predicted = test_scores(model, test_set, 500)

    column_names = ['seq', 'endogenous', 'integrated']
    table = pd.DataFrame(
        [[test_set[pos][0], float(test_set[pos][1]), predicted[pos]]
         for pos in range(len(predicted))],
        columns=column_names,
    )
    spearman = table[['endogenous', 'integrated']].corr(method='spearman')
    return spearman['endogenous']['integrated']

In [491]:
# NOTE(review): the only assignment of `lstm_model` visible in this notebook is
# commented out, so this cell relies on a variable left over in the kernel.
# The 2x2 matrix shown below also does not match check_corr as defined above,
# which returns a single coefficient; the output looks stale. TODO confirm.
check_corr(lstm_model, test_set)

            endogenous  integrated
endogenous    1.000000    0.767798
integrated    0.767798    1.000000
