In [1]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
from collections import defaultdict
import pandas as pd

In [3]:
from tqdm import tqdm

In [4]:
# with tqdm(total=int(len())) as pbar:

In [2]:
class BasicModule(nn.Module):
    def __init__(self):
        super(BasicModule, self).__init__()
        self.model_name = str(type(self))

    def load(self, path):
        self.load_state_dict(torch.load(path))

    def save(self, path):
        torch.save(self.state_dict(), path)

In [416]:
model.lstm

LSTM(4, 100, batch_first=True)

In [417]:
class SimpleLSTM(BasicModule):
    def __init__(self,args):
        super(SimpleLSTM, self).__init__()
        self.dropout = nn.Dropout(args['dropout'])
        self.params = args
        self.lstm = nn.LSTM(input_size=args['num_features'], hidden_size=args['hidden_size'], num_layers=args['num_layers'],
                            batch_first=True, dropout=args['dropout'], bidirectional=args['bidirectional'])
        self.fc1 = nn.Linear(args['hidden_size'], args['out1'], bias=True)
        self.fc2 = nn.Linear(args['out1'], args['out2'], bias=True)
        self.hidden = None
        
    def init_hidden(self, batch_size):
        N = 2 if self.params['bidirectional'] else 1
        hidden = (Variable(torch.zeros(self.params['num_layers']*N, batch_size, self.params['hidden_size'])),
                  Variable(torch.zeros(self.params['num_layers']*N, batch_size, self.params['hidden_size'])))
        self.hidden = hidden
        
    
    def forward(self, x):
        batch_size = x.size(0)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        self.hidden = (Variable(self.hidden[0]), Variable(self.hidden[1]))
#         print(lstm_out.shape, lstm_out[:, -2:, :].shape)
#         masker = torch.Tensor([[[0]*29+[1]]*30] * batch_size)
        # stack up lstm outputs
#         lstm_out = lstm_out[:, -1:, :].contiguous().view(batch_size, -1)
        lstm_out = torch.tanh(lstm_out)
        lstm_out = torch.transpose(lstm_out, 1, 2)
        lstm_out = F.max_pool1d(lstm_out, lstm_out.size(2)).squeeze(2)
#         print(lstm_out.shape)
#         out = self.dropout(lstm_out)
        out = self.fc1(lstm_out)
        out = torch.tanh(out)
        out = self.fc2(out)
#         out = (out * torch.Tensor([[0]]*29+[[1]])).sum(1)
#         out = (out * masker).sum(1)
#         print(out.shape)
        
        return out
    
    def setDropout(self, dropout):
        self.dropout.p = dropout

In [424]:
class DNADataset(Dataset):
    
    def __init__(self, data, size):
        super(DNADataset, self).__init__()
        self.size = size
        self.x = np.zeros((len(data),size, 4))
        self.y = []

        for I in range(len(data)):
            self.y.append(data[I][1])
            if type(data[I][0])==str:
                for i in range(size):
                    seq = data[I][0].lower()
                    # one hot encoding
                    pos = "acgt".find(seq[i])
                    if pos >= 0:
                        self.x[I][i][pos] = 1
#             else:
#                 self.x[I] = data[I][0]
        self.x = torch.FloatTensor(self.x)
        self.y = torch.FloatTensor(self.y)

        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
        
    # return a subset of dataset of given range
    def get_subset(self, start, end):
        if end == 0:
            return self
        return DNADataset([(self.x[i],self.y[i]) for i in range(start, end)], self.size)

In [332]:
#
def load_data(file):
    data = []
    with open(file,"r") as f:
        for line in f:
            line = line.split()
            data.append((line[0], float(line[1])))
    return data

In [425]:
#
train_set = DNADataset(load_data("../dataset/train_txt"),30)
test_set = DNADataset(load_data("../dataset/test_txt"),30)
endo = load_data("../dataset/table3")
endo_dataset = DNADataset(endo,30)

In [440]:
args = {
    'bidirectional': False,
    'num_features': 4,
    'num_layers': 1,
    'hidden_size': 100,
    'dropout': 0.0,
    'out1': 60,
    'out2': 1
}
model = SimpleLSTM(args)


In [444]:
# h = model.init_hidden(3)
model.init_hidden(120)
model.eval()
out = model(train_set.x[:120].view(120,30,4))

In [445]:
out

tensor([[41.0392],
        [41.0391],
        [41.0391],
        [41.0392],
        [41.0390],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0392],
        [41.0390],
        [41.0391],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0391],
        [41.0392],
        [41.0392],
        [41.0391],
        [41.0390],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0392],
        [41.0391],
        [41.0392],
        [41.0390],
        [41.0392],
        [41.0391],
        [41.0392],
        [41.0391],
        [41.0391],
        [41.0391],
        [41.0392],
        [41.0390],
        [41.0391],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0390],
        [41.0392],
        [41.0392],
        [41.0390],
        [41.0390],
        [41.

In [443]:
# model.init_hidden(120)
train(model, train_set, 1, 1, endo_dataset, cv=0.0, learning_rate=0.003, start=0, end=1200)

Train 1200.0 samples.
741341.8553246165


In [352]:
train(model, train_set, 2, 120, endo_dataset, cv=0.0, learning_rate=0.01, start=0, end=12832)

Train 12832.0 samples.
0.0024108947063723463
0.0005728969936171779


In [315]:
train(model, train_set, 2, 120, endo_dataset, cv=0.0, learning_rate=0.003, start=0, end=12832)

Train 12832.0 samples.
0.0009789430991986592
0.0005820328392474039


In [289]:
model.dropout

Dropout(p=0.3, inplace=False)

In [277]:
out

tensor([[0.0034],
        [0.0039],
        [0.0043]], grad_fn=<AddmmBackward>)

In [314]:
def train(model, train_dataset, epochs, batch_size, val_set, cv=0.1, learning_rate=0.001, start = 0, end = 0):
    subset = train_dataset.get_subset(start, end)
    n_training_samples = len(subset) * (1-cv)
    n_val_samples = len(subset) * cv
    train_loader =torch.utils.data.DataLoader(subset, batch_size=batch_size,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, dtype=np.int64)
                                              ),
                                              num_workers=0)
    val_loader =torch.utils.data.DataLoader(subset, batch_size=100,
                                              sampler=SubsetRandomSampler(
                                                  np.arange(n_training_samples, n_training_samples + n_val_samples, dtype=np.int64)
                                              ), num_workers=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    print("Train %s samples."%n_training_samples)
    torch.autograd.set_detect_anomaly(False)
    
    
    for e in range(epochs):
        # train loss
        epoch_train_loss = 0.0
#         model.setDropout(model.params['dropout'])
#         h = model.init_hidden(batch_size)

        for inp, scores in train_loader:
            model.zero_grad()
            inp = Variable(inp)
            out = model(inp)
#             out = out.squeeze()

            scores = Variable(torch.FloatTensor(scores))
#             print(scores, out)
            loss = criterion(out.squeeze(), scores.squeeze())
            epoch_train_loss += float(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print(epoch_train_loss)
        # validation loss
#         epoch_val_loss = 0.0
#         for inp, labels in val_loader:
#             inp = inp.unsqueeze(1)
#             out = model(inp)        
#             loss = criterion(out, labels)
#             epoch_val_loss += loss


#         print(str(epoch_train_loss) + ' ')
#         print(str(epoch_val_loss.tolist()) + '\n')
#         model.setDropout(0.0)
#         print(e)
#         tscores = test_scores(model, val_set, 400)
#         corr = pd.DataFrame([[val_set[i][0], float(val_set[i][1]), tscores[i]] for i in range(len(tscores))], columns=['seq', 'endogenous', 'integrated'])\
#     [['endogenous', 'integrated']].corr(method='spearman')['endogenous']['integrated']
#         print(corr)
#         if corr >= 0.68:
#             print("Saving.")
#             model.save("../model/cnn_3579_90-60-40-110_80-60_90_%s.model"%(round(corr, 3)))

In [281]:
float(torch.Tensor([2])[0])

2.0

Train 12832.0 samples.
3.1046278907515443
0.0012913294376630802
0.0010612283604132244
0.000902866102023836
0.0008657001490064431
0.0008301812645186146
0.0008023533578125353
0.0008027298767956381
0.0008257292215603229
0.0008005101835806272
0.0007707490303801023
0.0007802022068972292
0.0008914617656046175


KeyboardInterrupt: 

In [371]:
#
def test_scores(model, test_set, batch_size):
    tscores = []
    for inp, scores in DataLoader(test_set, batch_size=batch_size):
        model.zero_grad()
        inp = Variable(inp)
        out = model(inp)
        for v in out.detach().numpy():
            tscores.append(v)
    return tscores

In [372]:
tscores = test_scores(model, test_set.get_subset(0,120), 120)
combined_df = pd.DataFrame([[test_set[i][0], float(test_set[i][1]), tscores[i]] for i in range(len(tscores))], columns=['seq', 'endogenous', 'integrated'] )

torch.Size([120, 100])


In [373]:
combined_df

Unnamed: 0,seq,endogenous,integrated
0,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.441258,[0.0054920986]
1,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.121632,[0.004570067]
2,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.689013,[0.005301632]
3,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.131357,[0.004474297]
4,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.483552,[0.004242435]
...,...,...,...
115,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.711395,[0.0054760613]
116,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.057450,[0.0041618384]
117,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.371841,[0.005058266]
118,"[[tensor(1.), tensor(0.), tensor(0.), tensor(0...",0.262636,[0.0037241094]
