In [1]:
import torch
import torch.nn as nn 
import pandas as pd
import numpy as np
import math, copy, time
import tqdm
import torch.nn.functional as F 
from torch.nn import functional as F
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler
import warnings
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, random_split
from datetime import date, datetime, time
from babel.dates import format_date, format_datetime, format_time
import torch.nn.functional as F
import math, copy
import time
from torch.autograd import Variable

def clones(module, N):


    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

if not torch.cuda.is_available():
    warnings.warn('CUDA is not available.')


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


DNA_ONE_HOT = {'A': [0, 0, 0, 1],
               'C': [0, 0, 1, 0],
               'G': [0, 1, 0, 0],
               'T': [1, 0, 0, 0]}

MAX_LEN = 60


def make_dataset(file_name='variable_Train_length(60)_dataset', targets=DNA_ONE_HOT):
    dataset = pd.read_csv(file_name, header=None, delimiter=',', dtype={0: str, 1: float})
    # np_data = dataset.to_numpy()
    x_array = dataset.values[:, 0].astype(str)
    y_array = dataset.values[:, 1].astype(float)

    # *** with max length of aptamer sequence 60 ***
    x_one_hot_array = np.zeros((len(x_array), MAX_LEN, 4))

    for i, sequence in enumerate(x_array):
        one_hot = np.array([targets[letter] for letter in sequence])
        padded_one_hot = np.pad(one_hot, ((0, MAX_LEN-one_hot.shape[0]), (0, 0)), mode='constant', constant_values=0)
        x_one_hot_array[i] = padded_one_hot
    
    x_tensor = torch.tensor(x_one_hot_array)    ### x_tensor.shape: [10000, 60, 4]
    y_tensor = torch.tensor(y_array)            ### y_tensor.shape: [10000]
    return TensorDataset(x_tensor, y_tensor)
    
def split_dataset(dataset, seed=42, train_split=0.8):
    n_examples = len(dataset)                  ### n_examples = 10000
    nb_train = int(n_examples * train_split)   ### nb_train = 8000
    train, test = random_split(dataset, [nb_train, n_examples-nb_train], generator=torch.Generator().manual_seed(seed))
    return train, test

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def train_epoch(model, data_loader, optimizer: torch.optim, loss_func=nn.MSELoss(), device=DEVICE):

    model.train()
    train_loss = 0
    train_acc = 0
    for batch in data_loader:
        optimizer.zero_grad()
        x = batch[0]
        y = batch[1]
        x = x.to(device)
        y = y.to(device)
        b_labels = batch[1].to(device)
        labels = b_labels.to('cpu').numpy()
        output = model.forward(x, mask=None)
        loss = loss_func(output, y)
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            train_loss += loss.sum().cpu().numpy()
        #tmp_train_accuracy = flat_accuracy(logits, labels)
        #train_acc += tmp_train_accuracy
        #histo'train_acc'].append(train_acc.item())
    print("avg_train loss :", train_loss/len(data_loader))
    #print("Training accuracy: {}", train_acc)

def valid_epoch(model, data_loader, loss_func=nn.MSELoss(), device=DEVICE):

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            x = batch[0]
            y = batch[1]
            x = x.to(device)
            y = y.to(device)

            output = model.forward(x, mask=None)
            loss = loss_func(output, y)
            valid_loss += loss.sum().cpu().numpy()

    print("Validation loss :", valid_loss/len(data_loader))

def train(model, train_set, validation_set, epochs, learning_rate=0.1, batch_size=1, loss_func=nn.MSELoss(), device=DEVICE):

    model.to(device)
    optimizer = torch.optim.Adam(list(model.parameters()), lr=learning_rate)
    train_dataloader = DataLoader(train_set, batch_size=batch_size)
    valid_dataloader = DataLoader(validation_set, batch_size=batch_size)

    for i in range(epochs):
        #t0 = time.time()
        train_epoch(model, train_dataloader, optimizer, loss_func, device)
        valid_epoch(model, valid_dataloader, loss_func, device)
        #print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, n_units, dropout=0.1):              # n_heads, n_units=d_model, dropout=0.):
        """
        n_heads: the number of attention heads
        n_units: the number of input and output units
        dropout: probability of DROPPING units
        """
        super(MultiHeadedAttention, self).__init__()
        # This sets the size of the keys, values, and queries (self.d_k) to all
        # be equal to the number of output units divided by the number of heads.
        self.d_k = n_units // n_heads                              #  final dimension
        # This requires the number of n_heads to evenly divide n_units.
        assert n_units % n_heads == 0
        self.n_units = n_units
        self.n_heads = n_heads
        # TODO ========================
        # Create the layers below. self.linears should contain 3 linear
        # layers that compute the projection from n_units => n_heads x d_k
        # (one for each of query, key and value) plus an additional final layer
        # (4 in total)

        # Note: that parameters are initialized with Glorot initialization in
        # the make_model function below (so you don't need to implement this
        # yourself).

        # Note: the only Pytorch modules you are allowed to use are nn.Linear
        # and nn.Dropout. You can also use softmax, masked_fill and the "clones"
        # function we provide.
        self.linears = clones(nn.Linear(n_units, n_units), 4)
        self.dropout = nn.Dropout(p=dropout)
        

    def attention(self, query, key, value, mask=None, dropout=None):                      #### add a dropout = 0.1 ?
        # Implement scaled dot product attention
        # The query, key, and value inputs will be of size
        # batch_size x n_heads x seq_len x d_k
        # (If making a single call to attention in your forward method)
        # and mask (if not None) will be of size
        # batch_size x n_heads x seq_len x seq_len

        # As described in the .tex, apply input masking to the softmax
        # generating the "attention values" (i.e. A_i in the .tex)

        # Also apply dropout to the attention values.
        # This method needs to compare query and keys first, then mask positions
        # if a mask is provided, normalize the scores, apply dropout and then
        # retrieve values, in this particular order.
        # When applying the mask, use values -1e9 for the masked positions.
        # The method returns the result of the attention operation as well as
        # the normalized scores after dropout.

        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
             scores = scores.masked_fill(mask == 0, -1e9)
        norm_scores = F.softmax(scores, dim = -1)

        if dropout is not None:
           norm_scores =  dropout(norm_scores)     # Tensor of shape batch_size x n_heads x seq_len x seq_len
        output = torch.matmul(norm_scores, value)  # Tensor of shape batch_size x n_heads x seq_len x d_k

        return output, norm_scores


    def forward(self, query, key, value, mask=None):
        # Implement the masked multi-head attention.
        # query, key, and value correspond to Q, K, and V in the latex, and
        # they all have size: (batch_size, seq_len, self.n_units)
        # mask has size: (batch_size, seq_len, seq_len)
        # This method should call the attention method above
        # TODO ========================
        # 1) Do all the linear projections in batch from n_units => n_heads x d_k

        # 2) Apply attention on all the projected vectors in batch.
        # The query, key, value inputs to the attention method will be of size
        # batch_size x n_heads x seq_len x d_k

        # 3) "Concat" using a view and apply a final linear.
        #print('the code is in MultiheadAttention')
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k               ### perform linear operation and split into h heads
        query, key, value = [l(x).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)  ### transpose to get dimensions bs * h * sl * d_model
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch.                   ### calculate attention using function we will define next
        x, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear.                          ### concatenate heads and put through final linear layer
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_k)
        return self.linears[-1](x)


#----------------------------------------------------------------------------------
# The encodings of elements of the input sequence

class WordEmbedding(nn.Module):
    def __init__(self, n_units, vocab):
        super(WordEmbedding, self).__init__()
        self.lut = nn.Embedding(vocab, n_units)
        self.n_units = n_units

    def forward(self, x):
        #print (x)
        #print('the code is in WordEmbdding')
        return self.lut(x) * math.sqrt(self.n_units)


class PositionalEncoding(nn.Module):
    def __init__(self, n_units, dropout, max_len=5000):                   #### d_model = n_units
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, n_units)                                  
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_units, 2).float() * -(math.log(10000.0) / n_units))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)],
                         requires_grad=False)
        #print('the code is in PositionalEncoding')
        return self.dropout(x)



#----------------------------------------------------------------------------------
# The TransformerBlock and the full Transformer

class TransformerBlock(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(TransformerBlock, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(ResidualSkipConnectionWithLayerNorm(size, dropout), 2)  #clones(ResidualSkipConnectionWithLayerNorm(size, dropout), 2)

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) # apply the self-attention
        #print('the code is in the transformer block')
        return self.sublayer[1](x, self.feed_forward) # apply the position-wise MLP


class TransformerStack(nn.Module):
    """
    This will be called on the TransformerBlock (above) to create a stack.
    """
    def __init__(self, layer, n_blocks): # layer will be TransformerBlock (below)
        super(TransformerStack, self).__init__()
        self.layers = clones(layer, n_blocks)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
            #print("--------------------")
            #print('the code is in the transformer stack')
            #print("--------------------")
        return self.norm(x)


class FullTransformer(nn.Module):
    def __init__(self, transformer_stack, embedding, n_units, vocab_size):
        super(FullTransformer, self).__init__()
        self.transformer_stack = transformer_stack
        self.embedding = embedding
        self.output_layer = nn.Linear(n_units, vocab_size)

    def forward(self, input_sequence, mask):
        #embeddings = self.embedding(input_sequence)
        #print('----------')
        #print(input_sequence.shape)
        #print('------------')
        return self.output_layer(self.transformer_stack(input_sequence, mask))  #F.log_softmax


def make_model(vocab_size, n_blocks=6, n_units=512, n_heads=16, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(n_heads, n_units)
    ff = MLP(n_units, dropout)
    position = PositionalEncoding(n_units, dropout)
    model = FullTransformer(
        transformer_stack=TransformerStack(TransformerBlock(n_units, c(attn), c(ff), dropout), n_blocks),
        embedding=nn.Sequential(WordEmbedding(n_units, vocab_size), c(position)),
        n_units=n_units,
        vocab_size=vocab_size
        )

    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


#----------------------------------------------------------------------------------
# Data processing

def subsequent_mask(size):
    """ helper function for creating the masks. """
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, x, pad=0):
        self.data = x
        self.mask = self.make_mask(self.data, pad)

    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide future words."
        mask = (data != pad).unsqueeze(-2)
        mask = mask & Variable(
            subsequent_mask(data.size(-1)).type_as(mask.data))
        return mask


#----------------------------------------------------------------------------------
# Some standard modules

class LayerNorm(nn.Module):
    "layer normalization, as in: https://arxiv.org/abs/1607.06450"
    def __init__(self, n_units, eps=1e-6):       ##n_units                 ### (self, features, eps=1e-6)
        super(LayerNorm, self).__init__()                                   ### super().__init__()
        #self.size = n_units
        # create two learnable parameters to calibrate normalisation
        self.a_2 = nn.Parameter(torch.ones(n_units))
        self.b_2 = nn.Parameter(torch.zeros(n_units))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        #self.a_2.reshape(8, 4)
        #print(self.a_2.shape)
        #print(self.a_2) 
        #print(self.a_2.shape)    #### 32
        #print(x.shape)           #### torch.Size([5, 60, 4])
        #print(x) 
        #print(self.b_2.shape)    ### torch.Size([32])
        #print(self.b_2)
        #self.b_2.view(32, 1)
        #print((self.a_2 * ((x.view(len(A), -1) - mean)))
        #x.view(x.size(0), -1, 32)
        #print((x-mean).shape)
        #print(self.a_2.shape) 
        #print(self.b_2.shape)
        return self.a_2*(x - mean) / (std + self.eps) + self.b_2        ### x.mean(dim=-1, keepdim=True)   #self.a_2  #+ self.b_2
        #return  self.a_2 * (x - mean) / (std + self.eps) + self.b_2   #.reshape(self.a_2.shape(0), -1)
        #return self.a_2 * x - mean)))        ###/ (std + self.eps) + self.b_2.view(32, -1)


class ResidualSkipConnectionWithLayerNorm(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(ResidualSkipConnectionWithLayerNorm, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x))) 


class MLP(nn.Module):
    """
    This is just an MLP with 1 hidden layer
    """
    def __init__(self, n_units, dropout=0.1):
        super(MLP, self).__init__()
        self.w_1 = nn.Linear(n_units, 2048)
        self.w_2 = nn.Linear(2048, n_units)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [2]:
dataset = make_dataset(file_name='variable_Train_length(60)_dataset.csv', targets=DNA_ONE_HOT)

In [3]:
dataset

<torch.utils.data.dataset.TensorDataset at 0x7f286c4e4af0>

In [4]:
dataset
train_set, validation_set = split_dataset(dataset)

In [None]:
dataset = make_dataset(file_name='variable_Train_length(60)_dataset.csv', targets=DNA_ONE_HOT)
train_set, validation_set = split_dataset(dataset)
#print(train_set)
#print(validation_set)
#MultiHeadedAttention(3, 3, dropout=0.1)
model = make_model(1, n_blocks=4, n_units=4, n_heads=2, dropout=0.1)
#mlp_model = MLP(240, 1)
train(model.double(), train_set, validation_set, epochs=50, learning_rate=0.1, batch_size=5, loss_func=nn.MSELoss(), device=DEVICE)

  return F.mse_loss(input, target, reduction=self.reduction)


avg_train loss : 5.587548717807001
Validation loss : 5.353714745025054
avg_train loss : 5.34592988766706
Validation loss : 5.221790920041185
avg_train loss : 5.395738368942658
Validation loss : 5.289446068414807
avg_train loss : 5.387717590503836
Validation loss : 5.226348782181661
avg_train loss : 5.401153480166866
Validation loss : 5.2224646079185835


In [None]:
def clones(module, N):
    """
    A helper function for producing N identical layers (each with their own parameters).
    inputs:
        module: a pytorch nn.module
        N (int): the number of copies of that module to return
    returns:
        a ModuleList with the copies of the module (the ModuleList is itself also a module)
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:
pip install tqdm

In [None]:
pip install dataloader

In [None]:
pip install random_split