# Preparation

## Import packages

In [1]:
# Inputs – from Nuo
import pickle
import torch
from torch import nn
import numpy as np
from torch.nn import Module

## Load traning and testing datasets

### Full-length transcripts

In [87]:
with open("proc/training_dataset.pkl", "rb") as f:
    training_dataset = pickle.load(f)

with open("proc/test_dataset.pkl", "rb") as f:
    test_dataset = pickle.load(f)

### 80nt transcripts

In [90]:
with open("proc/training_dataset_80nt.pkl", "rb") as f:
    training_dataset = pickle.load(f)

with open("proc/test_dataset_80nt.pkl", "rb") as f:
    test_dataset = pickle.load(f)

### Define NtDataset

In [99]:
# Dataset – from Hannah and Lucas

class NtDataset:
    """Nucleotide sequence + splice sites dataset."""
    def __init__(self, dataset):
        self.dataset = dataset
        self.map = {'A':0, 'G':1, 'C':2, 'T':3}
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        nt_seq = self.dataset[idx][0]
        strengths = self.dataset[idx][1].type(torch.LongTensor)
        
        tokenized_seq = []
        
        for letter in nt_seq:
            tokenized_seq.append(self.map[letter])
            
        return torch.tensor(tokenized_seq), torch.tensor(strengths)

## Model

In [92]:
# from torch.optim import optim

class SpliceFormer(nn.Module):
    """Transformer for splice site prediction"""

    def __init__(
        self,
        vocab_size:int,
        model_dim: int,
        n_attn_heads: int,
        n_encoder_layers: int,
        hidden_act: Module,
        dropout: float,
    ) -> None:

        super().__init__()

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=n_attn_heads,
            dim_feedforward=model_dim,
            dropout=dropout,
            activation=hidden_act,
            batch_first=True)

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim=model_dim)
        
        self.vocab_size = vocab_size

        self.encoder = nn.TransformerEncoder(
            encoder_layer=self.encoder_layer, num_layers=n_encoder_layers)

        self.out_layer = nn.Linear(in_features=model_dim, out_features=3, bias=False)
        # self.lm_head.weight = self.embedding.weight

    def forward(self, inputs):
        #self.vocab_size,
        x_emb = self.embedding(inputs)

        # inputs: (batch_size, seq_len, n_tokens)
        encoder_output = self.encoder(x_emb)
        outputs = self.out_layer(encoder_output)

        return outputs

# Training

In [93]:
training_dataset_subset = training_dataset[:100]

In [108]:
nucleotide_loader = NtDataset(training_dataset_subset)

In [110]:
type(nucleotide_loader)

__main__.NtDataset

In [None]:
nucleotide_loader

In [109]:
for seq_number, nucleotide_seq in enumerate(nucleotide_loader):
    inputs, labels = nucleotide_seq

AttributeError: 'numpy.ndarray' object has no attribute 'type'

In [98]:
# Training

n_epochs = 2
nucleotide_loader = NtDataset(training_dataset_subset)
loss_fn = nn.CrossEntropyLoss()
splice_model = SpliceFormer(vocab_size=4, 
                            model_dim=64, 
                            n_attn_heads=2, 
                            n_encoder_layers=2, 
                            hidden_act=nn.ReLU(), 
                            dropout=0.1)

optimizer = torch.optim.SGD(splice_model.parameters(), lr=0.0001, momentum=0.9)

# training loop 
for epoch in range(n_epochs):

    running_loss = 0.0
    for seq_number, nucleotide_seq in enumerate(nucleotide_loader):
        inputs, labels = nucleotide_seq

        optimizer.zero_grad()
        
        # error is here!
        outputs = splice_model(inputs)
        print(outputs)
        # print(labels)

        total_loss = loss_fn(outputs, labels)
        total_loss.backward()
        optimizer.step()

        running_loss += total_loss.item()
        if seq_number % 10 == 0:
            print(f'epoch: {epoch}, step: {seq_number}, loss: {running_loss}')
            torch.save(splice_model.state_dict(), f'./tbh_model_{seq_number}.pth')
            # running_loss = 0.0

            
print("Finished training!\nFinal loss value:", total_loss)
torch.save(splice_model.state_dict(), './tbh_model_final.pth')

tensor([[-1.5741e-01,  5.7796e-01,  3.6048e-01],
        [ 5.3551e-02,  5.4709e-01, -2.5090e-01],
        [ 1.1731e-01,  8.0545e-02, -2.5017e-01],
        [ 1.0592e+00, -3.9247e-02,  4.1319e-01],
        [ 1.0845e+00, -1.6636e-02,  3.6382e-01],
        [ 9.8309e-01,  3.0090e-02,  4.6850e-01],
        [ 1.0638e+00,  1.0154e-02,  6.0749e-01],
        [-1.7633e-01,  4.3838e-01, -4.7099e-01],
        [ 1.2359e+00,  1.4398e-01,  5.2272e-01],
        [ 9.8833e-02,  6.2473e-01, -2.7987e-01],
        [ 1.2760e-01,  4.7684e-01, -1.7856e-01],
        [ 1.0098e+00,  4.8979e-02,  3.7940e-01],
        [ 7.4366e-02,  2.0094e-01, -2.2009e-01],
        [-5.1121e-02, -1.8168e-02, -1.5731e-01],
        [ 9.0506e-02,  6.6663e-01, -2.2833e-01],
        [-1.6358e-01,  5.9888e-01, -2.6920e-01],
        [ 1.0553e-01,  7.4435e-01, -3.6213e-01],
        [-5.3668e-04,  5.2576e-01, -5.5156e-01],
        [-3.8144e-01,  6.6507e-01,  1.4273e-01],
        [-1.8656e-01,  4.3773e-01, -1.1563e-01],
        [-7.1222e-02

RuntimeError: expected scalar type Long but found Half

In [67]:
for seq_number, nucleotide_seq in enumerate(nucleotide_loader):
    inputs, labels = nucleotide_seq
    labels = labels.reshape(-1)
    # labels = labels.unsqueeze(0)
    break

outputs = splice_model(inputs)
labels = labels.type(torch.LongTensor)
# outputs = outputs.unsqueeze(0)
print(outputs.shape, labels.shape)
total_loss = loss_fn(outputs, labels)
print(total_loss)
# print(outputs.shape, labels.shape)

torch.Size([80, 3]) torch.Size([80])
tensor(1.2545, grad_fn=<NllLossBackward0>)


In [None]:
# Test

nucleotide_test_loader = NtDataset(test_dataset)
splice_test = SpliceFormer()
splice_test.load_state_dict(torch.load('./tbh_model_final.pth', weights_only = True)) 

outputs = splice_test.(nucleotide_seq)

