# Building neural network
The main steps to create a neural network is to create the data loaders that will wrap around my dataset, and create the `class` that will represent the neural network.
The neural network has to include two main functions, `__init__` and `forward`.
For the DataLoader obejct there have to be 3 other functions, `init`, `len` and `getitem`.

In [1]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset, DataLoader
import os
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from datasets import Dataset
import math
from torch.optim import AdamW

# Useful paths

In [2]:
databases_path = "/mnt/sda1/Databases/"
# GWAS Catalog path
gwas_catalog_path = os.path.join(databases_path, "GWAS_Catalog_DATA/gwas_catalog_v1.0.2-associations_e110_r2023-09-25.tsv")
# Ensembl Variation path
ensembl_path = os.path.join(databases_path, "Ensembl/Variation/110/")
# Chromosomes' data path
chromosomes_path = os.path.join(ensembl_path, "chromosomes_data/")
# Reference genome path
ref_genome_path = os.path.join(databases_path,"Reference_Genome/GRCh38p14/Ensembl/Homo_sapiens_GRCh38_dna_primary_assembly.fa")
# GWAS Associated bed and sequences path
gwas_associated_bed_path = os.path.join(databases_path, "Ensembl/Variation/110/gwas_associated_sequences/beds")
gwas_associated_seq_path = os.path.join(databases_path, "Ensembl/Variation/110/gwas_associated_sequences/ref_sequences")
rand_bed_path = os.path.join(databases_path, "Ensembl/Variation/110/random_sequences/beds")
rand_seq_path = os.path.join(databases_path, "Ensembl/Variation/110/random_sequences/ref_sequences")
# Datasets path
dataset_path = os.path.join(ensembl_path, "chromosome_datasets/")

# Transform csv to dataloader
We will use the Hugging Face framework to go from a conventional Pandas dataframe to completely useful dataloaders for the neural network.

## Import chromosome 2 data

In [3]:
chr2_dataset = pd.read_csv(os.path.join(ensembl_path, 'to_dataloaders/chr2_dataset.csv'), index_col=0)
chr2_dataset.head()

Unnamed: 0,seq,label
0,TTCTTTTAAATGGCTACATAATTAAGTCTAAGGTGAGAATTACTGT...,0
1,AGTATTATTTCTATTCTTTACTTGAGAATCCAGTTTTGTAGGACTT...,0
2,ATTTTTTTTTTTCGAGTCAGAGTCTTGTTCTGTCACCCAGGCTGGA...,0
3,CTGCTTTGGGTTCTGCTGAAATCATGGATGAGTTCTTTCTTTAAAT...,0
4,ATTAATATTTACCCTGTATATTTTCACAGGACCATTATATTGATCA...,0


## Dataset and DataLoader

In [4]:
# Import the tokenizer and the data_collator for dynamic padding. Both from the HF transformers library
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Using the HuggingFace framework to create the Datasets and DataLoaders and split the data into train and test set
hf_dataset = Dataset.from_pandas(chr2_dataset).train_test_split(test_size=0.3)
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['seq', 'label', '__index_level_0__'],
        num_rows: 46102
    })
    test: Dataset({
        features: ['seq', 'label', '__index_level_0__'],
        num_rows: 19758
    })
})

In [5]:
def tokenize_function(example):
    return tokenizer(example["seq"], truncation=True)

tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/46102 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/19758 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['seq', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 46102
    })
    test: Dataset({
        features: ['seq', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19758
    })
})

In [6]:
tokenized_datasets = tokenized_datasets.remove_columns(["seq", "__index_level_0__"])
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

{'train': ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['label', 'input_ids', 'token_type_ids', 'attention_mask']}

### Create the DataLoaders

In [7]:
train_set = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True, collate_fn=data_collator)
test_set = DataLoader(tokenized_datasets["test"], batch_size=8, shuffle=True, collate_fn=data_collator)

Inspect a batch from the DataLoaders:

In [8]:
for batch in train_set:
    break
{k: v.shape for k, v in batch.items()}

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 30]),
 'token_type_ids': torch.Size([8, 30]),
 'attention_mask': torch.Size([8, 30]),
 'labels': torch.Size([8])}

## Transformer encoder model

This will be only the transformer encoder

In [15]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, 1) # Changed the second parameter from `ntoken` to 2. Because we are aiming for sequences classificication between 2 classes

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src, src_mask) # Deleted parameter src_mask
        output = output.permute(1,0,2)
        output = self.linear(output)
        return output

In [16]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [17]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [18]:
ntokens = tokenizer.vocab_size  # size of vocabulary
emsize = 200  # embedding dimension
d_hid =  200 # Deafult:200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 5  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 5  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)



In [19]:
model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (embedding): Embedding(4096, 200)
  (linear): Linear(in_features=200, out_features=1, bias=True)
)

In [21]:
for batch, elements in enumerate(train_set):
    elements = {k: v.to(device) for k, v in elements.items()}
    print(batch, elements)
    break

0 {'input_ids': tensor([[   1,    5, 2725,   40,  128,   68,  686,   34,  197,  221,   35,  326,
           64, 2527,  174,  471,   45, 1110, 1902,  282,   60,  211,   73,  111,
           40,  145,   27,  187,    8,    2],
        [   1,    9,   10,   81,   32,  494,   30,  196,   20,  243,  110, 1252,
           84,  513,   79,   38,   60, 2221,   34,  319,  195, 1057,  823,  112,
         1470, 1198,  121,   40,    2,    3],
        [   1,    5, 3137,   23,  164,  163,  179,  252,   82,  547, 1739,   51,
           66, 1065,   52, 2098,  251,  103,   56,   51,  819,   36,  373,  402,
          804,   97, 1038,    2,    3,    3],
        [   1,  481,   78,  162, 2540,  221,   76,  242,   26,  137, 1132,  793,
           49,  111, 1720, 3893,   32,  192,  483,   44,   54,   79,  143,  222,
         2935,  113,   49,    2,    3,    3],
        [   1,  191, 1202,  449,  101,   88,  292, 3405,  500,   73,   80,  239,
           55,  549,   58,  531, 1773,  295,   65, 3678,   45,  260, 24

In [None]:
elements.items()

In [22]:
for batch, elements in enumerate(train_set):
    #elements = {k: v.to(device) for k, v in elements.items()}
    X, y = elements['input_ids'].to(device), elements['labels'].to(device)
    print(batch, (X,y))
    break

0 (tensor([[   1,    5, 1782,   92, 1290,   57,  252,  532, 1346,  104,  815, 3069,
          188,   36,  114,  374,  301,   51,   74,  190,  596, 3366,  654, 3094,
          385,   32,    2,    3,    3,    3],
        [   1,   83, 3802,  596,   27,  259,   46,   57, 1069,  101,   77,   76,
         1212,   97, 1905, 3934, 1255,   48,   74, 2768,  811,  165,   34,  998,
           34,  697,    6,    2,    3,    3],
        [   1,    9,   84, 1784,   81,   31, 1968,   29,  187,  654,  205,  962,
           26,  930,   96,   26,  516,  125,  746,   77,  107,   45,  153,   84,
           89,   42,   36,  281,    6,    2],
        [   1, 2791,  120,   28,  238,   39,  243,  426, 2672,  602,  336,   87,
          343,  227,   36,  104,  419,  327,  495,   87, 1115,   47,  847, 2545,
          278,    8,    2,    3,    3,    3],
        [   1,  136,  802,  570, 1333,   74,   85, 3117, 3066,   38,   36,  415,
          110,  559, 1502, 1215, 1797,  125,  842,  237,  125, 4031,   96,    8,
   

In [None]:
X.shape

In [None]:
i = 0

while i<1:
    for batch in test_set:
        #elements = {k: v.to(device) for k, v in elements.items()}
        print(batch)
        i += 1

## Training loop

### Optimizer and loss function

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.BCELoss()

In [None]:
logits = model(X)
logits

In [None]:
y

In [None]:
# Train Loop function

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, elements in enumerate(dataloader):
        X, y = elements['input_ids'].to(device), elements['labels'].to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

## Test Loop
We use this one to make sure the model is learning

In [None]:
# Test loop function

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, elements in dataloader:
            X, y = elements['input_ids'].to(device), elements['labels'].to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_set, model, loss_fn, optimizer)
    test(test_set, model, loss_fn)
print("Done!")

In [None]:
y

In [None]:
logits = model(X)

In [None]:
flatten = nn.Flatten()
flat_logits = flatten(logits)

In [None]:
flat_logits

# Trying to implemet with HuggingFace

In [None]:
from transformers import AutoModelForSequenceClassification, AutoModel

checkpoint = "zhihan1996/DNABERT-2-117M"
#model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, num_labels=2, trust_remote_code=True)
model.to(device)

In [None]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_set)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_set:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)