In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import joblib

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F

In [2]:
path_to_data = '../external_data/mixmhcpred/TableS2.txt'
# ignore the first row (header)
df = pd.read_csv(path_to_data, sep='\t', skiprows=1)
df['Peptide_Lengths'] = df['Peptide'].apply(len)
df = df[df['Peptide_Lengths'] == 9]

In [None]:
allele_counts = df['Allele'].value_counts()
# Create a mask to filter IDs that appear at least 10 times
mask = df['Allele'].map(allele_counts) >= 10
df = df[mask]

In [44]:
label_encoder = LabelEncoder()
# Fit the encoder on your class labels and transform them into numerical labels
y = label_encoder.fit_transform(df['Allele'].values)

In [37]:

# create a mapping from amino-acids to integers
stoi_peptide = { ch:i for i,ch in enumerate(amino_acids) }
itos_peptide = { i:ch for i,ch in enumerate(amino_acids) }
encode_peptide = lambda s: [stoi_peptide[c] for c in s] # encoder: take a string, output a list of integers
decode_peptide = lambda l: ''.join([itos_peptide[i] for i in l]) # decoder: take a list of integers, output a string

print(encode_peptide("AAAHTHRY"))
print(decode_peptide(encode_peptide("AAAHTHRY")))

[0, 0, 0, 8, 16, 8, 1, 18]
AAAHTHRY


In [38]:
X = np.array([encode_peptide(sequence) for sequence in df['Peptide'].values])

In [39]:
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(X, dtype=torch.long)
print(data.shape, data.dtype)

ydata = torch.tensor(y, dtype = torch.float)

# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
train_y = ydata[:n].long()
val_y = ydata[n:].long()

torch.Size([235054, 9]) torch.int64


In [40]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ypred = train_y if split == 'train' else val_y
   
    ix = torch.randint(len(data), (batch_size,))
    x = torch.stack([data[i] for i in ix])
    y = torch.stack([ypred[i] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [157]:
kmer = 9

amino_acids = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]


device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64 # how many independent sequences will we process in parallel?
max_iters = 1500
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 20
dropout = 0.25
vocab_size = len(amino_acids)

# of categories
output_dim = df['Allele'].nunique()

In [158]:


class pMHCModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.lm = nn.Linear(vocab_size * kmer, n_embd)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, output_dim)
    
    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        x = self.token_embedding_table(idx) # (B,T,vocab_size)
        (B, T, C) = x.shape
        x = x.view(B, T * C)
        x = self.lm(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,output_dim)

        loss = F.cross_entropy(logits, targets)
        return logits, loss


In [159]:
model = pMHCModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e3, 'K parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

6.559 K parameters


In [160]:
for iter in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.9527, val loss 4.9497
step 100: train loss 4.3074, val loss 5.7026
step 200: train loss 3.8623, val loss 6.5030
step 300: train loss 3.4362, val loss 6.9058
step 400: train loss 3.0968, val loss 7.4350
step 500: train loss 2.8609, val loss 7.7252
step 600: train loss 2.6479, val loss 7.8706
step 700: train loss 2.5191, val loss 8.3049
step 800: train loss 2.4341, val loss 8.3817
step 900: train loss 2.3564, val loss 8.6962
step 1000: train loss 2.2799, val loss 8.8762
step 1100: train loss 2.2695, val loss 9.2076
step 1200: train loss 2.2235, val loss 9.4407
step 1300: train loss 2.1667, val loss 9.4952
step 1400: train loss 2.1823, val loss 9.8066
step 1499: train loss 2.1790, val loss 9.9722
