In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import joblib
from sklearn.preprocessing import OneHotEncoder

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F

In [6]:
path_to_data = '../external_data/mixmhcpred/TableS2.txt'
# ignore the first row (header)
df = pd.read_csv(path_to_data, sep='\t', skiprows=1)
df['Peptide_Lengths'] = df['Peptide'].apply(len)
df = df[df['Peptide_Lengths'] == 9]

In [7]:
allele_counts = df['Allele'].value_counts()
# Create a mask to filter IDs that appear at least 10 times
mask = df['Allele'].map(allele_counts) >= 10
df = df[mask]

In [8]:
label_encoder = LabelEncoder()
# Fit the encoder on your class labels and transform them into numerical labels
y = label_encoder.fit_transform(df['Allele'].values)

In [9]:

amino_acids = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]


# create a mapping from amino-acids to integers
#stoi_peptide = { ch:i for i,ch in enumerate(amino_acids) }
#itos_peptide = { i:ch for i,ch in enumerate(amino_acids) }
#encode_peptide = lambda s: [stoi_peptide[c] for c in s] # encoder: take a string, output a list of integers
#decode_peptide = lambda l: ''.join([itos_peptide[i] for i in l]) # decoder: take a list of integers, output a string
#print(encode_peptide("AAAHTHRY"))
#print(decode_peptide(encode_peptide("AAAHTHRY")))
#X = np.array([encode_peptide(sequence) for sequence in df['Peptide'].values])

In [35]:
ninemer_encoder = OneHotEncoder()
X = np.array([list(s) for s in df['Peptide'].values])
X = ninemer_encoder.fit_transform(X).toarray()

In [73]:
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(X, dtype=torch.float)
print(data.shape, data.dtype)

ydata = torch.tensor(y, dtype = torch.float)

# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
train_y = ydata[:n].long()

val_data = data[n:]
val_y = ydata[n:].long()

torch.Size([235054, 180]) torch.float32


In [96]:
kmer = 9
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64 # how many independent sequences will we process in parallel?
max_iters = 1500
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
dropout = 0.5

# of categories
output_dim = df['Allele'].nunique()

In [97]:
print(len(data))

235054


In [111]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ypred = train_y if split == 'train' else val_y
   
    ix = torch.randint(len(data), (batch_size,))
    x = torch.stack([data[i] for i in ix])
    y = torch.stack([ypred[i] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [112]:
len(X[0])

180

In [113]:
n_embd = 20


class pMHCModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        
        # each amino-acid corresponds to a vector in 20 dimension space
        self.lm = nn.Linear(180, n_embd)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(n_embd)  # Add LayerNorm layer
        self.relu = nn.ReLU()
        self.lm_out = nn.Linear(n_embd, output_dim)
    
    def forward(self, x, targets=None):
        x = self.lm(x)
        x = self.layer_norm(x)  # Apply LayerNorm      
        x = self.dropout(x)
        x = self.relu(x)
        #x = self.ln_f(x)
        logits = self.lm_out(x) # (batch_size,num_classes)
        # targets should be (batch_size,)
        loss = F.cross_entropy(logits, targets)
        return logits, loss


In [114]:
model = pMHCModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e3, 'K parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.001)

6.159 K parameters


In [115]:
for iter in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.7753, val loss 4.7769
step 100: train loss 4.6145, val loss 5.1174
step 200: train loss 4.4896, val loss 5.3301
step 300: train loss 4.4126, val loss 5.4786
step 400: train loss 4.3523, val loss 5.6550
step 500: train loss 4.3137, val loss 5.7375
step 600: train loss 4.2763, val loss 5.8264
step 700: train loss 4.2313, val loss 5.9306
step 800: train loss 4.1796, val loss 6.0339
step 900: train loss 4.1589, val loss 6.1091
step 1000: train loss 4.0984, val loss 6.2543
step 1100: train loss 4.0539, val loss 6.3452
step 1200: train loss 4.0224, val loss 6.4650
step 1300: train loss 3.9622, val loss 6.5908
step 1400: train loss 3.9218, val loss 6.6636
step 1499: train loss 3.8881, val loss 6.7888
