In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import joblib
from sklearn.preprocessing import OneHotEncoder

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F

In [3]:
path_to_data = '../../external_data/mixmhcpred/TableS2.txt'
# ignore the first row (header)
df = pd.read_csv(path_to_data, sep='\t', skiprows=1)
df['Peptide_Lengths'] = df['Peptide'].apply(len)
df = df[df['Peptide_Lengths'] == 9]

In [4]:
allele_counts = df['Allele'].value_counts()
# Create a mask to filter IDs that appear at least 10 times
mask = df['Allele'].map(allele_counts) >= 10
df = df[mask]

In [5]:
label_encoder = LabelEncoder()
# Fit the encoder on your class labels and transform them into numerical labels
y = label_encoder.fit_transform(df['Allele'].values)

In [6]:

amino_acids = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]


# create a mapping from amino-acids to integers
stoi_peptide = { ch:i for i,ch in enumerate(amino_acids) }
itos_peptide = { i:ch for i,ch in enumerate(amino_acids) }
encode_peptide = lambda s: [stoi_peptide[c] for c in s] # encoder: take a string, output a list of integers
decode_peptide = lambda l: ''.join([itos_peptide[i] for i in l]) # decoder: take a list of integers, output a string
print(encode_peptide("AAAHTHRY"))
print(decode_peptide(encode_peptide("AAAHTHRY")))
X = np.array([encode_peptide(sequence) for sequence in df['Peptide'].values])

[0, 0, 0, 8, 16, 8, 1, 18]
AAAHTHRY


In [15]:
type(X)

numpy.ndarray

In [35]:
#ninemer_encoder = OneHotEncoder()
#X = np.array([list(s) for s in df['Peptide'].values])
#X = ninemer_encoder.fit_transform(X).toarray()

In [16]:
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(X, dtype=torch.long)
print(data.shape, data.dtype)

ydata = torch.tensor(y, dtype = torch.float)

# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
train_y = ydata[:n].long()

val_data = data[n:]
val_y = ydata[n:].long()

torch.Size([235054, 9]) torch.int64


In [56]:
kmer = 9
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1000 # how many independent sequences will we process in parallel?
max_iters = 1500
eval_interval = 100
learning_rate = 1e-2
eval_iters = 200
dropout = 0.5

# of categories
output_dim = df['Allele'].nunique()

In [57]:
print(len(data))

235054


In [58]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ypred = train_y if split == 'train' else val_y
   
    ix = torch.randint(len(data), (batch_size,))
    x = torch.stack([data[i] for i in ix])
    y = torch.stack([ypred[i] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [59]:
len(X[0])

9

In [60]:
n_embd = 10
k = 9
n_hidden = 20

In [61]:
class pMHCModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(20, n_embd)
        self.lm_in = nn.Linear(n_embd * k, n_hidden)
        self.bn = nn.BatchNorm1d(n_hidden)
        self.relu = nn.ReLU()
        self.lm_out = nn.Linear(n_hidden, output_dim)

    def forward(self, idx, targets=None):
        x = self.token_embedding_table(idx)
        x = x.view(batch_size, n_embd * k)
        x = self.lm_in(x)
        x = self.bn(x)
        x = self.relu(x)
        logits = self.lm_out(x) 
        loss = F.cross_entropy(logits, targets)
        return logits, loss


In [62]:
model = pMHCModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e3, 'K parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.001)

4.559 K parameters


In [63]:
for iter in range(max_iters):

        # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(model)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.7968, val loss 4.6963
step 100: train loss 4.0082, val loss 5.5858
step 200: train loss 3.2418, val loss 6.1976
step 300: train loss 2.7248, val loss 6.8075
step 400: train loss 2.4223, val loss 7.4128


KeyboardInterrupt: 