In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import joblib

In [4]:
path_to_data = '../external_data/mixmhcpred/TableS2.txt'
# The table is tab-separated. The first line of the file is skipped, so pandas
# takes the column names from the line that follows it.
df = pd.read_csv(
    path_to_data,
    sep='\t',
    skiprows=1,
)

In [5]:
# Number of rows available for each allele.
allele_counts = df['Allele'].value_counts()
# Boolean row mask: True where the row's allele occurs in at least 10 rows.
mask = df['Allele'].map(allele_counts).ge(10)
# Keep only the rows belonging to those sufficiently represented alleles.
df = df.loc[mask]

In [6]:
# Describe each peptide by its first four and last four characters, giving a
# fixed-width string regardless of the peptide's full length (peptides shorter
# than 8 characters will have overlapping halves).
#
# `df` was produced by boolean filtering, so writing a new column into it in
# place can raise pandas' SettingWithCopyWarning. `assign` builds a new frame
# instead, which also makes this cell safe to re-run. The vectorised `.str`
# slices replace the row-by-row `apply(lambda ...)`.
df = df.assign(
    PeptideShortened=df['Peptide'].str[:4] + df['Peptide'].str[-4:]
)

In [7]:
# Targets: map every allele name to an integer class index.
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(df['Allele'].values)

# Features: split each shortened peptide into its individual characters (one
# column per position), then one-hot encode those columns into a dense matrix.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(
    np.array(list(map(list, df['PeptideShortened'].values)))
).toarray()

In [8]:
# Hold out 10% of the samples for evaluation, with a fixed seed so the split
# is reproducible.
#
# The split is stratified on the encoded allele labels: alleles with as few as
# 10 rows are kept by the earlier filter, and a purely random split can leave
# such a rare class badly under-represented in (or missing from) one of the
# partitions, which would then disagree with the number of classes the model
# is configured for.
X_train_encoded, X_test_encoded, y_train_labeled, y_test_labeled = train_test_split(
    X_encoded,
    y_labeled,
    test_size=0.1,
    random_state=1,
    stratify=y_labeled,
)

In [13]:
# One output class per distinct allele remaining in the table.
num_classes = np.unique(df['Allele']).size

# Multi-class XGBoost classifier using the linear booster, scored with
# multi-class log loss.
# NOTE(review): the linear booster's default updater is reportedly not
# deterministic across runs — confirm in the XGBoost docs if exact
# reproducibility of the fitted weights matters.
model_all = xgb.XGBClassifier(
    booster="gblinear",
    objective="multi:softmax",
    num_class=num_classes,
    n_estimators=35,
    learning_rate=0.2,
    eval_metric="mlogloss",
)

# Fit on the training partition. The metric is reported every round for the
# training data first and the held-out data second.
model_all.fit(
    X_train_encoded,
    y_train_labeled,
    eval_set=[
        (X_train_encoded, y_train_labeled),
        (X_test_encoded, y_test_labeled),
    ],
    verbose=True,
)