In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import joblib

In [2]:
path_to_data = '../external_data/mixmhcpred/TableS2.txt'
# ignore the first row (header)
df = pd.read_csv(path_to_data, sep='\t', skiprows=1)
df['Peptide_Lengths'] = df['Peptide'].apply(len)

In [4]:
df['PeptideShortened'] = df['Peptide'].apply(lambda x: x[:4] + x[-4:])

Make sure lengths are equally represented

In [5]:
def limit_rows(group):
    return group.head(10000)

df = df.groupby('Peptide_Lengths').apply(limit_rows).reset_index(drop=True)

In [None]:
allele_counts = df['Allele'].value_counts()
# Create a mask to filter IDs that appear at least 10 times
mask = df['Allele'].map(allele_counts) >= 10
df = df[mask]

In [6]:
encoder = OneHotEncoder()
X_encoded = np.array([list(s) for s in df['PeptideShortened'].values])
X_encoded = encoder.fit_transform(X_encoded).toarray()

label_encoder = LabelEncoder()
# Fit the encoder on your class labels and transform them into numerical labels
y_labeled = label_encoder.fit_transform(df['Allele'].values)

In [83]:
#from sklearn.utils.class_weight import compute_sample_weight
#classes_weights = compute_sample_weight(
#    class_weight='balanced',
#    y=y_labeled
#)

# calc

In [7]:
indices = np.arange(len(X_encoded))
X_train_encoded, X_test_encoded, y_train_labeled, y_test_labeled, indices_train, indices_test, = train_test_split(X_encoded, y_labeled, indices, test_size=0.1, random_state=1)

In [8]:
num_classes = len(np.unique(df['Allele']))
model_all = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    learning_rate=0.2,
    n_estimators=60,
    eval_metric = "mlogloss",
    alpha = 0.001
)

# Train the model while monitoring the evaluation metrics
model_all.fit(
    X_train_encoded, y_train_labeled,
    eval_set=[(X_train_encoded, y_train_labeled), (X_test_encoded, y_test_labeled)],  # Datasets for evaluation
    verbose=True
)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111], got [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 110 111 112]

In [9]:
def save_model_params(feature_encoder, label_encoder, my_model, file_path):
    joblib.dump(my_model, file_path + "_xgboost_model.joblib")
    joblib.dump(feature_encoder, file_path + '_feature_encoder.joblib')
    joblib.dump(label_encoder, file_path + '_label_encoder.joblib')

In [10]:
save_model_params(encoder, label_encoder, model_all,
                  "/Users/halasadi/code/pmhc_methods_tf/internal_data/2023_12_22_all_8mer_xgboost_model")

In [33]:
# Also need to save the test/train indices too?