### ...using LabelEncoder

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder


# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Define columns
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

# Label encode all features (basic encoding for simplicity)
encoders = {}
for col in feature_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Identify categorical columns (LightGBM handles them natively)
categorical_features = ['TRBV', 'TRBJ', 'MHC']

# LightGBM datasets
train_data = lgb.Dataset(train_df[feature_cols], label=train_df[target_col], categorical_feature=categorical_features)
valid_data = lgb.Dataset(valid_df[feature_cols], label=valid_df[target_col], reference=train_data, categorical_feature=categorical_features)

# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    # early_stopping_rounds=20
)
# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df[target_col]
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))

Accuracy: 0.6595288699964577
AUC: 0.4899016846583038
F1 Score: 0.18300820264354625
AP Score: 0.1583093947239425


### ...using Separate Label Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder


# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

seq_cols = ['TRB_CDR3', 'Epitope']
encoders = {}

for col in seq_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Convert categorical columns to category dtype
for col in ['TRBV', 'TRBJ', 'MHC']:
    train_df[col] = train_df[col].astype('category')
    valid_df[col] = valid_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# Now create feature matrices

feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
X_train = train_df[feature_cols]
X_valid = valid_df[feature_cols]
X_test = test_df[feature_cols]
y_train = train_df['Binding']
y_valid = valid_df['Binding']
y_test = test_df['Binding']

for col in ['TRBV', 'TRBJ', 'MHC']:
    for df in [train_df, valid_df, test_df]:
        df[col] = df[col].astype('category')


model = LGBMClassifier(n_estimators=1000, random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)],
    categorical_feature=['TRBV', 'TRBJ', 'MHC']
)

# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df['Binding']
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.391351
Accuracy: 0.8401346085724407
AUC: 0.5
F1 Score: 0.0
AP Score: 0.15986539142755934


### with task-wise evaluation

In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')


# Ensure categorical columns are properly typed
for col in ['TRBV', 'TRBJ', 'MHC']:
    train_df[col] = train_df[col].astype('category')
    valid_df[col] = valid_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))
# Define feature columns and targets
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]

# for col in ['TRBV', 'TRBJ', 'MHC']:
#     for df in [train_df, valid_df, test_df]:
#         df[col] = df[col].astype('category')


# Fit the LightGBM model
model = LGBMClassifier(n_estimators=1000, random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)],
    categorical_feature=['TRBV', 'TRBJ', 'MHC']
)

# Predict probabilities and labels
y_pred_prob = model.predict_proba(X_valid)[:, 1]
y_pred_label = model.predict(X_valid)

# Overall metrics
print("=== Overall Validation Metrics ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_label))
print("Log Loss:", log_loss(y_valid, y_pred_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_pred_prob))



# --- Per Task Metrics ---
print("\n=== Per Task Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_pred_prob
valid_df_copy['pred_label'] = y_pred_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0,1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.391351
=== Overall Validation Metrics ===
Accuracy: 0.8416483463581367
Log Loss: 0.39135055242937095
ROC AUC: 0.7928649290044056

=== Per Task Metrics ===

Task: TPP1
  Accuracy: 0.8300
  Log Loss: 0.3340
  ROC AUC: 0.926526261887623

Task: TPP2
  Accuracy: 0.8538
  Log Loss: 0.4223
  ROC AUC: 0.7561215438942985

Task: TPP3
  Accuracy: 0.8000
  Log Loss: 0.5258
  ROC AUC: 0.

In [3]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Ensure categorical columns are properly typed
for col in ['TRBV', 'TRBJ', 'MHC']:
    for df in [train_df, valid_df, test_df]:
        df[col] = df[col].astype('category')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train LightGBM model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)],
    categorical_feature=['TRBV', 'TRBJ', 'MHC']
)

# === Validation Evaluation ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-task validation metrics ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Evaluation ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-task test metrics ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.391351
=== Overall Validation Metrics ===
Accuracy: 0.8416483463581367
Log Loss: 0.39135055242937095
ROC AUC: 0.7928649290044056
Average Precision: 0.4121502422654325

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8300
  Log Loss: 0.3340
  ROC AUC: 0.926526261887623
  Average Precision: 0.7928157

### using only TRC and EPITOPE (comparison with v1)

In [4]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load data
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)]
)

# === Validation Metrics ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-Task Validation ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Metrics ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-Task Test ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.432583
=== Overall Validation Metrics ===
Accuracy: 0.8416483463581367
Log Loss: 0.43258284878226305
ROC AUC: 0.6544983565520205
Average Precision: 0.22798046896800525

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8300
  Log Loss: 0.4231
  ROC AUC: 0.7624740791222908
  Average Precision: 0.45844087879593465

Task: TPP2
  Accuracy: 0.8538
  Log Loss: 0.4324
 