### ...using LabelEncoder.  5 Features TRC, Epitope, TRBJ, TRBV, MHC (v2)

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder


# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Define columns
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

# Label encode all features (basic encoding for simplicity)
encoders = {}
for col in feature_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Identify categorical columns (LightGBM handles them natively)
categorical_features = ['TRBV', 'TRBJ', 'MHC']

# LightGBM datasets
train_data = lgb.Dataset(train_df[feature_cols], label=train_df[target_col], categorical_feature=categorical_features)
valid_data = lgb.Dataset(valid_df[feature_cols], label=valid_df[target_col], reference=train_data, categorical_feature=categorical_features)

# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    # early_stopping_rounds=20
)
# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df[target_col]
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))

Accuracy: 0.6595288699964577
AUC: 0.4899016846583038
F1 Score: 0.18300820264354625
AP Score: 0.1583093947239425


### ...using Separate Label Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder


# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

seq_cols = ['TRB_CDR3', 'Epitope']
encoders = {}

for col in seq_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Convert categorical columns to category dtype
for col in ['TRBV', 'TRBJ', 'MHC']:
    train_df[col] = train_df[col].astype('category')
    valid_df[col] = valid_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# Now create feature matrices

feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
X_train = train_df[feature_cols]
X_valid = valid_df[feature_cols]
X_test = test_df[feature_cols]
y_train = train_df['Binding']
y_valid = valid_df['Binding']
y_test = test_df['Binding']

for col in ['TRBV', 'TRBJ', 'MHC']:
    for df in [train_df, valid_df, test_df]:
        df[col] = df[col].astype('category')


model = LGBMClassifier(n_estimators=1000, random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)],
    categorical_feature=['TRBV', 'TRBJ', 'MHC']
)

# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df['Binding']
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.391351
Accuracy: 0.8401346085724407
AUC: 0.5
F1 Score: 0.0
AP Score: 0.15986539142755934


### with task-wise evaluation (v2)

In [3]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Ensure categorical columns are properly typed
for col in ['TRBV', 'TRBJ', 'MHC']:
    for df in [train_df, valid_df, test_df]:
        df[col] = df[col].astype('category')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train LightGBM model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)],
    categorical_feature=['TRBV', 'TRBJ', 'MHC']
)

# === Validation Evaluation ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-task validation metrics ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Evaluation ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-task test metrics ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.391351
=== Overall Validation Metrics ===
Accuracy: 0.8416483463581367
Log Loss: 0.39135055242937095
ROC AUC: 0.7928649290044056
Average Precision: 0.4121502422654325

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8300
  Log Loss: 0.3340
  ROC AUC: 0.926526261887623
  Average Precision: 0.7928157

### using only TCR and EPITOPE (comparison with v1)

In [4]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load data
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)]
)

# === Validation Metrics ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-Task Validation ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Metrics ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-Task Test ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


[LightGBM] [Info] Number of positive: 126463, number of negative: 623204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 749667, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.168692 -> initscore=-1.594924
[LightGBM] [Info] Start training from score -1.594924
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.432583
=== Overall Validation Metrics ===
Accuracy: 0.8416483463581367
Log Loss: 0.43258284878226305
ROC AUC: 0.6544983565520205
Average Precision: 0.22798046896800525

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8300
  Log Loss: 0.4231
  ROC AUC: 0.7624740791222908
  Average Precision: 0.45844087879593465

Task: TPP2
  Accuracy: 0.8538
  Log Loss: 0.4324
 

### NEW DATASETS (11.05.)

### 5 Features (v2)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Define columns
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

# Label encode all features (basic encoding for simplicity)
encoders = {}
for col in feature_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Identify categorical columns (LightGBM handles them natively)
categorical_features = ['TRBV', 'TRBJ', 'MHC']

# LightGBM datasets
train_data = lgb.Dataset(train_df[feature_cols], label=train_df[target_col], categorical_feature=categorical_features)
valid_data = lgb.Dataset(valid_df[feature_cols], label=valid_df[target_col], reference=train_data, categorical_feature=categorical_features)

# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    # early_stopping_rounds=20
)
# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df[target_col]


# === Overall Validation Evaluation ===
print("\n=== Overall Validation Metrics ===")
y_val_prob = model.predict(valid_df[feature_cols])
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print("Log Loss:", log_loss(y_val_true, y_val_prob))
print('Accuracy:', accuracy_score(y_val_true, y_val_pred))
print('AUC:', roc_auc_score(y_val_true, y_val_prob))
print('F1 Score:', f1_score(y_val_true, y_val_pred))
print('AP Score:', average_precision_score(y_val_true, y_val_prob))


# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n=== Per Task Validation Metrics ===")
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name}")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n=== Overall Test Metrics ===")
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))
print("Log Loss:", log_loss(y_true, y_pred))

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n=== Per Task Test Metrics ===")
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name}")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")



=== Overall Validation Metrics ===
Accuracy: 0.7587489902176413
AUC: 0.8721124023971237
F1 Score: 0.5362276127862162
AP Score: 0.6081711524532278
Log Loss: 0.7710147292014076

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.9300
  Log Loss: 0.20282006183958637
  ROC AUC: 0.9628357095681376
  Average Precision: 0.8416865374535609

Task: TPP2
  Accuracy: 0.6206
  Log Loss: 1.3140577712762866
  ROC AUC: 0.909488420236908
  Average Precision: 0.6961157347542468

Task: TPP3
  Accuracy: 0.5230
  Log Loss: 1.0703836113051035
  ROC AUC: 0.5560485936172912
  Average Precision: 0.22375528505626902

Task: TPP4
  Accuracy: 0.5600
  Log Loss: 1.2563158304713276
  ROC AUC: 0.6160714285714286
  Average Precision: 0.20596309324115109

=== Overall Test Metrics ===
Accuracy: 0.6231379677247739
AUC: 0.47886036925488296
F1 Score: 0.19923696481559983
AP Score: 0.1870805137103096
Log Loss: 1.2484113333004747

=== Per Task Test Metrics ===

Task: TPP1
  Accuracy: 0.8310
  Log Loss: 0.660080126

### with only TCR and Epitope (v1)(new datasets)

In [5]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load data
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)]
)

# === Validation Metrics ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-Task Validation ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Metrics ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-Task Test ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.457951
=== Overall Validation Metrics ===
Accuracy: 0.8257041942084191
Log Loss: 0.4579513818899797
ROC AUC: 0.5951871934281516
Average Precision: 0.2215396541033768

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8400
  Log Loss: 0.4160
  ROC AUC: 0.7054980177191659
  Average Precision: 0.3608687118824807

Task: TPP2
  Accuracy: 0.8116
  Log Loss: 0.4986
  ROC AUC: 0.4768861076839876
  Average Precision: 0.20269607444255933

Task: TPP3
  Accuracy: 0.8203
  Log Loss: 0.4771
  ROC AUC: 0.5251143783316623
  Average Precision: 0.19784586855051475

Task: TPP4
  Accuracy: 0.8400
  Log Loss: 0.4836
  ROC AUC: 0.373139880952381
  Average Precision: 0.13272558958390446

=== Overall Test Metrics ===
Accuracy: 0.7981690015960277
Log Loss: 0.5089030133540323
ROC AUC: 0.5416726294922185
Average Precision: 0.21877707342979233

=== Per Task Test Metrics ===



### same as before, but with new datasets

epitope tcr

In [1]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load data
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Encode high-cardinality object columns
for col in ['TRB_CDR3', 'Epitope']:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], valid_df[col], test_df[col]]).astype(str))
    for df in [train_df, valid_df, test_df]:
        df[col] = le.transform(df[col].astype(str))

# Define features and target
feature_cols = ['TRB_CDR3', 'Epitope']
target_col = 'Binding'

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Train model
model = LGBMClassifier(n_estimators=1000, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='binary_logloss',
    callbacks=[early_stopping(20), log_evaluation(50)]
)

# === Validation Metrics ===
print("=== Overall Validation Metrics ===")
y_val_prob = model.predict_proba(X_valid)[:, 1]
y_val_label = model.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_val_label))
print("Log Loss:", log_loss(y_valid, y_val_prob))
print("ROC AUC:", roc_auc_score(y_valid, y_val_prob))
print("Average Precision:", average_precision_score(y_valid, y_val_prob))

# === Per-Task Validation ===
print("\n=== Per Task Validation Metrics ===")
valid_df_copy = valid_df.copy()
valid_df_copy['true'] = y_valid
valid_df_copy['pred_prob'] = y_val_prob
valid_df_copy['pred_label'] = y_val_label

for task_name, group in valid_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")

# === Test Metrics ===
print("\n=== Overall Test Metrics ===")
y_test_prob = model.predict_proba(X_test)[:, 1]
y_test_label = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_label))
print("Log Loss:", log_loss(y_test, y_test_prob))
print("ROC AUC:", roc_auc_score(y_test, y_test_prob))
print("Average Precision:", average_precision_score(y_test, y_test_prob))

# === Per-Task Test ===
print("\n=== Per Task Test Metrics ===")
test_df_copy = test_df.copy()
test_df_copy['true'] = y_test
test_df_copy['pred_prob'] = y_test_prob
test_df_copy['pred_label'] = y_test_label

for task_name, group in test_df_copy.groupby('task'):
    acc = accuracy_score(group['true'], group['pred_label'])
    loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
    try:
        auc = roc_auc_score(group['true'], group['pred_prob'])
        ap = average_precision_score(group['true'], group['pred_prob'])
    except ValueError:
        auc = ap = "Undefined (only one class present)"
    
    print(f"\nTask: {task_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    print(f"  ROC AUC: {auc}")
    print(f"  Average Precision: {ap}")


  train_df = pd.read_csv(train_path, sep='\t')


[LightGBM] [Info] Number of positive: 126286, number of negative: 629472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 465
[LightGBM] [Info] Number of data points in the train set: 755758, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.167098 -> initscore=-1.606332
[LightGBM] [Info] Start training from score -1.606332
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.425872
=== Overall Validation Metrics ===
Accuracy: 0.8317625969508191
Log Loss: 0.4258722290803819
ROC AUC: 0.6806558997752716
Average Precision: 0.27078176695998446

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.8352
  Log Loss: 0.4154
  ROC AUC: 0.7160862301372765
  Average Precision: 0.3038405539509591

Task: TPP2
  Accuracy: 0.8135
  Log Loss: 0.4549
  

more features

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Load the TSV files
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

# Define columns
feature_cols = ['TRB_CDR3', 'Epitope', 'TRBV', 'TRBJ', 'MHC']
target_col = 'Binding'

# Label encode all features (basic encoding for simplicity)
encoders = {}
for col in feature_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col] = le.transform(train_df[col].astype(str))
    valid_df[col] = le.transform(valid_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

# Identify categorical columns (LightGBM handles them natively)
categorical_features = ['TRBV', 'TRBJ', 'MHC']

# LightGBM datasets
train_data = lgb.Dataset(train_df[feature_cols], label=train_df[target_col], categorical_feature=categorical_features)
valid_data = lgb.Dataset(valid_df[feature_cols], label=valid_df[target_col], reference=train_data, categorical_feature=categorical_features)

# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    # early_stopping_rounds=20
)
# Predict probabilities
y_pred = model.predict(test_df[feature_cols])

# Convert to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

y_true = test_df[target_col]


# === Overall Validation Evaluation ===
print("\n=== Overall Validation Metrics ===")
y_val_prob = model.predict(valid_df[feature_cols])
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print("Log Loss:", log_loss(y_val_true, y_val_prob))
print('Accuracy:', accuracy_score(y_val_true, y_val_pred))
print('AUC:', roc_auc_score(y_val_true, y_val_prob))
print('F1 Score:', f1_score(y_val_true, y_val_pred))
print('AP Score:', average_precision_score(y_val_true, y_val_prob))


# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n=== Per Task Validation Metrics ===")
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name}")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n=== Overall Test Metrics ===")
print('Accuracy:', accuracy_score(y_true, y_pred_binary))
print('AUC:', roc_auc_score(y_true, y_pred))
print('F1 Score:', f1_score(y_true, y_pred_binary))
print('AP Score:', average_precision_score(y_true, y_pred))
print("Log Loss:", log_loss(y_true, y_pred))

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n=== Per Task Test Metrics ===")
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name}")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")


  train_df = pd.read_csv(train_path, sep='\t')



=== Overall Validation Metrics ===
Log Loss: 0.26782921524436815
Accuracy: 0.8965976252595709
AUC: 0.9294099109552929
F1 Score: 0.686121686660441
AP Score: 0.772210119750735

=== Per Task Validation Metrics ===

Task: TPP1
  Accuracy: 0.9210
  Log Loss: 0.21222189979624845
  ROC AUC: 0.9556435909630675
  Average Precision: 0.8349085443668809

Task: TPP2
  Accuracy: 0.8014
  Log Loss: 0.4490073083747943
  ROC AUC: 0.8497484874365699
  Average Precision: 0.5056873602318208

Task: TPP3
  Accuracy: 0.7635
  Log Loss: 0.6167057372193301
  ROC AUC: 0.5502887897635488
  Average Precision: 0.22116191363892912

Task: TPP4
  Accuracy: 0.7530
  Log Loss: 0.6357053078697638
  ROC AUC: 0.590989817665167
  Average Precision: 0.19836087089244198

=== Overall Test Metrics ===
Accuracy: 0.7975649410634446
AUC: 0.5194982716328727
F1 Score: 0.10166434369107158
AP Score: 0.17749155485417167
Log Loss: 0.8361440783918489

=== Per Task Test Metrics ===

Task: TPP1
  Accuracy: 0.8700
  Log Loss: 0.6770567999

## Embeddings Reduction

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.random_projection import GaussianRandomProjection
import time

def load_npz_embeddings(npz_path):
    """Load embeddings from .npz file"""
    print(f"Loading embeddings from {npz_path}...")
    data = np.load(npz_path)
    embeddings = {}
    for key in data.files:
        embeddings[key] = data[key]
    print(f"Loaded {len(embeddings)} sequences")
    return embeddings

def reduce_embeddings_mean_512(embeddings_dict):
    """
    Method 1: Mean pooling to get 1024 dims, then reduce to 512 using PCA
    """
    print("Applying mean pooling...")
    # First do mean pooling to get 1024 dimensions
    mean_embeddings = []
    seq_ids = []
    
    for seq_id, embedding in embeddings_dict.items():
        mean_embeddings.append(np.mean(embedding, axis=0))  # (1024,)
        seq_ids.append(seq_id)
    
    mean_embeddings = np.array(mean_embeddings)  # (n_sequences, 1024)
    print(f"Mean pooled embeddings shape: {mean_embeddings.shape}")
    
    # Use PCA to reduce from 1024 to 512
    print("Applying PCA to reduce to 512 dimensions...")
    pca = PCA(n_components=512, random_state=42)
    reduced_embeddings = pca.fit_transform(mean_embeddings)
    
    print(f"Explained variance ratio (first 10 components): {pca.explained_variance_ratio_[:10]}")
    print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.4f}")
    
    # Convert back to dictionary
    result_embeddings = {}
    for i, seq_id in enumerate(seq_ids):
        result_embeddings[seq_id] = reduced_embeddings[i]
    
    return result_embeddings, pca

def reduce_embeddings_random_projection_512(embeddings_dict):
    """
    Method 2: Mean pooling + Random Projection (faster than PCA)
    Good alternative when you want speed over optimal dimensionality reduction
    """
    print("Applying mean pooling...")
    # First do mean pooling to get 1024 dimensions
    mean_embeddings = []
    seq_ids = []
    
    for seq_id, embedding in embeddings_dict.items():
        mean_embeddings.append(np.mean(embedding, axis=0))
        seq_ids.append(seq_id)
    
    mean_embeddings = np.array(mean_embeddings)
    print(f"Mean pooled embeddings shape: {mean_embeddings.shape}")
    
    # Use Random Projection to reduce from 1024 to 512 (much faster than PCA)
    print("Applying Random Projection to reduce to 512 dimensions...")
    rp = GaussianRandomProjection(n_components=512, random_state=42)
    reduced_embeddings = rp.fit_transform(mean_embeddings)
    
    # Convert back to dictionary
    result_embeddings = {}
    for i, seq_id in enumerate(seq_ids):
        result_embeddings[seq_id] = reduced_embeddings[i]
    
    return result_embeddings, rp

def reduce_embeddings_truncate_512(embeddings_dict):
    """
    Method 3: Mean pooling + simple truncation (fastest)
    Just take the first 512 dimensions after mean pooling
    """
    print("Applying mean pooling and truncation to 512 dimensions...")
    
    result_embeddings = {}
    for seq_id, embedding in embeddings_dict.items():
        mean_emb = np.mean(embedding, axis=0)  # (1024,)
        truncated_emb = mean_emb[:512]  # Take first 512 dimensions
        result_embeddings[seq_id] = truncated_emb
    
    return result_embeddings

def reduce_embeddings_selective_pooling_512(embeddings_dict):
    """
    Method 4: Selective pooling - combine different pooling methods and reduce
    More sophisticated approach that might preserve more information
    """
    print("Applying selective pooling...")
    
    combined_embeddings = []
    seq_ids = []
    
    for seq_id, embedding in embeddings_dict.items():
        # Get different pooling representations
        mean_emb = np.mean(embedding, axis=0)      # (1024,)
        max_emb = np.max(embedding, axis=0)        # (1024,)
        
        # Take first 256 dims from each pooling method
        combined = np.concatenate([mean_emb[:256], max_emb[:256]])  # (512,)
        
        combined_embeddings.append(combined)
        seq_ids.append(seq_id)
    
    # Convert to dictionary
    result_embeddings = {}
    for i, seq_id in enumerate(seq_ids):
        result_embeddings[seq_id] = combined_embeddings[i]
    
    return result_embeddings

def benchmark_methods(embeddings_dict, sample_size=100):
    """
    Benchmark different reduction methods on speed and memory
    """
    print(f"\n=== BENCHMARKING METHODS (sample size: {sample_size}) ===")
    
    # Use a subset for benchmarking
    sample_keys = list(embeddings_dict.keys())[:sample_size]
    sample_embeddings = {k: embeddings_dict[k] for k in sample_keys}
    
    methods = [
        ("PCA", reduce_embeddings_mean_512),
        ("Random Projection", reduce_embeddings_random_projection_512),
        ("Truncation", reduce_embeddings_truncate_512),
        ("Selective Pooling", reduce_embeddings_selective_pooling_512)
    ]
    
    results = {}
    
    for method_name, method_func in methods:
        print(f"\nTesting {method_name}...")
        start_time = time.time()
        
        try:
            if method_name in ["PCA", "Random Projection"]:
                reduced, model = method_func(sample_embeddings)
            else:
                reduced = method_func(sample_embeddings)
            
            end_time = time.time()
            
            # Check output
            sample_key = list(reduced.keys())[0]
            output_shape = reduced[sample_key].shape
            
            results[method_name] = {
                'time': end_time - start_time,
                'output_shape': output_shape,
                'success': True
            }
            
            print(f"  Time: {end_time - start_time:.2f}s")
            print(f"  Output shape: {output_shape}")
            
        except Exception as e:
            results[method_name] = {
                'time': float('inf'),
                'output_shape': None,
                'success': False,
                'error': str(e)
            }
            print(f"  Error: {e}")
    
    # Print summary
    print(f"\n=== BENCHMARK SUMMARY ===")
    successful_methods = {k: v for k, v in results.items() if v['success']}
    if successful_methods:
        fastest = min(successful_methods.items(), key=lambda x: x[1]['time'])
        print(f"Fastest method: {fastest[0]} ({fastest[1]['time']:.2f}s)")
    
    return results

def process_embeddings_to_512(npz_path, output_path, method='pca', benchmark=False):
    """
    Complete pipeline to reduce embeddings to 512 dimensions
    
    Args:
        npz_path: Path to input .npz file
        output_path: Path to save reduced embeddings
        method: Reduction method ('pca', 'random', 'truncate', 'selective')
        benchmark: Whether to run benchmarking first
    """
    print(f"Processing embeddings from {npz_path}")
    embeddings = load_npz_embeddings(npz_path)
    
    if benchmark:
        benchmark_results = benchmark_methods(embeddings, sample_size=min(100, len(embeddings)))
    
    print(f"\nApplying {method} reduction to all {len(embeddings)} sequences...")
    start_time = time.time()
    
    if method == 'pca':
        reduced, model = reduce_embeddings_mean_512(embeddings)
        # Save the PCA model
        model_path = output_path.replace('.pkl', '_pca_model.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Saved PCA model to {model_path}")
        
    elif method == 'random':
        reduced, model = reduce_embeddings_random_projection_512(embeddings)
        # Save the random projection model
        model_path = output_path.replace('.pkl', '_rp_model.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Saved Random Projection model to {model_path}")
        
    elif method == 'truncate':
        reduced = reduce_embeddings_truncate_512(embeddings)
        
    elif method == 'selective':
        reduced = reduce_embeddings_selective_pooling_512(embeddings)
        
    else:
        raise ValueError(f"Unknown method: {method}")
    
    end_time = time.time()
    print(f"Reduction completed in {end_time - start_time:.2f} seconds")
    
    # Save reduced embeddings
    with open(output_path, 'wb') as f:
        pickle.dump(reduced, f)
    print(f"Saved reduced embeddings to {output_path}")
    
    # Print statistics
    sample_key = list(reduced.keys())[0]
    sample_embedding = reduced[sample_key]
    
    print(f"\n=== REDUCTION SUMMARY ===")
    print(f"Original dimension: 1024 (after mean pooling)")
    print(f"Reduced dimension: {len(sample_embedding)}")
    print(f"Compression ratio: {1024/len(sample_embedding):.1f}x")
    print(f"Memory reduction: {(1024-len(sample_embedding))/1024*100:.1f}%")
    print(f"Total sequences processed: {len(reduced)}")
    
    print(f"\nSample statistics for {sample_key}:")
    print(f"  Mean: {np.mean(sample_embedding):.4f}")
    print(f"  Std: {np.std(sample_embedding):.4f}")
    print(f"  Min: {np.min(sample_embedding):.4f}")
    print(f"  Max: {np.max(sample_embedding):.4f}")
    
    return reduced

# Example usage and recommendations
if __name__ == "__main__":
    
    print("=== TCR EMBEDDING REDUCTION TO 512D ===\n")
    
    # Method recommendations based on your priorities:
    print("Method recommendations:")
    print("- PCA: Best information preservation, slower")
    print("- Random Projection: Good balance of speed and quality")
    print("- Truncation: Fastest, may lose some information")
    print("- Selective: Good compromise, combines mean+max pooling")
    
    # Process TCR embeddings
    tcr_reduced = process_embeddings_to_512(
        npz_path="../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz",
        output_path="../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl",
        method='selective',  # Change to 'random', 'truncate', or 'selective' as needed
        benchmark=False  # Set to False to skip benchmarking
    )
    
    # Process Epitope embeddings (keep at 512 if you want consistency)
    print("\n" + "="*60)
    print("Processing Epitope embeddings...")
    
    epitope_reduced = process_embeddings_to_512(
        npz_path="../../../../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz",
        output_path="../../../../../data/embeddings/beta/allele/Epitope_reduced_512_pca.pkl",
        method='pca',
        benchmark=False  # Skip benchmarking for epitopes
    )
    
    print("\n=== PROCESSING COMPLETE ===")
    print("You can now use these 512-dimensional embeddings in your LightGBM model!")
    print("Expected speedup: ~2x faster training")
    print("Expected memory usage: ~50% less")

=== TCR EMBEDDING REDUCTION TO 512D ===

Method recommendations:
- PCA: Best information preservation, slower
- Random Projection: Good balance of speed and quality
- Truncation: Fastest, may lose some information
- Selective: Good compromise, combines mean+max pooling
Processing embeddings from ../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz
Loading embeddings from ../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz...
Loaded 211294 sequences

Applying selective reduction to all 211294 sequences...
Applying selective pooling...
Reduction completed in 5.83 seconds
Saved reduced embeddings to ../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl

=== REDUCTION SUMMARY ===
Original dimension: 1024 (after mean pooling)
Reduced dimension: 512
Compression ratio: 2.0x
Memory reduction: 50.0%
Total sequences processed: 211294

Sample statistics for CASSWRDGATGELFF:
  Mean: 0.1225
  Std: 0.1699
  Min: -0.2247
  Max: 0.6312

Processing Epitope embe

### Train and Test LightGBM
v2

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your actual paths
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_pca.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='zero'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean', 'drop')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
            else:  # 'drop' - will be handled later
                embedding_features.append(np.zeros(embedding_dim))  # placeholder
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        if len(missing_sequences) <= 5:  # Show a few examples
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Encode categorical features that don't have embeddings
print("\nEncoding categorical features...")
categorical_cols = ['TRBV', 'TRBJ', 'MHC']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col + '_encoded'] = le.transform(train_df[col].astype(str))
    valid_df[col + '_encoded'] = le.transform(valid_df[col].astype(str))
    test_df[col + '_encoded'] = le.transform(test_df[col].astype(str))
    encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Combine all features
print("\nCombining features...")
encoded_categorical_cols = [col + '_encoded' for col in categorical_cols]

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb, 
    train_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb, 
    valid_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb, 
    test_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features") 
print(f"  - Categorical features: {len(encoded_categorical_cols)} features")

target_col = 'Binding'

# Check for any NaN values
if train_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in training features")
    train_features = train_features.fillna(0)
if valid_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in validation features")
    valid_features = valid_features.fillna(0)
if test_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in test features")
    test_features = test_features.fillna(0)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
train_data = lgb.Dataset(
    train_features, 
    label=train_df[target_col], 
    categorical_feature=encoded_categorical_cols
)
valid_data = lgb.Dataset(
    valid_features, 
    label=valid_df[target_col], 
    reference=train_data, 
    categorical_feature=encoded_categorical_cols
)

# LightGBM parameters - optimized for high-dimensional embedding features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for stability with embeddings
    'feature_fraction': 0.8,  # Feature subsampling to prevent overfitting
    'bagging_fraction': 0.8,  # Data subsampling
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*50)
print("VALIDATION METRICS")
print("="*50)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*30)
    print("PER-TASK VALIDATION METRICS")
    print("="*30)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*50)
print("TEST METRICS")
print("="*50)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*30)
    print("PER-TASK TEST METRICS")
    print("="*30)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
categorical_importance = feature_imp_df[feature_imp_df['feature'].str.endswith('_encoded')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance + categorical_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")
print(f"Categorical features: {categorical_importance:.2f} ({categorical_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# === Model Performance Summary ===
print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"Embedding contribution: {(tcr_emb_importance + epitope_emb_importance)/total_importance*100:.1f}%")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_model.txt'")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_pca.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Encoding categorical features...
Encoded TRBV: 166 unique values
Encoded TRBJ: 31 unique values
Encoded MHC: 99 unique values

Combining features...
Final feature dimensions: 1027 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - Categorical features: 3 features

Creating LightGBM datasets...

Training LightGBM model...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbo

### Train and Test using only TCR and Epitope (v1)

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your 512-dimensional embeddings
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_pca.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='mean'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        # Show a few examples of missing sequences
        if len(missing_sequences) <= 5:
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
        elif len(missing_sequences) > 5:
            print(f"  First few missing: {[seq for _, seq in missing_sequences[:3]]}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Combine ONLY TCR and Epitope features (no categorical features)
print("\nCombining TCR and Epitope features only...")

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features")
print(f"  - No categorical features used")

target_col = 'Binding'

# Check for any NaN values
for name, features in [("train", train_features), ("valid", valid_features), ("test", test_features)]:
    nan_count = features.isnull().sum().sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} NaN values in {name} features - filling with 0")
        features.fillna(0, inplace=True)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
# Note: No categorical features since we're only using embeddings
train_data = lgb.Dataset(train_features, label=train_df[target_col])
valid_data = lgb.Dataset(valid_features, label=valid_df[target_col], reference=train_data)

# LightGBM parameters - optimized for embedding-only features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Feature subsampling
    'bagging_fraction': 0.8,  # Data subsampling  
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model (TCR + Epitope embeddings only)...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*60)
print("VALIDATION METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*40)
    print("PER-TASK VALIDATION METRICS")
    print("="*40)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })
            
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*60)
print("TEST METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*40)
    print("PER-TASK TEST METRICS")
    print("="*40)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })
            
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS (Embeddings Only)")
print("="*60)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# Check if embeddings are well-distributed in importance
tcr_top_5_importance = tcr_features['importance'].sum()
epitope_top_5_importance = epitope_features['importance'].sum()

print(f"\nImportance concentration:")
print(f"Top 5 TCR dims contribute: {tcr_top_5_importance/tcr_emb_importance*100:.1f}% of TCR importance")
print(f"Top 5 Epitope dims contribute: {epitope_top_5_importance/epitope_emb_importance*100:.1f}% of Epitope importance")

# === Model Performance Summary ===
print("\n" + "="*60)
print("PERFORMANCE SUMMARY (Embeddings Only)")
print("="*60)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"TCR embedding contribution: {tcr_emb_importance/total_importance*100:.1f}%")
print(f"Epitope embedding contribution: {epitope_emb_importance/total_importance*100:.1f}%")

# Performance comparison hint
print(f"\nModel uses ONLY sequence embeddings (no MHC, TRBV, TRBJ)")
print(f"This shows the predictive power of TCR-Epitope interaction alone")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_only_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_only_model.txt'")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_pca.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_pca.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Combining TCR and Epitope features only...
Final feature dimensions: 1024 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - No categorical features used

Creating LightGBM datasets...

Training LightGBM model (TCR + Epitope embeddings only)...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbosity': -1, 'seed': 42, 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fract

I just noticed the embeddings reduction was made differently for tcr than epitope.
That's not necessarely a mistake, but for the record, we generate both reductions the same way and train and test the models LightGBM-v1 and LightGBM-v2 again. We keep the results above for comparisson. 

## embeddings reduction

In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.random_projection import GaussianRandomProjection
import time

def load_npz_embeddings(npz_path):
    """Load embeddings from .npz file"""
    print(f"Loading embeddings from {npz_path}...")
    data = np.load(npz_path)
    embeddings = {}
    for key in data.files:
        embeddings[key] = data[key]
    print(f"Loaded {len(embeddings)} sequences")
    return embeddings

def reduce_embeddings_selective_pooling_512(embeddings_dict):
    """
    Method 4: Selective pooling - combine different pooling methods and reduce
    More sophisticated approach that might preserve more information
    """
    print("Applying selective pooling...")
    
    combined_embeddings = []
    seq_ids = []
    
    for seq_id, embedding in embeddings_dict.items():
        # Get different pooling representations
        mean_emb = np.mean(embedding, axis=0)      # (1024,)
        max_emb = np.max(embedding, axis=0)        # (1024,)
        
        # Take first 256 dims from each pooling method
        combined = np.concatenate([mean_emb[:256], max_emb[:256]])  # (512,)
        
        combined_embeddings.append(combined)
        seq_ids.append(seq_id)
    
    # Convert to dictionary
    result_embeddings = {}
    for i, seq_id in enumerate(seq_ids):
        result_embeddings[seq_id] = combined_embeddings[i]
    
    return result_embeddings


def process_embeddings_to_512(npz_path, output_path, method='selective', benchmark=False):
    """
    Complete pipeline to reduce embeddings to 512 dimensions
    
    Args:
        npz_path: Path to input .npz file
        output_path: Path to save reduced embeddings
        method: Reduction method ('pca', 'random', 'truncate', 'selective')
        benchmark: Whether to run benchmarking first
    """
    print(f"Processing embeddings from {npz_path}")
    embeddings = load_npz_embeddings(npz_path)
    
    if benchmark:
        benchmark_results = benchmark_methods(embeddings, sample_size=min(100, len(embeddings)))
    
    print(f"\nApplying {method} reduction to all {len(embeddings)} sequences...")
    start_time = time.time()
    
    if method == 'pca':
        reduced, model = reduce_embeddings_mean_512(embeddings)
        # Save the PCA model
        model_path = output_path.replace('.pkl', '_pca_model.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Saved PCA model to {model_path}")
        
    elif method == 'random':
        reduced, model = reduce_embeddings_random_projection_512(embeddings)
        # Save the random projection model
        model_path = output_path.replace('.pkl', '_rp_model.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Saved Random Projection model to {model_path}")
        
    elif method == 'truncate':
        reduced = reduce_embeddings_truncate_512(embeddings)
        
    elif method == 'selective':
        reduced = reduce_embeddings_selective_pooling_512(embeddings)
        
    else:
        raise ValueError(f"Unknown method: {method}")
    
    end_time = time.time()
    print(f"Reduction completed in {end_time - start_time:.2f} seconds")
    
    # Save reduced embeddings
    with open(output_path, 'wb') as f:
        pickle.dump(reduced, f)
    print(f"Saved reduced embeddings to {output_path}")
    
    # Print statistics
    sample_key = list(reduced.keys())[0]
    sample_embedding = reduced[sample_key]
    
    print(f"\n=== REDUCTION SUMMARY ===")
    print(f"Original dimension: 1024 (after mean pooling)")
    print(f"Reduced dimension: {len(sample_embedding)}")
    print(f"Compression ratio: {1024/len(sample_embedding):.1f}x")
    print(f"Memory reduction: {(1024-len(sample_embedding))/1024*100:.1f}%")
    print(f"Total sequences processed: {len(reduced)}")
    
    print(f"\nSample statistics for {sample_key}:")
    print(f"  Mean: {np.mean(sample_embedding):.4f}")
    print(f"  Std: {np.std(sample_embedding):.4f}")
    print(f"  Min: {np.min(sample_embedding):.4f}")
    print(f"  Max: {np.max(sample_embedding):.4f}")
    
    return reduced

# Example usage and recommendations
if __name__ == "__main__":
    
    print("=== TCR EMBEDDING REDUCTION TO 512D ===\n")
    
    print("- Selective: Good compromise, combines mean+max pooling")
    
    # Process TCR embeddings
    tcr_reduced = process_embeddings_to_512(
        npz_path="../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz",
        output_path="../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl",
        method='selective',  # Change to 'random', 'truncate', or 'selective' as needed
        benchmark=False  # Set to False to skip benchmarking
    )
    
    # Process Epitope embeddings (keep at 512 if you want consistency)
    print("\n" + "="*60)
    print("Processing Epitope embeddings...")
    
    epitope_reduced = process_embeddings_to_512(
        npz_path="../../../../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz",
        output_path="../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl",
        method='selective',
        benchmark=False  # Skip benchmarking for epitopes
    )
    
    print("\n=== PROCESSING COMPLETE ===")
    print("You can now use these 512-dimensional embeddings in your LightGBM model!")
    print("Expected speedup: ~2x faster training")
    print("Expected memory usage: ~50% less")

=== TCR EMBEDDING REDUCTION TO 512D ===

- Selective: Good compromise, combines mean+max pooling
Processing embeddings from ../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz
Loading embeddings from ../../../../../data/embeddings/beta/allele/TRB_beta_embeddings.npz...
Loaded 211294 sequences

Applying selective reduction to all 211294 sequences...
Applying selective pooling...
Reduction completed in 11.60 seconds
Saved reduced embeddings to ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl

=== REDUCTION SUMMARY ===
Original dimension: 1024 (after mean pooling)
Reduced dimension: 512
Compression ratio: 2.0x
Memory reduction: 50.0%
Total sequences processed: 211294

Sample statistics for CASSWRDGATGELFF:
  Mean: 0.1225
  Std: 0.1699
  Min: -0.2247
  Max: 0.6312

Processing Epitope embeddings...
Processing embeddings from ../../../../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz
Loading embeddings from ../../../../../data/embeddings/beta/a

## v1

In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your 512-dimensional embeddings
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='mean'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        # Show a few examples of missing sequences
        if len(missing_sequences) <= 5:
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
        elif len(missing_sequences) > 5:
            print(f"  First few missing: {[seq for _, seq in missing_sequences[:3]]}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Combine ONLY TCR and Epitope features (no categorical features)
print("\nCombining TCR and Epitope features only...")

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features")
print(f"  - No categorical features used")

target_col = 'Binding'

# Check for any NaN values
for name, features in [("train", train_features), ("valid", valid_features), ("test", test_features)]:
    nan_count = features.isnull().sum().sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} NaN values in {name} features - filling with 0")
        features.fillna(0, inplace=True)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
# Note: No categorical features since we're only using embeddings
train_data = lgb.Dataset(train_features, label=train_df[target_col])
valid_data = lgb.Dataset(valid_features, label=valid_df[target_col], reference=train_data)

# LightGBM parameters - optimized for embedding-only features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Feature subsampling
    'bagging_fraction': 0.8,  # Data subsampling  
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model (TCR + Epitope embeddings only)...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*60)
print("VALIDATION METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*40)
    print("PER-TASK VALIDATION METRICS")
    print("="*40)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })
            
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*60)
print("TEST METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*40)
    print("PER-TASK TEST METRICS")
    print("="*40)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })
            
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS (Embeddings Only)")
print("="*60)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# Check if embeddings are well-distributed in importance
tcr_top_5_importance = tcr_features['importance'].sum()
epitope_top_5_importance = epitope_features['importance'].sum()

print(f"\nImportance concentration:")
print(f"Top 5 TCR dims contribute: {tcr_top_5_importance/tcr_emb_importance*100:.1f}% of TCR importance")
print(f"Top 5 Epitope dims contribute: {epitope_top_5_importance/epitope_emb_importance*100:.1f}% of Epitope importance")

# === Model Performance Summary ===
print("\n" + "="*60)
print("PERFORMANCE SUMMARY (Embeddings Only)")
print("="*60)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"TCR embedding contribution: {tcr_emb_importance/total_importance*100:.1f}%")
print(f"Epitope embedding contribution: {epitope_emb_importance/total_importance*100:.1f}%")

# Performance comparison hint
print(f"\nModel uses ONLY sequence embeddings (no MHC, TRBV, TRBJ)")
print(f"This shows the predictive power of TCR-Epitope interaction alone")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_only_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_only_model.txt'")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Combining TCR and Epitope features only...
Final feature dimensions: 1024 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - No categorical features used

Creating LightGBM datasets...

Training LightGBM model (TCR + Epitope embeddings only)...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbosity': -1, 'seed': 42, 'num_leaves': 31, 'learning_rate': 0.05, 'feature

## v2

In [5]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your actual paths
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='zero'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean', 'drop')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
            else:  # 'drop' - will be handled later
                embedding_features.append(np.zeros(embedding_dim))  # placeholder
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        if len(missing_sequences) <= 5:  # Show a few examples
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Encode categorical features that don't have embeddings
print("\nEncoding categorical features...")
categorical_cols = ['TRBV', 'TRBJ', 'MHC']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col + '_encoded'] = le.transform(train_df[col].astype(str))
    valid_df[col + '_encoded'] = le.transform(valid_df[col].astype(str))
    test_df[col + '_encoded'] = le.transform(test_df[col].astype(str))
    encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Combine all features
print("\nCombining features...")
encoded_categorical_cols = [col + '_encoded' for col in categorical_cols]

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb, 
    train_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb, 
    valid_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb, 
    test_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features") 
print(f"  - Categorical features: {len(encoded_categorical_cols)} features")

target_col = 'Binding'

# Check for any NaN values
if train_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in training features")
    train_features = train_features.fillna(0)
if valid_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in validation features")
    valid_features = valid_features.fillna(0)
if test_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in test features")
    test_features = test_features.fillna(0)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
train_data = lgb.Dataset(
    train_features, 
    label=train_df[target_col], 
    categorical_feature=encoded_categorical_cols
)
valid_data = lgb.Dataset(
    valid_features, 
    label=valid_df[target_col], 
    reference=train_data, 
    categorical_feature=encoded_categorical_cols
)

# LightGBM parameters - optimized for high-dimensional embedding features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for stability with embeddings
    'feature_fraction': 0.8,  # Feature subsampling to prevent overfitting
    'bagging_fraction': 0.8,  # Data subsampling
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*50)
print("VALIDATION METRICS")
print("="*50)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*30)
    print("PER-TASK VALIDATION METRICS")
    print("="*30)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*50)
print("TEST METRICS")
print("="*50)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*30)
    print("PER-TASK TEST METRICS")
    print("="*30)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
        except ValueError:
            loss = auc = ap = "Undefined (only one class present)"

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss}")
        print(f"  ROC AUC: {auc}")
        print(f"  Average Precision: {ap}")
else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
categorical_importance = feature_imp_df[feature_imp_df['feature'].str.endswith('_encoded')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance + categorical_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")
print(f"Categorical features: {categorical_importance:.2f} ({categorical_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# === Model Performance Summary ===
print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"Embedding contribution: {(tcr_emb_importance + epitope_emb_importance)/total_importance*100:.1f}%")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_model.txt'")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Encoding categorical features...
Encoded TRBV: 166 unique values
Encoded TRBJ: 31 unique values
Encoded MHC: 99 unique values

Combining features...
Final feature dimensions: 1027 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - Categorical features: 3 features

Creating LightGBM datasets...

Training LightGBM model...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 

### Again v1 - now with macro-f1 and rounded metrics

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.metrics import log_loss, f1_score

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your 512-dimensional embeddings
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='mean'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        # Show a few examples of missing sequences
        if len(missing_sequences) <= 5:
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
        elif len(missing_sequences) > 5:
            print(f"  First few missing: {[seq for _, seq in missing_sequences[:3]]}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Combine ONLY TCR and Epitope features (no categorical features)
print("\nCombining TCR and Epitope features only...")

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features")
print(f"  - No categorical features used")

target_col = 'Binding'

# Check for any NaN values
for name, features in [("train", train_features), ("valid", valid_features), ("test", test_features)]:
    nan_count = features.isnull().sum().sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} NaN values in {name} features - filling with 0")
        features.fillna(0, inplace=True)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
# Note: No categorical features since we're only using embeddings
train_data = lgb.Dataset(train_features, label=train_df[target_col])
valid_data = lgb.Dataset(valid_features, label=valid_df[target_col], reference=train_data)

# LightGBM parameters - optimized for embedding-only features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Feature subsampling
    'bagging_fraction': 0.8,  # Data subsampling  
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model (TCR + Epitope embeddings only)...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*60)
print("VALIDATION METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'Macro-F1 Score: {f1_score(y_val_true, y_val_pred, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*40)
    print("PER-TASK VALIDATION METRICS")
    print("="*40)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = f1 = macro_f1 = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        avg_f1 = np.mean([r['f1'] for r in valid_tasks])
        avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")
        print(f"  Average F1: {avg_f1:.4f}")
        print(f"  Average Macro-F1: {avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*60)
print("TEST METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'Macro-F1 Score: {f1_score(y_true, y_pred_binary, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*40)
    print("PER-TASK TEST METRICS")
    print("="*40)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = f1 = macro_f1 = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        test_avg_f1 = np.mean([r['f1'] for r in valid_test_tasks])
        test_avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")
        print(f"  Average F1: {test_avg_f1:.4f}")
        print(f"  Average Macro-F1: {test_avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS (Embeddings Only)")
print("="*60)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# Check if embeddings are well-distributed in importance
tcr_top_5_importance = tcr_features['importance'].sum()
epitope_top_5_importance = epitope_features['importance'].sum()

print(f"\nImportance concentration:")
print(f"Top 5 TCR dims contribute: {tcr_top_5_importance/tcr_emb_importance*100:.1f}% of TCR importance")
print(f"Top 5 Epitope dims contribute: {epitope_top_5_importance/epitope_emb_importance*100:.1f}% of Epitope importance")

# === Model Performance Summary ===
print("\n" + "="*60)
print("PERFORMANCE SUMMARY (Embeddings Only)")
print("="*60)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"TCR embedding contribution: {tcr_emb_importance/total_importance*100:.1f}%")
print(f"Epitope embedding contribution: {epitope_emb_importance/total_importance*100:.1f}%")

# Performance comparison hint
print(f"\nModel uses ONLY sequence embeddings (no MHC, TRBV, TRBJ)")
print(f"This shows the predictive power of TCR-Epitope interaction alone")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_only_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_only_model.txt'")

# Add this diagnostic code after making predictions to understand the F1=0 issue

print("\n" + "="*60)
print("DIAGNOSTIC ANALYSIS FOR F1 = 0")
print("="*60)

# 1. Check class distribution
print("1. CLASS DISTRIBUTION:")
print(f"Training set:")
train_class_dist = train_df[target_col].value_counts(normalize=True)
print(train_class_dist)
print(f"Validation set:")
val_class_dist = valid_df[target_col].value_counts(normalize=True)
print(val_class_dist)
print(f"Test set:")
test_class_dist = test_df[target_col].value_counts(normalize=True)
print(test_class_dist)

# 2. Check prediction probabilities distribution
print(f"\n2. PREDICTION PROBABILITY DISTRIBUTION:")
print(f"Validation predictions - Min: {y_val_prob.min():.4f}, Max: {y_val_prob.max():.4f}, Mean: {y_val_prob.mean():.4f}")
print(f"Test predictions - Min: {y_pred.min():.4f}, Max: {y_pred.max():.4f}, Mean: {y_pred.mean():.4f}")

# 3. Check how many predictions are above different thresholds
print(f"\n3. PREDICTIONS ABOVE DIFFERENT THRESHOLDS:")
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for thresh in thresholds:
    val_above = (y_val_prob > thresh).sum()
    test_above = (y_pred > thresh).sum()
    print(f"Threshold {thresh}: Validation={val_above}/{len(y_val_prob)} ({val_above/len(y_val_prob)*100:.1f}%), Test={test_above}/{len(y_pred)} ({test_above/len(y_pred)*100:.1f}%)")

# 4. Check confusion matrix with current threshold
from sklearn.metrics import confusion_matrix
print(f"\n4. CONFUSION MATRIX (threshold=0.5):")
print("Validation:")
val_cm = confusion_matrix(y_val_true, y_val_pred)
print(f"TN: {val_cm[0,0]}, FP: {val_cm[0,1]}")
print(f"FN: {val_cm[1,0]}, TP: {val_cm[1,1]}")

print("Test:")
test_cm = confusion_matrix(y_true, y_pred_binary)
print(f"TN: {test_cm[0,0]}, FP: {test_cm[0,1]}")
print(f"FN: {test_cm[1,0]}, TP: {test_cm[1,1]}")

# 5. Find optimal threshold using validation set
from sklearn.metrics import precision_recall_curve
print(f"\n5. OPTIMAL THRESHOLD ANALYSIS:")
precision, recall, thresholds_pr = precision_recall_curve(y_val_true, y_val_prob)
f1_scores = 2 * (precision * recall) / (precision + recall)
f1_scores = f1_scores[~np.isnan(f1_scores)]  # Remove NaN values
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds_pr[optimal_idx] if optimal_idx < len(thresholds_pr) else 0.5

print(f"Optimal threshold for F1: {optimal_threshold:.4f}")
print(f"Max F1 score achievable: {f1_scores[optimal_idx]:.4f}")

# 6. Evaluate with optimal threshold
y_val_pred_optimal = (y_val_prob > optimal_threshold).astype(int)
y_test_pred_optimal = (y_pred > optimal_threshold).astype(int)

from sklearn.metrics import classification_report
print(f"\n6. PERFORMANCE WITH OPTIMAL THRESHOLD ({optimal_threshold:.4f}):")
print("Validation:")
val_f1_optimal = f1_score(y_val_true, y_val_pred_optimal)
val_macro_f1_optimal = f1_score(y_val_true, y_val_pred_optimal, average='macro')
print(f"F1: {val_f1_optimal:.4f}, Macro-F1: {val_macro_f1_optimal:.4f}")

print("Test:")
test_f1_optimal = f1_score(y_true, y_test_pred_optimal)
test_macro_f1_optimal = f1_score(y_true, y_test_pred_optimal, average='macro')
print(f"F1: {test_f1_optimal:.4f}, Macro-F1: {test_macro_f1_optimal:.4f}")

# 7. Show distribution of positive class probabilities
print(f"\n7. POSITIVE CLASS PROBABILITY ANALYSIS:")
pos_indices = y_val_true == 1
neg_indices = y_val_true == 0

if pos_indices.sum() > 0:
    pos_probs = y_val_prob[pos_indices]
    neg_probs = y_val_prob[neg_indices]
    
    print(f"Positive class (binding) predictions:")
    print(f"  Count: {len(pos_probs)}")
    print(f"  Mean probability: {pos_probs.mean():.4f}")
    print(f"  Max probability: {pos_probs.max():.4f}")
    print(f"  % above 0.5: {(pos_probs > 0.5).mean()*100:.1f}%")
    
    print(f"Negative class (no binding) predictions:")
    print(f"  Count: {len(neg_probs)}")
    print(f"  Mean probability: {neg_probs.mean():.4f}")
    print(f"  Max probability: {neg_probs.max():.4f}")
    print(f"  % above 0.5: {(neg_probs > 0.5).mean()*100:.1f}%")

print(f"\n8. RECOMMENDATIONS:")
print(f"- Your model has discriminative power (AUC > 0.6) but conservative threshold")
print(f"- Consider using threshold {optimal_threshold:.3f} instead of 0.5")
print(f"- The high accuracy with F1=0 indicates severe class imbalance")
print(f"- Macro-F1 > 0 shows the model isn't completely broken")
print(f"- Consider techniques for imbalanced datasets (SMOTE, class weights, etc.)")

# 9. Quick retraining suggestion with class weights
print(f"\n9. CLASS WEIGHT SUGGESTION:")
neg_count = (train_df[target_col] == 0).sum()
pos_count = (train_df[target_col] == 1).sum()
class_weight_ratio = neg_count / pos_count
print(f"Negative samples: {neg_count}")
print(f"Positive samples: {pos_count}")
print(f"Imbalance ratio: {class_weight_ratio:.2f}:1")
print(f"Consider adding 'scale_pos_weight': {class_weight_ratio:.2f} to LightGBM params")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Combining TCR and Epitope features only...
Final feature dimensions: 1024 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - No categorical features used

Creating LightGBM datasets...

Training LightGBM model (TCR + Epitope embeddings only)...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbosity': -1, 'seed': 42, 'num_leaves': 31, 'learning_rate': 0.05, 'feature

### v2 with macro-f1 and rounded

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your actual paths
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='zero'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean', 'drop')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
            else:  # 'drop' - will be handled later
                embedding_features.append(np.zeros(embedding_dim))  # placeholder
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        if len(missing_sequences) <= 5:  # Show a few examples
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Encode categorical features that don't have embeddings
print("\nEncoding categorical features...")
categorical_cols = ['TRBV', 'TRBJ', 'MHC']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col + '_encoded'] = le.transform(train_df[col].astype(str))
    valid_df[col + '_encoded'] = le.transform(valid_df[col].astype(str))
    test_df[col + '_encoded'] = le.transform(test_df[col].astype(str))
    encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Combine all features
print("\nCombining features...")
encoded_categorical_cols = [col + '_encoded' for col in categorical_cols]

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb, 
    train_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb, 
    valid_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb, 
    test_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features") 
print(f"  - Categorical features: {len(encoded_categorical_cols)} features")

target_col = 'Binding'

# Check for any NaN values
if train_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in training features")
    train_features = train_features.fillna(0)
if valid_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in validation features")
    valid_features = valid_features.fillna(0)
if test_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in test features")
    test_features = test_features.fillna(0)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
train_data = lgb.Dataset(
    train_features, 
    label=train_df[target_col], 
    categorical_feature=encoded_categorical_cols
)
valid_data = lgb.Dataset(
    valid_features, 
    label=valid_df[target_col], 
    reference=train_data, 
    categorical_feature=encoded_categorical_cols
)

# LightGBM parameters - optimized for high-dimensional embedding features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for stability with embeddings
    'feature_fraction': 0.8,  # Feature subsampling to prevent overfitting
    'bagging_fraction': 0.8,  # Data subsampling
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*50)
print("VALIDATION METRICS")
print("="*50)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'Macro-F1 Score: {f1_score(y_val_true, y_val_pred, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*30)
    print("PER-TASK VALIDATION METRICS")
    print("="*30)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = f1 = macro_f1 = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        avg_f1 = np.mean([r['f1'] for r in valid_tasks])
        avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")
        print(f"  Average F1: {avg_f1:.4f}")
        print(f"  Average Macro-F1: {avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*50)
print("TEST METRICS")
print("="*50)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'Macro-F1 Score: {f1_score(y_true, y_pred_binary, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*30)
    print("PER-TASK TEST METRICS")
    print("="*30)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = f1 = macro_f1 = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        test_avg_f1 = np.mean([r['f1'] for r in valid_test_tasks])
        test_avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")
        print(f"  Average F1: {test_avg_f1:.4f}")
        print(f"  Average Macro-F1: {test_avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
categorical_importance = feature_imp_df[feature_imp_df['feature'].str.endswith('_encoded')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance + categorical_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")
print(f"Categorical features: {categorical_importance:.2f} ({categorical_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# === Model Performance Summary ===
print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"Embedding contribution: {(tcr_emb_importance + epitope_emb_importance)/total_importance*100:.1f}%")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_model.txt'")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Encoding categorical features...
Encoded TRBV: 166 unique values
Encoded TRBJ: 31 unique values
Encoded MHC: 99 unique values

Combining features...
Final feature dimensions: 1027 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - Categorical features: 3 features

Creating LightGBM datasets...

Training LightGBM model...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 

## LightGBM - V1

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.metrics import log_loss, f1_score, precision_score, recall_score

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your 512-dimensional embeddings
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='mean'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        # Show a few examples of missing sequences
        if len(missing_sequences) <= 5:
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
        elif len(missing_sequences) > 5:
            print(f"  First few missing: {[seq for _, seq in missing_sequences[:3]]}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Combine ONLY TCR and Epitope features (no categorical features)
print("\nCombining TCR and Epitope features only...")

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features")
print(f"  - No categorical features used")

target_col = 'Binding'

# Check for any NaN values
for name, features in [("train", train_features), ("valid", valid_features), ("test", test_features)]:
    nan_count = features.isnull().sum().sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} NaN values in {name} features - filling with 0")
        features.fillna(0, inplace=True)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
# Note: No categorical features since we're only using embeddings
train_data = lgb.Dataset(train_features, label=train_df[target_col])
valid_data = lgb.Dataset(valid_features, label=valid_df[target_col], reference=train_data)

# LightGBM parameters - optimized for embedding-only features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,  # Feature subsampling
    'bagging_fraction': 0.8,  # Data subsampling  
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model (TCR + Epitope embeddings only)...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*60)
print("VALIDATION METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'Precision: {precision_score(y_val_true, y_val_pred, zero_division=0):.4f}')
print(f'Recall: {recall_score(y_val_true, y_val_pred, zero_division=0):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'Macro-F1 Score: {f1_score(y_val_true, y_val_pred, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*40)
    print("PER-TASK VALIDATION METRICS")
    print("="*40)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            precision = precision_score(group['true'], group['pred_label'], zero_division=0)
            recall = recall_score(group['true'], group['pred_label'], zero_division=0)
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = precision = recall = f1 = macro_f1 = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        avg_precision = np.mean([r['precision'] for r in valid_tasks])
        avg_recall = np.mean([r['recall'] for r in valid_tasks])
        avg_f1 = np.mean([r['f1'] for r in valid_tasks])
        avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")
        print(f"  Average Precision: {avg_precision:.4f}")
        print(f"  Average Recall: {avg_recall:.4f}")
        print(f"  Average F1: {avg_f1:.4f}")
        print(f"  Average Macro-F1: {avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*60)
print("TEST METRICS (TCR + Epitope Embeddings Only)")
print("="*60)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'Precision: {precision_score(y_true, y_pred_binary, zero_division=0):.4f}')
print(f'Recall: {recall_score(y_true, y_pred_binary, zero_division=0):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'Macro-F1 Score: {f1_score(y_true, y_pred_binary, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*40)
    print("PER-TASK TEST METRICS")
    print("="*40)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            precision = precision_score(group['true'], group['pred_label'], zero_division=0)
            recall = recall_score(group['true'], group['pred_label'], zero_division=0)
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = precision = recall = f1 = macro_f1 = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        test_avg_precision = np.mean([r['precision'] for r in valid_test_tasks])
        test_avg_recall = np.mean([r['recall'] for r in valid_test_tasks])
        test_avg_f1 = np.mean([r['f1'] for r in valid_test_tasks])
        test_avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")
        print(f"  Average Precision: {test_avg_precision:.4f}")
        print(f"  Average Recall: {test_avg_recall:.4f}")
        print(f"  Average F1: {test_avg_f1:.4f}")
        print(f"  Average Macro-F1: {test_avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS (Embeddings Only)")
print("="*60)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# Check if embeddings are well-distributed in importance
tcr_top_5_importance = tcr_features['importance'].sum()
epitope_top_5_importance = epitope_features['importance'].sum()

print(f"\nImportance concentration:")
print(f"Top 5 TCR dims contribute: {tcr_top_5_importance/tcr_emb_importance*100:.1f}% of TCR importance")
print(f"Top 5 Epitope dims contribute: {epitope_top_5_importance/epitope_emb_importance*100:.1f}% of Epitope importance")

# === Model Performance Summary ===
print("\n" + "="*60)
print("PERFORMANCE SUMMARY (Embeddings Only)")
print("="*60)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"TCR embedding contribution: {tcr_emb_importance/total_importance*100:.1f}%")
print(f"Epitope embedding contribution: {epitope_emb_importance/total_importance*100:.1f}%")

# Performance comparison hint
print(f"\nModel uses ONLY sequence embeddings (no MHC, TRBV, TRBJ)")
print(f"This shows the predictive power of TCR-Epitope interaction alone")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_only_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_only_model.txt'")

# Add this diagnostic code after making predictions to understand the F1=0 issue

print("\n" + "="*60)
print("DIAGNOSTIC ANALYSIS FOR F1 = 0")
print("="*60)

# 1. Check class distribution
print("1. CLASS DISTRIBUTION:")
print(f"Training set:")
train_class_dist = train_df[target_col].value_counts(normalize=True)
print(train_class_dist)
print(f"Validation set:")
val_class_dist = valid_df[target_col].value_counts(normalize=True)
print(val_class_dist)
print(f"Test set:")
test_class_dist = test_df[target_col].value_counts(normalize=True)
print(test_class_dist)

# 2. Check prediction probabilities distribution
print(f"\n2. PREDICTION PROBABILITY DISTRIBUTION:")
print(f"Validation predictions - Min: {y_val_prob.min():.4f}, Max: {y_val_prob.max():.4f}, Mean: {y_val_prob.mean():.4f}")
print(f"Test predictions - Min: {y_pred.min():.4f}, Max: {y_pred.max():.4f}, Mean: {y_pred.mean():.4f}")

# 3. Check how many predictions are above different thresholds
print(f"\n3. PREDICTIONS ABOVE DIFFERENT THRESHOLDS:")
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for thresh in thresholds:
    val_above = (y_val_prob > thresh).sum()
    test_above = (y_pred > thresh).sum()
    print(f"Threshold {thresh}: Validation={val_above}/{len(y_val_prob)} ({val_above/len(y_val_prob)*100:.1f}%), Test={test_above}/{len(y_pred)} ({test_above/len(y_pred)*100:.1f}%)")

# 4. Check confusion matrix with current threshold
from sklearn.metrics import confusion_matrix
print(f"\n4. CONFUSION MATRIX (threshold=0.5):")
print("Validation:")
val_cm = confusion_matrix(y_val_true, y_val_pred)
print(f"TN: {val_cm[0,0]}, FP: {val_cm[0,1]}")
print(f"FN: {val_cm[1,0]}, TP: {val_cm[1,1]}")

print("Test:")
test_cm = confusion_matrix(y_true, y_pred_binary)
print(f"TN: {test_cm[0,0]}, FP: {test_cm[0,1]}")
print(f"FN: {test_cm[1,0]}, TP: {test_cm[1,1]}")

# 5. Find optimal threshold using validation set
from sklearn.metrics import precision_recall_curve
print(f"\n5. OPTIMAL THRESHOLD ANALYSIS:")
precision, recall, thresholds_pr = precision_recall_curve(y_val_true, y_val_prob)
f1_scores = 2 * (precision * recall) / (precision + recall)
f1_scores = f1_scores[~np.isnan(f1_scores)]  # Remove NaN values
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds_pr[optimal_idx] if optimal_idx < len(thresholds_pr) else 0.5

print(f"Optimal threshold for F1: {optimal_threshold:.4f}")
print(f"Max F1 score achievable: {f1_scores[optimal_idx]:.4f}")

# 6. Evaluate with optimal threshold
y_val_pred_optimal = (y_val_prob > optimal_threshold).astype(int)
y_test_pred_optimal = (y_pred > optimal_threshold).astype(int)

from sklearn.metrics import classification_report
print(f"\n6. PERFORMANCE WITH OPTIMAL THRESHOLD ({optimal_threshold:.4f}):")
print("Validation:")
val_f1_optimal = f1_score(y_val_true, y_val_pred_optimal)
val_macro_f1_optimal = f1_score(y_val_true, y_val_pred_optimal, average='macro')
val_precision_optimal = precision_score(y_val_true, y_val_pred_optimal, zero_division=0)
val_recall_optimal = recall_score(y_val_true, y_val_pred_optimal, zero_division=0)
print(f"Precision: {val_precision_optimal:.4f}, Recall: {val_recall_optimal:.4f}")
print(f"F1: {val_f1_optimal:.4f}, Macro-F1: {val_macro_f1_optimal:.4f}")

print("Test:")
test_f1_optimal = f1_score(y_true, y_test_pred_optimal)
test_macro_f1_optimal = f1_score(y_true, y_test_pred_optimal, average='macro')
test_precision_optimal = precision_score(y_true, y_test_pred_optimal, zero_division=0)
test_recall_optimal = recall_score(y_true, y_test_pred_optimal, zero_division=0)
print(f"Precision: {test_precision_optimal:.4f}, Recall: {test_recall_optimal:.4f}")
print(f"F1: {test_f1_optimal:.4f}, Macro-F1: {test_macro_f1_optimal:.4f}")

# 7. Show distribution of positive class probabilities
print(f"\n7. POSITIVE CLASS PROBABILITY ANALYSIS:")
pos_indices = y_val_true == 1
neg_indices = y_val_true == 0

if pos_indices.sum() > 0:
    pos_probs = y_val_prob[pos_indices]
    neg_probs = y_val_prob[neg_indices]
    
    print(f"Positive class (binding) predictions:")
    print(f"  Count: {len(pos_probs)}")
    print(f"  Mean probability: {pos_probs.mean():.4f}")
    print(f"  Max probability: {pos_probs.max():.4f}")
    print(f"  % above 0.5: {(pos_probs > 0.5).mean()*100:.1f}%")
    
    print(f"Negative class (no binding) predictions:")
    print(f"  Count: {len(neg_probs)}")
    print(f"  Mean probability: {neg_probs.mean():.4f}")
    print(f"  Max probability: {neg_probs.max():.4f}")
    print(f"  % above 0.5: {(neg_probs > 0.5).mean()*100:.1f}%")

print(f"\n8. RECOMMENDATIONS:")
print(f"- Your model has discriminative power (AUC > 0.6) but conservative threshold")
print(f"- Consider using threshold {optimal_threshold:.3f} instead of 0.5")
print(f"- The high accuracy with F1=0 indicates severe class imbalance")
print(f"- Macro-F1 > 0 shows the model isn't completely broken")
print(f"- Consider techniques for imbalanced datasets (SMOTE, class weights, etc.)")

# 9. Quick retraining suggestion with class weights
print(f"\n9. CLASS WEIGHT SUGGESTION:")
neg_count = (train_df[target_col] == 0).sum()
pos_count = (train_df[target_col] == 1).sum()
class_weight_ratio = neg_count / pos_count
print(f"Negative samples: {neg_count}")
print(f"Positive samples: {pos_count}")
print(f"Imbalance ratio: {class_weight_ratio:.2f}:1")
print(f"Consider adding 'scale_pos_weight': {class_weight_ratio:.2f} to LightGBM params")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Combining TCR and Epitope features only...
Final feature dimensions: 1024 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - No categorical features used

Creating LightGBM datasets...

Training LightGBM model (TCR + Epitope embeddings only)...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 'verbosity': -1, 'seed': 42, 'num_leaves': 31, 'learning_rate': 0.05, 'feature

## LightGBM - V2

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss, precision_score, recall_score

# File paths
test_path = '../../../../../data/splitted_datasets/allele/beta/test.tsv'
train_path = '../../../../../data/splitted_datasets/allele/beta/train.tsv'
valid_path = '../../../../../data/splitted_datasets/allele/beta/validation.tsv'

# Embedding file paths - update these to your actual paths
tcr_embedding_path = '../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl'
epitope_embedding_path = '../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl'

# Load the TSV files
print("Loading datasets...")
train_df = pd.read_csv(train_path, sep='\t')
valid_df = pd.read_csv(valid_path, sep='\t', low_memory=False)
test_df = pd.read_csv(test_path, sep='\t')

print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(valid_df)} samples")
print(f"Test set: {len(test_df)} samples")

def load_reduced_embeddings(embedding_path):
    """Load pre-computed reduced embeddings from pickle file"""
    with open(embedding_path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} embeddings from {embedding_path}")
    return embeddings

def get_embedding_features(df, sequence_col, embeddings_dict, prefix, missing_strategy='zero'):
    """
    Convert sequences to embedding features using pre-computed embeddings
    
    Args:
        df: DataFrame containing sequences
        sequence_col: Column name containing sequences
        embeddings_dict: Dictionary mapping sequences to embeddings
        prefix: Prefix for feature column names
        missing_strategy: How to handle missing sequences ('zero', 'mean', 'drop')
    """
    embedding_features = []
    missing_sequences = []
    
    # Get embedding dimension from first embedding
    embedding_dim = len(next(iter(embeddings_dict.values())))
    print(f"Embedding dimension for {prefix}: {embedding_dim}")
    
    # Compute mean embedding for missing sequences if needed
    if missing_strategy == 'mean':
        all_embeddings = np.array(list(embeddings_dict.values()))
        mean_embedding = np.mean(all_embeddings, axis=0)
    
    for idx, seq in enumerate(df[sequence_col]):
        if seq in embeddings_dict:
            embedding_features.append(embeddings_dict[seq])
        else:
            missing_sequences.append((idx, seq))
            if missing_strategy == 'zero':
                embedding_features.append(np.zeros(embedding_dim))
            elif missing_strategy == 'mean':
                embedding_features.append(mean_embedding)
            else:  # 'drop' - will be handled later
                embedding_features.append(np.zeros(embedding_dim))  # placeholder
    
    if missing_sequences:
        print(f"Warning: {len(missing_sequences)} sequences not found in {prefix} embeddings")
        print(f"Using {missing_strategy} strategy for missing sequences")
        if len(missing_sequences) <= 5:  # Show a few examples
            for idx, seq in missing_sequences[:5]:
                print(f"  Missing: {seq}")
    
    # Convert to DataFrame with proper column names
    embedding_df = pd.DataFrame(
        embedding_features, 
        columns=[f'{prefix}_emb_{i}' for i in range(embedding_dim)],
        index=df.index
    )
    
    return embedding_df, missing_sequences

# Load pre-computed embeddings
print("\nLoading embeddings...")
tcr_embeddings = load_reduced_embeddings(tcr_embedding_path)
epitope_embeddings = load_reduced_embeddings(epitope_embedding_path)

# Convert sequences to embeddings for all datasets
print("\nConverting sequences to embeddings...")

# TCR embeddings
train_tcr_emb, train_tcr_missing = get_embedding_features(
    train_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
valid_tcr_emb, valid_tcr_missing = get_embedding_features(
    valid_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)
test_tcr_emb, test_tcr_missing = get_embedding_features(
    test_df, 'TRB_CDR3', tcr_embeddings, 'tcr', missing_strategy='mean'
)

# Epitope embeddings
train_epitope_emb, train_epitope_missing = get_embedding_features(
    train_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
valid_epitope_emb, valid_epitope_missing = get_embedding_features(
    valid_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)
test_epitope_emb, test_epitope_missing = get_embedding_features(
    test_df, 'Epitope', epitope_embeddings, 'epitope', missing_strategy='mean'
)

# Encode categorical features that don't have embeddings
print("\nEncoding categorical features...")
categorical_cols = ['TRBV', 'TRBJ', 'MHC']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    all_data = pd.concat([train_df[col], valid_df[col], test_df[col]], axis=0)
    le.fit(all_data.astype(str))
    train_df[col + '_encoded'] = le.transform(train_df[col].astype(str))
    valid_df[col + '_encoded'] = le.transform(valid_df[col].astype(str))
    test_df[col + '_encoded'] = le.transform(test_df[col].astype(str))
    encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Combine all features
print("\nCombining features...")
encoded_categorical_cols = [col + '_encoded' for col in categorical_cols]

train_features = pd.concat([
    train_tcr_emb, 
    train_epitope_emb, 
    train_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

valid_features = pd.concat([
    valid_tcr_emb, 
    valid_epitope_emb, 
    valid_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

test_features = pd.concat([
    test_tcr_emb, 
    test_epitope_emb, 
    test_df[encoded_categorical_cols].reset_index(drop=True)
], axis=1)

print(f"Final feature dimensions: {train_features.shape[1]} features")
print(f"  - TCR embeddings: {train_tcr_emb.shape[1]} features")
print(f"  - Epitope embeddings: {train_epitope_emb.shape[1]} features") 
print(f"  - Categorical features: {len(encoded_categorical_cols)} features")

target_col = 'Binding'

# Check for any NaN values
if train_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in training features")
    train_features = train_features.fillna(0)
if valid_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in validation features")
    valid_features = valid_features.fillna(0)
if test_features.isnull().sum().sum() > 0:
    print("Warning: Found NaN values in test features")
    test_features = test_features.fillna(0)

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
train_data = lgb.Dataset(
    train_features, 
    label=train_df[target_col], 
    categorical_feature=encoded_categorical_cols
)
valid_data = lgb.Dataset(
    valid_features, 
    label=valid_df[target_col], 
    reference=train_data, 
    categorical_feature=encoded_categorical_cols
)

# LightGBM parameters - optimized for high-dimensional embedding features
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42,
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for stability with embeddings
    'feature_fraction': 0.8,  # Feature subsampling to prevent overfitting
    'bagging_fraction': 0.8,  # Data subsampling
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
}

print("\nTraining LightGBM model...")
print("Parameters:", params)

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'val'],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print(f"\nTraining completed. Best iteration: {model.best_iteration}")

# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(test_features, num_iteration=model.best_iteration)
y_pred_binary = (y_pred > 0.5).astype(int)
y_true = test_df[target_col]

# === Overall Validation Evaluation ===
print("\n" + "="*60)
print("VALIDATION METRICS (Embeddings + Categorical Features)")
print("="*60)
y_val_prob = model.predict(valid_features, num_iteration=model.best_iteration)
y_val_pred = (y_val_prob > 0.5).astype(int)
y_val_true = valid_df[target_col]

print(f"Log Loss: {log_loss(y_val_true, y_val_prob):.4f}")
print(f'Accuracy: {accuracy_score(y_val_true, y_val_pred):.4f}')
print(f'Precision: {precision_score(y_val_true, y_val_pred, zero_division=0):.4f}')
print(f'Recall: {recall_score(y_val_true, y_val_pred, zero_division=0):.4f}')
print(f'AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}')
print(f'F1 Score: {f1_score(y_val_true, y_val_pred):.4f}')
print(f'Macro-F1 Score: {f1_score(y_val_true, y_val_pred, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_val_true, y_val_prob):.4f}')

# === Per-task Validation Evaluation ===
if 'task' in valid_df.columns:
    print("\n" + "="*40)
    print("PER-TASK VALIDATION METRICS")
    print("="*40)
    valid_df_copy = valid_df.copy()
    valid_df_copy['true'] = y_val_true
    valid_df_copy['pred_prob'] = y_val_prob
    valid_df_copy['pred_label'] = y_val_pred

    task_results = []
    for task_name, group in valid_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            precision = precision_score(group['true'], group['pred_label'], zero_division=0)
            recall = recall_score(group['true'], group['pred_label'], zero_division=0)
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = precision = recall = f1 = macro_f1 = "Undefined (only one class present)"
            task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_tasks = [r for r in task_results if isinstance(r['auc'], float)]
    if valid_tasks:
        avg_auc = np.mean([r['auc'] for r in valid_tasks])
        avg_acc = np.mean([r['accuracy'] for r in valid_tasks])
        avg_precision = np.mean([r['precision'] for r in valid_tasks])
        avg_recall = np.mean([r['recall'] for r in valid_tasks])
        avg_f1 = np.mean([r['f1'] for r in valid_tasks])
        avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_tasks])
        print(f"\nAverage across tasks (excluding undefined):")
        print(f"  Average AUC: {avg_auc:.4f}")
        print(f"  Average Accuracy: {avg_acc:.4f}")
        print(f"  Average Precision: {avg_precision:.4f}")
        print(f"  Average Recall: {avg_recall:.4f}")
        print(f"  Average F1: {avg_f1:.4f}")
        print(f"  Average Macro-F1: {avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in validation set; skipping per-task evaluation.")

# === Overall Test Evaluation ===
print("\n" + "="*60)
print("TEST METRICS (Embeddings + Categorical Features)")
print("="*60)
print(f'Accuracy: {accuracy_score(y_true, y_pred_binary):.4f}')
print(f'Precision: {precision_score(y_true, y_pred_binary, zero_division=0):.4f}')
print(f'Recall: {recall_score(y_true, y_pred_binary, zero_division=0):.4f}')
print(f'AUC: {roc_auc_score(y_true, y_pred):.4f}')
print(f'F1 Score: {f1_score(y_true, y_pred_binary):.4f}')
print(f'Macro-F1 Score: {f1_score(y_true, y_pred_binary, average="macro"):.4f}')
print(f'AP Score: {average_precision_score(y_true, y_pred):.4f}')
print(f"Log Loss: {log_loss(y_true, y_pred):.4f}")

# === Per-task Test Evaluation ===
if 'task' in test_df.columns:
    print("\n" + "="*40)
    print("PER-TASK TEST METRICS")
    print("="*40)
    test_df_copy = test_df.copy()
    test_df_copy['true'] = y_true
    test_df_copy['pred_prob'] = y_pred
    test_df_copy['pred_label'] = y_pred_binary

    test_task_results = []
    for task_name, group in test_df_copy.groupby('task'):
        acc = accuracy_score(group['true'], group['pred_label'])
        try:
            loss = log_loss(group['true'], group['pred_prob'], labels=[0, 1])
            auc = roc_auc_score(group['true'], group['pred_prob'])
            ap = average_precision_score(group['true'], group['pred_prob'])
            precision = precision_score(group['true'], group['pred_label'], zero_division=0)
            recall = recall_score(group['true'], group['pred_label'], zero_division=0)
            f1 = f1_score(group['true'], group['pred_label'])
            macro_f1 = f1_score(group['true'], group['pred_label'], average='macro')
            
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })
            
        except ValueError:
            loss = auc = ap = precision = recall = f1 = macro_f1 = "Undefined (only one class present)"
            test_task_results.append({
                'task': task_name,
                'n_samples': len(group),
                'accuracy': acc,
                'log_loss': loss,
                'auc': auc,
                'ap': ap,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'macro_f1': macro_f1
            })

        print(f"\nTask: {task_name} (n={len(group)})")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  Log Loss: {loss:.4f}")
        print(f"  ROC AUC: {auc:.4f}")
        print(f"  Average Precision: {ap:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Macro-F1 Score: {macro_f1:.4f}")
    
    # Summary of per-task performance
    valid_test_tasks = [r for r in test_task_results if isinstance(r['auc'], float)]
    if valid_test_tasks:
        test_avg_auc = np.mean([r['auc'] for r in valid_test_tasks])
        test_avg_acc = np.mean([r['accuracy'] for r in valid_test_tasks])
        test_avg_precision = np.mean([r['precision'] for r in valid_test_tasks])
        test_avg_recall = np.mean([r['recall'] for r in valid_test_tasks])
        test_avg_f1 = np.mean([r['f1'] for r in valid_test_tasks])
        test_avg_macro_f1 = np.mean([r['macro_f1'] for r in valid_test_tasks])
        print(f"\nAverage across test tasks (excluding undefined):")
        print(f"  Average AUC: {test_avg_auc:.4f}")
        print(f"  Average Accuracy: {test_avg_acc:.4f}")
        print(f"  Average Precision: {test_avg_precision:.4f}")
        print(f"  Average Recall: {test_avg_recall:.4f}")
        print(f"  Average F1: {test_avg_f1:.4f}")
        print(f"  Average Macro-F1: {test_avg_macro_f1:.4f}")

else:
    print("\nNote: 'task' column not found in test set; skipping per-task evaluation.")

# === Feature Importance Analysis ===
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS (Embeddings + Categorical)")
print("="*60)

importance = model.feature_importance(importance_type='gain')
feature_names = train_features.columns
feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_imp_df.head(15).to_string(index=False))

# Analyze feature group importance
tcr_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')]['importance'].sum()
epitope_emb_importance = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')]['importance'].sum()
categorical_importance = feature_imp_df[feature_imp_df['feature'].str.endswith('_encoded')]['importance'].sum()
total_importance = tcr_emb_importance + epitope_emb_importance + categorical_importance

print(f"\nFeature group importance:")
print(f"TCR embeddings: {tcr_emb_importance:.2f} ({tcr_emb_importance/total_importance*100:.1f}%)")
print(f"Epitope embeddings: {epitope_emb_importance:.2f} ({epitope_emb_importance/total_importance*100:.1f}%)")
print(f"Categorical features: {categorical_importance:.2f} ({categorical_importance/total_importance*100:.1f}%)")

# Show individual categorical feature importance
print(f"\nCategorical feature importance breakdown:")
for col in encoded_categorical_cols:
    col_importance = feature_imp_df[feature_imp_df['feature'] == col]['importance'].sum()
    print(f"  {col}: {col_importance:.2f} ({col_importance/total_importance*100:.1f}%)")

# Show most important embedding dimensions
print(f"\nTop 5 most important TCR embedding dimensions:")
tcr_features = feature_imp_df[feature_imp_df['feature'].str.startswith('tcr_emb')].head(5)
for _, row in tcr_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

print(f"\nTop 5 most important Epitope embedding dimensions:")
epitope_features = feature_imp_df[feature_imp_df['feature'].str.startswith('epitope_emb')].head(5)
for _, row in epitope_features.iterrows():
    dim = row['feature'].split('_')[-1]
    print(f"  Dimension {dim}: {row['importance']:.2f}")

# === Model Performance Summary ===
print("\n" + "="*60)
print("PERFORMANCE SUMMARY (Embeddings + Categorical)")
print("="*60)
print(f"Best validation AUC: {roc_auc_score(y_val_true, y_val_prob):.4f}")
print(f"Test AUC: {roc_auc_score(y_true, y_pred):.4f}")
print(f"Training iterations: {model.best_iteration}")
print(f"Total features used: {train_features.shape[1]}")
print(f"Embedding contribution: {(tcr_emb_importance + epitope_emb_importance)/total_importance*100:.1f}%")
print(f"Categorical contribution: {categorical_importance/total_importance*100:.1f}%")

# Performance comparison hint
print(f"\nModel uses sequence embeddings + categorical features (MHC, TRBV, TRBJ)")
print(f"This shows the combined predictive power of sequence and context information")

# Optional: Save the trained model
# model.save_model('lightgbm_tcr_epitope_model.txt')
# print("\nModel saved to 'lightgbm_tcr_epitope_model.txt'")

# === Additional Diagnostic Analysis ===
print("\n" + "="*60)
print("DIAGNOSTIC ANALYSIS")
print("="*60)

# 1. Check class distribution
print("1. CLASS DISTRIBUTION:")
print(f"Training set:")
train_class_dist = train_df[target_col].value_counts(normalize=True)
print(train_class_dist)
print(f"Validation set:")
val_class_dist = valid_df[target_col].value_counts(normalize=True)
print(val_class_dist)
print(f"Test set:")
test_class_dist = test_df[target_col].value_counts(normalize=True)
print(test_class_dist)

# 2. Check prediction probabilities distribution
print(f"\n2. PREDICTION PROBABILITY DISTRIBUTION:")
print(f"Validation predictions - Min: {y_val_prob.min():.4f}, Max: {y_val_prob.max():.4f}, Mean: {y_val_prob.mean():.4f}")
print(f"Test predictions - Min: {y_pred.min():.4f}, Max: {y_pred.max():.4f}, Mean: {y_pred.mean():.4f}")

# 3. Check how many predictions are above different thresholds
print(f"\n3. PREDICTIONS ABOVE DIFFERENT THRESHOLDS:")
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for thresh in thresholds:
    val_above = (y_val_prob > thresh).sum()
    test_above = (y_pred > thresh).sum()
    print(f"Threshold {thresh}: Validation={val_above}/{len(y_val_prob)} ({val_above/len(y_val_prob)*100:.1f}%), Test={test_above}/{len(y_pred)} ({test_above/len(y_pred)*100:.1f}%)")

# 4. Find optimal threshold using validation set
from sklearn.metrics import precision_recall_curve
print(f"\n4. OPTIMAL THRESHOLD ANALYSIS:")
precision, recall, thresholds_pr = precision_recall_curve(y_val_true, y_val_prob)
f1_scores = 2 * (precision * recall) / (precision + recall)
f1_scores = f1_scores[~np.isnan(f1_scores)]  # Remove NaN values
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds_pr[optimal_idx] if optimal_idx < len(thresholds_pr) else 0.5

print(f"Optimal threshold for F1: {optimal_threshold:.4f}")
print(f"Max F1 score achievable: {f1_scores[optimal_idx]:.4f}")

# 5. Evaluate with optimal threshold
y_val_pred_optimal = (y_val_prob > optimal_threshold).astype(int)
y_test_pred_optimal = (y_pred > optimal_threshold).astype(int)

print(f"\n5. PERFORMANCE WITH OPTIMAL THRESHOLD ({optimal_threshold:.4f}):")
print("Validation:")
val_f1_optimal = f1_score(y_val_true, y_val_pred_optimal)
val_macro_f1_optimal = f1_score(y_val_true, y_val_pred_optimal, average='macro')
val_precision_optimal = precision_score(y_val_true, y_val_pred_optimal, zero_division=0)
val_recall_optimal = recall_score(y_val_true, y_val_pred_optimal, zero_division=0)
print(f"Precision: {val_precision_optimal:.4f}, Recall: {val_recall_optimal:.4f}")
print(f"F1: {val_f1_optimal:.4f}, Macro-F1: {val_macro_f1_optimal:.4f}")

print("Test:")
test_f1_optimal = f1_score(y_true, y_test_pred_optimal)
test_macro_f1_optimal = f1_score(y_true, y_test_pred_optimal, average='macro')
test_precision_optimal = precision_score(y_true, y_test_pred_optimal, zero_division=0)
test_recall_optimal = recall_score(y_true, y_test_pred_optimal, zero_division=0)
print(f"Precision: {test_precision_optimal:.4f}, Recall: {test_recall_optimal:.4f}")
print(f"F1: {test_f1_optimal:.4f}, Macro-F1: {test_macro_f1_optimal:.4f}")

# 6. Class weight suggestion
print(f"\n6. CLASS WEIGHT SUGGESTION:")
neg_count = (train_df[target_col] == 0).sum()
pos_count = (train_df[target_col] == 1).sum()
class_weight_ratio = neg_count / pos_count
print(f"Negative samples: {neg_count}")
print(f"Positive samples: {pos_count}")
print(f"Imbalance ratio: {class_weight_ratio:.2f}:1")
print(f"Consider adding 'scale_pos_weight': {class_weight_ratio:.2f} to LightGBM params")

print(f"\n7. RECOMMENDATIONS:")
print(f"- Compare this model with embeddings-only version to see categorical feature impact")
print(f"- Consider using threshold {optimal_threshold:.3f} instead of 0.5")
print(f"- Categorical features contribute {categorical_importance/total_importance*100:.1f}% of importance")
print(f"- If precision/recall are still low, consider class balancing techniques")
print(f"- The combination of embeddings + categorical may improve generalization")

# 8. Compare with embeddings-only baseline
print(f"\n8. MODEL COMPARISON INSIGHTS:")
print(f"This model includes both sequence embeddings AND categorical features:")
print(f"  - TCR + Epitope embeddings: {(tcr_emb_importance + epitope_emb_importance)/total_importance*100:.1f}%")
print(f"  - MHC + TRBV + TRBJ features: {categorical_importance/total_importance*100:.1f}%")
print(f"Compare AUC with embeddings-only model to quantify categorical feature value")
print(f"If categorical features show high importance, they're capturing crucial biological context")

Loading datasets...


  train_df = pd.read_csv(train_path, sep='\t')


Train set: 755758 samples
Validation set: 169029 samples
Test set: 54126 samples

Loading embeddings...
Loaded 211294 embeddings from ../../../../../data/embeddings/beta/allele/TRB_reduced_512_select.pkl
Loaded 1896 embeddings from ../../../../../data/embeddings/beta/allele/Epitope_reduced_512_select.pkl

Converting sequences to embeddings...
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for tcr: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512
Embedding dimension for epitope: 512

Encoding categorical features...
Encoded TRBV: 166 unique values
Encoded TRBJ: 31 unique values
Encoded MHC: 99 unique values

Combining features...
Final feature dimensions: 1027 features
  - TCR embeddings: 512 features
  - Epitope embeddings: 512 features
  - Categorical features: 3 features

Creating LightGBM datasets...

Training LightGBM model...
Parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'boosting_type': 'gbdt', 