# XG Boost

In [9]:
import numpy as np 
import pandas as pd 
import xgboost as xgb 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
    precision_recall_curve,
    auc
)

In [2]:
import sys
!{sys.executable} -m pip install --upgrade pip setuptools wheel
!{sys.executable} -m pip install scikit-learn
import sklearn
print(sklearn.__version__)

1.7.2


In [3]:
df = pd.read_csv('Data/f1_agg_mean.csv')
df.shape

(121838, 14)

In [4]:
# define features
feature_cols = [c for c in df.columns if c not in ['gene_id', 'transcript_id', 'transcript_position', 'label', 'sequence']]

# split into train and test by gene_id
gene_ids = df['gene_id'].unique()
train_genes, test_genes = train_test_split(gene_ids, test_size=0.2, random_state=42)

train_df = df[df['gene_id'].isin(train_genes)]
test_df = df[df['gene_id'].isin(test_genes)]

X_train, y_train = train_df[feature_cols], train_df['label']
X_test, y_test = test_df[feature_cols], test_df['label']


In [5]:
print("Train genes:", train_df['gene_id'].nunique())
print("Test genes:", test_df['gene_id'].nunique())

Train genes: 3081
Test genes: 771


In [11]:
# train XGBoost model
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
model.fit(X_train, y_train)

# make predictions
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(recall_vals, precision_vals)

# ===== 7️⃣ Output results =====
print("=== Evaluation Metrics ===")
print("Confusion Matrix:\n", cm)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

=== Evaluation Metrics ===
Confusion Matrix:
 [[23862   133]
 [  752   252]]
Precision: 0.6545
Recall: 0.2510
F1 Score: 0.3629
Accuracy: 0.9646
ROC AUC: 0.8769
PR AUC: 0.4052


### Scaling using class weighting by XGBoost

In [13]:
num_pos = sum(y_train == 1)
num_neg = sum(y_train == 0)

scale_pos_weight = num_neg / num_pos

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
model.fit(X_train, y_train)

# make predictions
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(recall_vals, precision_vals)

# ===== 7️⃣ Output results =====
print("=== Evaluation Metrics ===")
print("Confusion Matrix:\n", cm)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

=== Evaluation Metrics ===
Confusion Matrix:
 [[20540  3455]
 [  278   726]]
Precision: 0.1736
Recall: 0.7231
F1 Score: 0.2800
Accuracy: 0.8507
ROC AUC: 0.8725
PR AUC: 0.4025


In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'scale_pos_weight': [scale_pos_weight]
}

grid = GridSearchCV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    ),
    param_grid=param_grid,
    scoring='roc_auc',  # or 'average_precision' for PR AUC
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300, 'scale_pos_weight': 20.6593603220756, 'subsample': 0.8}
