# Clintox XGBoost

In [60]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [45]:
# Load dataset
clintox = pd.read_csv("../data/interim/clintox_ecfp4.csv")

In [65]:
# Check outcome imbalance - toxic(1) vs non-toxic(0)
clintox["CT_TOX"].value_counts()

0    1366
1     112
Name: CT_TOX, dtype: int64

In [74]:
# Convert each bit of bitstring to feature
X = clintox["ECFP4"].apply(lambda x: pd.Series(list(x))).astype(int)

# Get outcome
y = clintox['CT_TOX']

# Create 80-20 test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameters for the XGBoost model
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'gamma': [0.01, 0.1, 0.2, 0.5, 1, 2, 5],
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5]
}

# Create a XGBoost classifier with 10x weighting to positive cases
xgb_model = xgb.XGBClassifier(eval_metric='logloss', scale_pos_weight=10)

# Setup the random search with 4-fold cross validation
random_search = RandomizedSearchCV(xgb_model, param_grid, cv=4, n_iter=50, random_state=42)

random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters = {best_params}")

# Train the XGBoost model with the best parameters
num_round = 20
model = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

Best parameters = {'max_depth': 9, 'gamma': 0.01, 'eta': 0.4}




In [75]:
# Make predictions on the test set
preds_proba = model.predict_proba(X_test)[:, 1]
preds = model.predict(X_test)

# Calculate the AUROC score
auroc = roc_auc_score(y_test, preds_proba)
print(f"AUROC = {auroc}")

# Calculate Precision
precision = precision_score(y_test, preds)
print(f"Precision = {precision}")

# Calculate Recall
recall = recall_score(y_test, preds)
print(f"Recall = {recall}")

# Calculate Accuracy
accuracy = accuracy_score(y_test, preds)
print(f"Accuracy = {accuracy}")

AUROC = 0.8527113970588235
Precision = 0.7333333333333333
Recall = 0.4583333333333333
Accuracy = 0.9425675675675675
