# Tox21 XGBoost

In [46]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [47]:
# Load dataset
# The ECFP4 column holds fingerprints as strings of '0'/'1' characters that are
# later split character by character, so force it to be read as text rather
# than letting read_csv infer a numeric type (which would drop leading zeros).
tox21 = pd.read_csv("../data/interim/tox21_ecfp4.csv", dtype={"ECFP4": str})

In [49]:
# Fit and evaluate one class-weighted XGBoost classifier per Tox21 assay.
#
# For every assay column this cell:
#   1. keeps the molecules that have both a label and an ECFP4 fingerprint,
#   2. makes an 80-20 train/test split,
#   3. tunes max_depth / gamma / eta with a randomized search (4-fold CV),
#   4. scores the refit best model on the held-out test split,
#   5. records AUROC, precision, recall, accuracy and the chosen parameters.
# The per-assay results are collected in `outcomes` and written to CSV.

# Number of leading columns of `tox21` that are treated as assay labels
N_ASSAYS = 12
# Seed shared by the train/test split and the randomized search
SEED = 42
# Column order of the results table
OUTCOME_COLUMNS = ['assay', 'auroc', 'precision', 'recall', 'accuracy', 'max_depth', 'gamma', 'eta']

# Define the parameters for the XGBoost model (same for every assay, so built once)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'gamma': [0.01, 0.1, 0.5, 1, 2, 5],
    'eta': [0.1, 0.2, 0.4, 0.5]
}

# Get list of assays
assays = tox21.columns[:N_ASSAYS].tolist()

# Convert each bit of bitstring to feature. This does not depend on the assay,
# so it is done once for all molecules instead of once per loop iteration.
# NOTE(review): the .loc lookups below assume tox21 has a unique index (true for
# the default RangeIndex produced by read_csv) - confirm if the index changes.
fingerprints = tox21['ECFP4'].dropna()
features = pd.DataFrame(
    [list(bits) for bits in fingerprints],
    index=fingerprints.index,
).astype(int)

# One dict per assay; turned into a DataFrame once after the loop
records = []

print("Started model fitting...")

# Run model on each assay
for i, assay in enumerate(assays):

    # Filter rows with NAs (molecules lacking a label or a fingerprint)
    y = tox21.loc[features.index, assay].dropna()
    X = features.loc[y.index]

    # Create 80-20 test-train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

    # Get ratio of controls (label 0) to cases (label 1) in the training split
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_negative == 0 or n_positive == 0:
        raise ValueError(
            f"Training split for {assay} contains a single class "
            f"({n_negative} negatives, {n_positive} positives); cannot fit a classifier"
        )
    ratio = n_negative / n_positive

    # Round the ratio down to a multiple of 5, but never down to zero:
    # scale_pos_weight=0 would give every positive example zero weight.
    rounded_ratio = ratio - ratio % 5
    if rounded_ratio > 0:
        ratio = rounded_ratio

    # Create a XGBoost classifier with appropriate weighting to positive cases
    xgb_model = xgb.XGBClassifier(eval_metric='logloss',
                                  scale_pos_weight=ratio
                                 )

    # Setup the random search with 4-fold cross validation
    random_search = RandomizedSearchCV(xgb_model, param_grid, cv=4, n_iter=20, random_state=SEED)

    # Carry out the random search
    random_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = random_search.best_params_
    print(f"Best parameters for {assay}: {best_params}")

    # The search refits the best configuration on the full training split
    # (refit=True is the default). Reusing that estimator keeps the same
    # scale_pos_weight that was used during tuning and avoids a redundant fit.
    model = random_search.best_estimator_

    # Make predictions on the test set
    preds_proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)

    # Add metrics and params for this assay to the results
    records.append({
        'assay': assay,
        'auroc': roc_auc_score(y_test, preds_proba),
        'precision': precision_score(y_test, preds),
        'recall': recall_score(y_test, preds),
        'accuracy': accuracy_score(y_test, preds),
        'max_depth': best_params['max_depth'],
        'gamma': best_params['gamma'],
        'eta': best_params['eta'],
    })

    # Output progress
    print(f"{i+1} model(s) fit")

# Build the results table in one go (avoids repeated concat inside the loop)
outcomes = pd.DataFrame(records, columns=OUTCOME_COLUMNS)

# Write csv to data/interim
outcomes.to_csv('../data/interim/tox21-xgboost-outcomes.csv', index=False)

Started model fitting...
Best parameters for NR-AR: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
1 model(s) fit
Best parameters for NR-AR-LBD: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
2 model(s) fit
Best parameters for NR-AhR: {'max_depth': 8, 'gamma': 0.5, 'eta': 0.5}
3 model(s) fit
Best parameters for NR-Aromatase: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
4 model(s) fit
Best parameters for NR-ER: {'max_depth': 6, 'gamma': 2, 'eta': 0.1}
5 model(s) fit
Best parameters for NR-ER-LBD: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
6 model(s) fit
Best parameters for NR-PPAR-gamma: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
7 model(s) fit
Best parameters for SR-ARE: {'max_depth': 8, 'gamma': 0.5, 'eta': 0.5}
8 model(s) fit
Best parameters for SR-ATAD5: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
9 model(s) fit
Best parameters for SR-HSE: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
10 model(s) fit
Best parameters for SR-MMP: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
11 model(s) fit
Best param