# Tox21 XGBoost

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [2]:
# Read the Tox21 table (assay labels, molecule ids, SMILES and ECFP4 bitstrings)
# from the interim data directory, relative to the notebook's working directory.
tox21 = pd.read_csv(filepath_or_buffer="../data/interim/tox21_ecfp4.csv")

In [4]:
# Preview the first five rows to sanity-check the loaded columns
tox21.head(n=5)

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,ECFP4
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0000000000000000000000000000000001000000000000...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,0000010000000000000000000101000001100000000000...
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,0000100000010001000000000000000001001000000000...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0100000000000001000000000000000001000000000000...
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,0000000000000000000000000000000001000000000000...


In [None]:
# Train and evaluate one XGBoost classifier per Tox21 assay.
#
# For every assay: drop compounds without a label, use one integer feature per
# ECFP4 bit, tune max_depth / gamma / eta with a randomized search on an 80%
# training split, score the tuned model on the held-out 20% (AUROC), and
# finally write one summary row per assay to CSV.

# Get list of assays
# NOTE(review): assumes the first 12 columns are the assay labels - confirm the
# CSV has no leading index column (e.g. "Unnamed: 0") ahead of them.
assays = tox21.columns[:12].tolist()

# Hyperparameter search space; identical for every assay, so defined once
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'gamma': [0.01, 0.1, 0.5, 1, 2, 5],
    'eta': [0.1, 0.2, 0.4, 0.5]
}

# Convert each bit of the bitstring to an integer feature. The fingerprints do
# not depend on the assay, so the expansion is done once for the whole table
# (a list-of-lists constructor is far cheaper than building a pd.Series per
# row) and the relevant rows are selected by index inside the loop.
fingerprints = tox21['ECFP4'].dropna()
features = pd.DataFrame(
    [list(bits) for bits in fingerprints],
    index=fingerprints.index,
).astype(int)

# One dict per assay; turned into a DataFrame once after the loop instead of
# concatenating row by row.
records = []

# Run model on each assay
for assay in assays:

    # Filter rows with NAs (missing label or missing fingerprint)
    labelled = tox21[[assay, 'ECFP4']].dropna()

    # Features and outcome for the compounds that survived the filter
    X = features.loc[labelled.index]
    y = labelled[assay]

    # Create 80-20 train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Ratio of negative (label 0) to positive (label 1) training examples,
    # rounded down to a multiple of 5 as before. When the ratio is below 5 the
    # rounding would give 0, i.e. zero weight on every positive example, so in
    # that case the unrounded ratio is used instead.
    class_counts = y_train.value_counts()
    ratio = class_counts[0] / class_counts[1]
    floored_ratio = ratio - ratio % 5
    if floored_ratio > 0:
        ratio = floored_ratio

    # Create a XGBoost classifier with appropriate weighting to positive cases
    xgb_model = xgb.XGBClassifier(eval_metric='logloss',
                                  scale_pos_weight=ratio
                                 )

    # Setup the random search with 4-fold cross validation
    # NOTE(review): no `scoring` is given, so candidates are ranked by the
    # estimator's default score (accuracy) while this notebook reports AUROC;
    # consider scoring='roc_auc'.
    random_search = RandomizedSearchCV(xgb_model, param_grid, cv=4, n_iter=20, random_state=42)

    # Carry out the random search
    random_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = random_search.best_params_
    print(f"Best parameters for {assay}: {best_params}")

    # Use the estimator the search already refit on the full training split
    # with the best parameters (refit=True is the default). Unlike rebuilding a
    # classifier from best_params alone, this keeps the scale_pos_weight that
    # the parameters were tuned with, and avoids a redundant extra fit.
    model = random_search.best_estimator_

    # Make predictions on the test set
    preds_proba = model.predict_proba(X_test)[:, 1]
    # Hard class predictions are not used in this cell; kept so they remain
    # defined after the loop as before.
    preds = model.predict(X_test)

    # Calculate the AUROC score
    auroc = roc_auc_score(y_test, preds_proba)

    # Record AUROC and params for this assay
    records.append({
        'assay': assay,
        'auroc': auroc,
        'max_depth': best_params['max_depth'],
        'gamma': best_params['gamma'],
        'eta': best_params['eta'],
    })

# Build the summary table in one go (explicit columns keep the header stable
# even if no assay was processed)
outcomes = pd.DataFrame(records, columns=['assay', 'auroc', 'max_depth', 'gamma', 'eta'])

# Write csv to data/interim (same directory the input CSV was read from)
outcomes.to_csv('../data/interim/tox21-xgboost-outcomes.csv', index=False)

Best parameters for NR-AR: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
Best parameters for NR-AR-LBD: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
Best parameters for NR-AhR: {'max_depth': 8, 'gamma': 0.5, 'eta': 0.5}
Best parameters for NR-Aromatase: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
Best parameters for NR-ER: {'max_depth': 6, 'gamma': 2, 'eta': 0.1}
Best parameters for NR-ER-LBD: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
Best parameters for NR-PPAR-gamma: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
Best parameters for SR-ARE: {'max_depth': 8, 'gamma': 0.5, 'eta': 0.5}
Best parameters for SR-ATAD5: {'max_depth': 7, 'gamma': 0.1, 'eta': 0.4}
