# 2.3-xgboost-toxcast.ipynb

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [2]:
# Load dataset
toxcast = pd.read_csv("../data/interim/toxcast_ecfp4.csv")

In [3]:
toxcast.head()

Unnamed: 0,smiles,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellLoss_24h_dn,APR_HepG2_CellLoss_72h_dn,APR_HepG2_MicrotubuleCSK_24h_dn,APR_HepG2_MicrotubuleCSK_24h_up,...,Tanguay_ZF_120hpf_PE_up,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up,ECFP4
0,[O-][N+](=O)C1=CC=C(Cl)C=C1,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0000000000000000000000000000000000000000000000...
1,C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C,,,,,,,,,,...,,,,,,,,,,0000000000000000000000000000000001000000000000...
2,CN1CCN(CC1)C(=O)C1CCCCC1,,,,,,,,,,...,,,,,,,,,,0010100000000010000000000000000001000000000000...
3,NC1=CC=C(C=C1)[N+]([O-])=O,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0000000000000000000000000000000000000000000000...
4,OC1=CC=C(C=C1)[N+]([O-])=O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0000000000000000000000000000000000000000000000...


In [None]:
# Create list of columns to exclude
exclude_columns = ['smiles', 'ECFP4']

# Get list of assays without excluded_columns
assays = [col for col in toxcast.columns.tolist() if col not in exclude_columns]

# Create empty dataframe
outcomes = pd.DataFrame(columns=['assay', 'auroc', 'max_depth', 'gamma', 'eta'])

# Run model on each assay
for i, assay in enumerate(assays):

    # Filter rows with NAs
    tmp = toxcast[[assay, 'ECFP4']]
    tmp = tmp.dropna()
    
    # Convert each bit of bitstring to feature
    X = tmp["ECFP4"].apply(lambda x: pd.Series(list(x))).astype(int)
    
    # Get outcome
    y = tmp[assay]
    
    # Create 80-20 test-train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the parameters for the XGBoost model
    param_grid = {
        'max_depth': [3, 4, 5, 6, 7, 8],
        'gamma': [0.01, 0.1, 0.5, 1, 2, 5],
        'eta': [0.1, 0.2, 0.4, 0.5]
    }
    
    # Get ratio of cases to controls
    ratio = y_train.value_counts()[0] / y_train.value_counts()[1]
    remainder = ratio % 5
    ratio -= remainder

    # Create a XGBoost classifier with appropriate weighting to positive cases
    xgb_model = xgb.XGBClassifier(eval_metric='logloss',
                                  scale_pos_weight=ratio
                                 )

    # Setup the random search with 4-fold cross validation
    random_search = RandomizedSearchCV(xgb_model, param_grid, cv=4, n_iter=20, random_state=42)

    # Carry out the random search
    random_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = random_search.best_params_
    print(f"Best parameters for {assay}: {best_params}")
    
    # Train the XGBoost model with the best parameters
    num_round = 20
    model = xgb.XGBClassifier(**best_params, eval_metric='logloss')
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    preds_proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)

    # Calculate the AUROC score
    auroc = roc_auc_score(y_test, preds_proba)
    
    # Add AUROC and params to dataframe
    new_row = {'assay': assay, 'auroc': auroc, 'max_depth': best_params['max_depth'], 'gamma': best_params['gamma'], 'eta': best_params['eta']}
    new_row = pd.DataFrame([new_row])  # Convert the dictionary to a single-row DataFrame
    outcomes = pd.concat([outcomes, new_row], ignore_index=True)
    
    # Print progress
    print(f"{i+1} model(s) fit")
    
    # Finish after 10 models fit
    if i==10:
        break
    
# Write csv to data/interim
outcomes.to_csv('data/interim/toxcast-xgboost-outcomes.csv', index=False)

Best parameters for ACEA_T47D_80hr_Negative: {'max_depth': 6, 'gamma': 0.1, 'eta': 0.5}
0 model fit
Best parameters for ACEA_T47D_80hr_Positive: {'max_depth': 6, 'gamma': 0.1, 'eta': 0.5}
1 model fit
Best parameters for APR_HepG2_CellCycleArrest_24h_dn: {'max_depth': 6, 'gamma': 0.1, 'eta': 0.5}
2 model fit


In [None]:
# Do stability analysis to check impact of changing seeds
best_params = {'max_depth': 6, 'gamma': 0.1, 'eta': 0.5}

for i in range(5):
    
    # Train the XGBoost model with the best parameters
    num_round = 20
    model = xgb.XGBClassifier(**best_params, eval_metric='logloss')
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    preds_proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)

    # Calculate the AUROC score
    auroc = roc_auc_score(y_test, preds_proba)
    
    # Add AUROC and params to dataframe
    new_row = {'assay': assay, 'auroc': auroc, 'max_depth': best_params['max_depth'], 'gamma': best_params['gamma'], 'eta': best_params['eta']}
    new_row = pd.DataFrame([new_row])  # Convert the dictionary to a single-row df
    outcomes = pd.concat([outcomes, new_row], ignore_index=True)