# **Team Detox Capstone Project**
## -- Model Grid Searches and Preliminary Evaluations
## *Clean Notebook* (Amy) 

## Install DeepChem (and rdkit)

In [None]:
!pip install deepchem==2.7.2.dev20230209144634

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem==2.7.2.dev20230209144634
  Downloading deepchem-2.7.2.dev20230209144634-py3-none-any.whl (709 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.7/709.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit
  Downloading rdkit-2022.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9
  Downloading scipy-1.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, rdkit, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.10.1
    Uninstalling scipy-1.10.1:
      Successfully uninstalled s

In [None]:
!pip install tensorflow 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries

In [None]:
import pandas as pd
import numpy as np
import deepchem as dc



In [None]:
#print timestamp
from datetime import datetime
import pytz
date_now = datetime.now().date()
time_SF = datetime.now(pytz.timezone('America/Vancouver'))

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

Timestamp: 2023-04-13 22:37:15


#Train 7 Deep Learning models from DeepChem library
Main reference: https://deepchem.readthedocs.io/en/latest/api_reference/models.html

## Model #1: Multitask Classifier with Circular Fingerprints (ECFPs)

### Load, featurize and transform data: break SMILES to ECFP features

In [None]:
#Load Tox21 dataset, note this Deepchem function includes featurizer and transformer 
#This took 14s
tox21_tasks1, tox21_datasets1, transformers1 = dc.molnet.load_tox21(featurizer=dc.feat.CircularFingerprint(size=1024, radius=4))

#train test validation split
train_dataset1, valid_dataset1, test_dataset1 = tox21_datasets1
train_dataset1
#Looks like we got 6264 compounds in the training set with 1024 fingerprints/features and 12 assays/tasks



<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Grid Search for Model #1: Multitask classifer via ECFP
#### Use train and test datasets to evaluate preliminary scores

### Define function for grid search
Reference: https://deepchem.readthedocs.io/en/latest/api_reference/hyper.html

In [None]:
#Took 50s 
def model_builder(**model_params):
  dropout = model_params['dropout']
  layer_sizes = model_params['layer_sizes']
  learning_rate = model_params['learning_rate']
  model = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=layer_sizes, learning_rate=learning_rate, dropouts=dropout, random_state=2) 
  return model

params = {
    'dropout':[0,0.2,0.3, 0.5],
    'layer_sizes':[[500],[1000],[1000, 1000]],
    'learning_rate':[0.0005, 0.001, 0.005]
    }
optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_mcfp, best_hyperparams_mcfp, all_results_mcfp = optimizer.hyperparam_search(params, train_dataset1, test_dataset1, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

print(best_hyperparams_mcfp)#this jumps between {'dropout': 0.5, 'layer_sizes': [500], 'learning_rate': 0.001} vs. the one shown below
all_results_mcfp

Timestamp: 2023-03-23 15:49:57
{'dropout': 0.5, 'layer_sizes': [1000, 1000], 'learning_rate': 0.0005}


{'_dropout_0_layer_sizes[500]_learning_rate_0.000500': 0.6693249557043642,
 '_dropout_0_layer_sizes[500]_learning_rate_0.001000': 0.6604961596234661,
 '_dropout_0_layer_sizes[500]_learning_rate_0.005000': 0.6614832798835203,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.000500': 0.6698910174053948,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.001000': 0.6583008045053889,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.005000': 0.6606812157049596,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.000500': 0.6683050360548455,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.001000': 0.6649326418496367,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.005000': 0.6596857650135964,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.000500': 0.6801909933320922,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.001000': 0.6747460241664737,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.005000': 0.6623401768529996,
 '_dropout_0.200000_layer_sizes[1000]_learning_rate_0.0

### Best MultitaskClassifier + ECFP model: save model on G:drive

In [None]:
#Took 10s
import timeit
start_time = timeit.default_timer()

model_mccp = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000,1000], learning_rate = 0.0005, dropouts=0.5, random_state=2, model_dir='./someDirectory/someFolder') 
model_mccp.fit(train_dataset1, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this cell at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

metrics = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification"),dc.metrics.Metric(dc.metrics.balanced_accuracy_score, np.mean, mode="classification")]

mccp_train_scores = model_mccp.evaluate(train_dataset1, metrics, transformers1)
mccp_test_scores = model_mccp.evaluate(test_dataset1, metrics, transformers1)
mccp_valid_scores = model_mccp.evaluate(valid_dataset1, metrics, transformers1)

print("Multitask Classifier + Circular Fingerprints: ")
print('Train scores: ', mccp_train_scores)
print('Test scores: ', mccp_test_scores)
print('Validation scores: ', mccp_valid_scores)

Fit time for this cell at 50 epochs:  9.887231710000378
Timestamp: 2023-03-23 15:50:35
Multitask Classifier + Circular Fingerprints: 
Train scores:  {'mean-roc_auc_score': 0.9881476687901034, 'mean-balanced_accuracy_score': 0.9598760263126862}
Test scores:  {'mean-roc_auc_score': 0.6700880389242442, 'mean-balanced_accuracy_score': 0.6063133430911144}
Validation scores:  {'mean-roc_auc_score': 0.6823168793080989, 'mean-balanced_accuracy_score': 0.6030033464587995}


### Reload saved model trained on 03/23

In [None]:
mccp_reload = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000, 1000], learning_rate = 0.0005, dropouts=0.5, random_state=2, model_dir='./someDirectory/someFolder') 
mccp_reload.restore()

#Scores are identical after running multiple times with the same reloaded model
mccp_train_scores = mccp_reload.evaluate(train_dataset1, metrics, transformers1)
mccp_test_scores = mccp_reload.evaluate(test_dataset1, metrics, transformers1)
print('Reloaded Train scores: ', mccp_train_scores)
print('Reloaded Test scores: ', mccp_test_scores)

Reloaded Train scores:  {'mean-roc_auc_score': 0.9881476687901034, 'mean-balanced_accuracy_score': 0.9598760263126862}
Reloaded Test scores:  {'mean-roc_auc_score': 0.6700880389242442, 'mean-balanced_accuracy_score': 0.6063133430911144}


## Model #2: Multitask Classifier using RDKitDescriptors  

### Load, featurize and transform data: Break SMILES to RDKit descriptors

In [None]:
#RDKitDescriptors computing WITHOUT normalization took 1m ; WITH normalization takes 4x as long 
tox21_tasks2, tox21_datasets2, transformers2 = dc.molnet.load_tox21(featurizer=dc.feat.RDKitDescriptors(is_normalized=False))

train_dataset2, valid_dataset2, test_dataset2 = tox21_datasets2
train_dataset2



<DiskDataset X.shape: (6264, 208), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Fix NaNs and Infinities in the featurized data

In [None]:
#Fix NaNs and Infinities - these should've been fixed by the featurizer but seems to have problems via DeepChem 
#We will define a function to fix them manually

def fill_infs_nans (split_dataset):
  df = pd.DataFrame(split_dataset)
  df = df.replace([np.inf, -np.inf], np.nan)
  df = df.fillna(0)
  return df
print(np.isnan(train_dataset2.X).any())
print(~np.isfinite(train_dataset2.X).any())

#Need to fix NaN in trai, test and validation sets
train_dataset2_f = dc.data.DiskDataset.from_numpy(fill_infs_nans(train_dataset2.X),train_dataset2.y, train_dataset2.w, tasks=tox21_tasks2)
test_dataset2_f = dc.data.DiskDataset.from_numpy(fill_infs_nans(test_dataset2.X),test_dataset2.y, test_dataset2.w, tasks=tox21_tasks2)
valid_dataset2_f = dc.data.DiskDataset.from_numpy(fill_infs_nans(valid_dataset2.X), valid_dataset2.y, valid_dataset2.w, tasks=tox21_tasks2)
#No more NAN in training data
print(np.isnan(train_dataset2_f.X).any())
print(~np.isfinite(train_dataset2_f.X).any())

True
False
False
False


### Grid Search for Model #2: Multitask classifer via RDKit Descriptors

In [None]:
#took 40s
def model_builder(**model_params):
  dropout = model_params['dropout']
  layer_sizes = model_params['layer_sizes']
  learning_rate = model_params['learning_rate']
  model = dc.models.MultitaskClassifier(n_tasks=12, n_features=208, layer_sizes=layer_sizes, learning_rate=learning_rate, dropouts=dropout, random_state=2) 
  return model

params = {
    'dropout':[0, 0.2, 0.3, 0.5],
    'layer_sizes':[[500],[1000],[1000, 1000]],
    'learning_rate':[0.0005, 0.001, 0.005]
    }

optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_mcrd, best_hyperparams_mcrd, all_results_mcrd = optimizer.hyperparam_search(params, train_dataset2_f, test_dataset2_f, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_mcrd)
all_results_mcrd

Timestamp: 2023-03-23 15:57:12
{'dropout': 0.2, 'layer_sizes': [1000, 1000], 'learning_rate': 0.0005}


{'_dropout_0_layer_sizes[500]_learning_rate_0.000500': 0.6997384659569866,
 '_dropout_0_layer_sizes[500]_learning_rate_0.001000': 0.6932999895926075,
 '_dropout_0_layer_sizes[500]_learning_rate_0.005000': 0.6882564890518176,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.000500': 0.6877138438049598,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.001000': 0.6994893793263107,
 '_dropout_0_layer_sizes[1000]_learning_rate_0.005000': 0.6797793315545344,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.000500': 0.7042357516938753,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.001000': 0.7093458811796768,
 '_dropout_0_layer_sizes[1000, 1000]_learning_rate_0.005000': 0.6807536166890961,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.000500': 0.7093130421469976,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.001000': 0.7049693578767614,
 '_dropout_0.200000_layer_sizes[500]_learning_rate_0.005000': 0.6625341000376629,
 '_dropout_0.200000_layer_sizes[1000]_learning_rate_0.0

### Best MultitaskClassifier + RDD model: save model on G:drive

In [None]:
start_time = timeit.default_timer()

model_mcrd = dc.models.MultitaskClassifier(n_tasks=12, n_features=208, layer_sizes=[1000, 1000], learning_rate = 0.0005, dropouts=0.2, random_state=2, model_dir='./someDirectory/someFolder') 
model_mcrd.fit(train_dataset2_f, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this cell at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

metrics = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification"),dc.metrics.Metric(dc.metrics.balanced_accuracy_score, np.mean, mode="classification")]

train_scores_mcrd = model_mcrd.evaluate(train_dataset2_f, metrics, transformers2)
test_scores_mcrd = model_mcrd.evaluate(test_dataset2_f, metrics, transformers2)
valid_scores_mcrd = model_mcrd.evaluate(valid_dataset2_f, metrics, transformers2)

print("Multitask Classifier + RDKit Descriptors:")
print('Train auc: ', train_scores_mcrd)
print('Test auc: ', test_scores_mcrd)
print('Validation auc: ', valid_scores_mcrd)

Fit time for this cell at 50 epochs:  7.329311847999634
Timestamp: 2023-03-23 15:59:16
Multitask Classifier + RDKit Descriptors:
Train auc:  {'mean-roc_auc_score': 0.9567101431839092, 'mean-balanced_accuracy_score': 0.8985073352225194}
Test auc:  {'mean-roc_auc_score': 0.7176311177288298, 'mean-balanced_accuracy_score': 0.6479214345498941}
Validation auc:  {'mean-roc_auc_score': 0.7310291550379353, 'mean-balanced_accuracy_score': 0.6632821535341712}


## Model #3: GraphConv Model with ConvMolFeaturizer

References: 

https://deepchem.readthedocs.io/en/latest/get_started/examples.html
https://notebook.community/miaecle/deepchem/examples/notebooks/graph_convolutional_networks_for_tox21_on_colab

### Load, featurize and transform data: Break SMILES to ConvMol features

In [None]:
tox21_tasks3, tox21_datasets3, transformers3 = dc.molnet.load_tox21(featurizer=dc.feat.ConvMolFeaturizer())
train_dataset3, valid_dataset3, test_dataset3 = tox21_datasets3
train_dataset3



<DiskDataset X.shape: (6264,), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Grid Search for Model #3: GraphConv Model

In [None]:
#took 11 min
def model_builder(**model_params):
  dropout = model_params['dropout']
  batch_size = model_params['batch_size']
  learning_rate = model_params['learning_rate']
  model = dc.models.GraphConvModel(12, batch_size=batch_size, mode='classification', dropout=dropout, learning_rate = learning_rate, random_state=2)
  return model

params = {
    'dropout':[0,0.2,0.4],
    'batch_size':[50,100],
    'learning_rate':[0.0005, 0.001]
    }
optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_gc,best_hyperparams_gc, all_results_gc = optimizer.hyperparam_search(params, train_dataset3, test_dataset3, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_gc)
all_results_gc



Timestamp: 2023-03-31 12:43:13
{'dropout': 0, 'batch_size': 100, 'learning_rate': 0.001}


{'_batch_size_50_dropout_0_learning_rate_0.000500': 0.6945903711014921,
 '_batch_size_50_dropout_0_learning_rate_0.001000': 0.6790927998136542,
 '_batch_size_100_dropout_0_learning_rate_0.000500': 0.689944216057357,
 '_batch_size_100_dropout_0_learning_rate_0.001000': 0.6948445331093054,
 '_batch_size_50_dropout_0.200000_learning_rate_0.000500': 0.6608902301534528,
 '_batch_size_50_dropout_0.200000_learning_rate_0.001000': 0.6902826635842213,
 '_batch_size_100_dropout_0.200000_learning_rate_0.000500': 0.6883215288618537,
 '_batch_size_100_dropout_0.200000_learning_rate_0.001000': 0.6838534078060906,
 '_batch_size_50_dropout_0.400000_learning_rate_0.000500': 0.6715296263469268,
 '_batch_size_50_dropout_0.400000_learning_rate_0.001000': 0.6692836891998519,
 '_batch_size_100_dropout_0.400000_learning_rate_0.000500': 0.6641911681453104,
 '_batch_size_100_dropout_0.400000_learning_rate_0.001000': 0.6678657355269183}

### Best GraphConv Model: save model on G:drive

In [None]:
start_time = timeit.default_timer()

model_gc = dc.models.GraphConvModel(12, mode='classification', dropout=0, batch_size=50, number_atom_features = 75, learning_rate = 0.001, random_state=2,
                                    model_dir="./someDirectory/someFolder")
model_gc.fit(train_dataset3, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

metrics = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification"),dc.metrics.Metric(dc.metrics.balanced_accuracy_score, np.mean, mode="classification")]

train_scores_gc = model_gc.evaluate(train_dataset3, metrics, transformers3)
test_scores_gc = model_gc.evaluate(test_dataset3, metrics, transformers3)
valid_scores_gc = model_gc.evaluate(valid_dataset3, metrics, transformers3)

print("GraphConv Model")
print('Train auc: ', train_scores_gc)
print('Test auc: ', test_scores_gc)
print('Validation auc: ', valid_scores_gc)

Fit time for this fitting at 50 epochs:  149.998269273
Timestamp: 2023-03-23 16:34:47
GraphConv Model
Train auc:  {'mean-roc_auc_score': 0.9652034019131875, 'mean-balanced_accuracy_score': 0.9108551966954592}
Test auc:  {'mean-roc_auc_score': 0.7015209121994253, 'mean-balanced_accuracy_score': 0.6369161772460478}
Validation auc:  {'mean-roc_auc_score': 0.7309163160714919, 'mean-balanced_accuracy_score': 0.6461992766856727}


## Model #4: Graph Convolution Networks (GCN) model with MolGraphConvFeaturizer 


### Must install the two libraries below for GCN, GAT and ATP to work

In [None]:
!pip install  dgl -f https://data.dgl.ai/wheels/cu117/repo.html
!pip install dgllife

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/cu117/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu117/dgl-1.0.1%2Bcu117-cp39-cp39-manylinux1_x86_64.whl (266.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.0.1+cu117
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgllife
  Downloading dgllife-0.3.2-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgllife
Successfully installed dgllife-0.3.2


### Load, featurize and transform data: Break SMILES to MolGraphConv features

In [None]:
tox21_tasks4, tox21_datasets4, transformers4 = dc.molnet.load_tox21(featurizer=dc.feat.MolGraphConvFeaturizer())
train_dataset4, valid_dataset4, test_dataset4 = tox21_datasets4
train_dataset4

  return np.asarray(features)


<DiskDataset X.shape: (6252,), y.shape: (6252, 12), w.shape: (6252, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Grid Search for Model #4: GCN

In [None]:
#Took 9 min
def model_builder(**model_params):
  dropout = model_params['dropout']
  batch_size = model_params['batch_size']
  learning_rate = model_params['learning_rate']
  model = dc.models.GCNModel(len(tox21_tasks4), batch_size=batch_size, mode='classification', learning_rate=learning_rate, dropout=dropout, random_state=2)
  return model

params = {
    'dropout':[0,0.2,0.4],
    'batch_size':[50,100],
    'learning_rate':[0.0005,0.001]
    }

optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_gcn, best_hyperparams_gcn, all_results_gcn = optimizer.hyperparam_search(params, train_dataset4, test_dataset4, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_gcn)
all_results_gcn

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)
Timestamp: 2023-03-23 16:50:44
{'dropout': 0, 'batch_size': 50, 'learning_rate': 0.0005}


{'_batch_size_50_dropout_0_learning_rate_0.000500': 0.7233308807462834,
 '_batch_size_50_dropout_0_learning_rate_0.001000': 0.7042873395073977,
 '_batch_size_100_dropout_0_learning_rate_0.000500': 0.7061973078044973,
 '_batch_size_100_dropout_0_learning_rate_0.001000': 0.7157140032670458,
 '_batch_size_50_dropout_0.200000_learning_rate_0.000500': 0.7051157402532708,
 '_batch_size_50_dropout_0.200000_learning_rate_0.001000': 0.7165027600870749,
 '_batch_size_100_dropout_0.200000_learning_rate_0.000500': 0.7152434398508499,
 '_batch_size_100_dropout_0.200000_learning_rate_0.001000': 0.7186074615477588,
 '_batch_size_50_dropout_0.400000_learning_rate_0.000500': 0.6999171607991862,
 '_batch_size_50_dropout_0.400000_learning_rate_0.001000': 0.7191497254368827,
 '_batch_size_100_dropout_0.400000_learning_rate_0.000500': 0.7153264986430288,
 '_batch_size_100_dropout_0.400000_learning_rate_0.001000': 0.7005504675377826}

### Best GCN Model: save model on G:drive

#### We can write a function to score tuned model and print the scores

In [None]:
#Write function to score tuned model and print auc and balanced accuracy scores
def score_tox_model(best_model, train, test, valid, transformers):
  metrics = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification"),
             dc.metrics.Metric(dc.metrics.balanced_accuracy_score, np.mean, mode="classification")]
  train_scores = best_model.evaluate(train, metrics, transformers)
  test_scores = best_model.evaluate(test, metrics, transformers)
  valid_scores = best_model.evaluate(valid, metrics, transformers)

  print(best_model)
  print('Train auc: ', train_scores)
  print('Test auc: ', test_scores)
  print('Validation auc: ', valid_scores)

In [None]:
import timeit
start_time = timeit.default_timer()

model_gcn = dc.models.GCNModel(12, mode='classification', batch_size=50, dropout=0.0, learning_rate=0.0005, random_state=2, 
                               model_dir="./someDirectory/someFolder")
model_gcn.fit(train_dataset4, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

print("GCN Model")
score_tox_model(model_gcn, train_dataset4, test_dataset4, valid_dataset4, transformers4)

Fit time for this fitting at 50 epochs:  220.50128360099916
Timestamp: 2023-03-23 17:11:40
GCN Model
<deepchem.models.torch_models.gcn.GCNModel object at 0x7f054efa45b0>
Train auc:  {'mean-roc_auc_score': 0.9306676538018488, 'mean-balanced_accuracy_score': 0.8587542280628616}
Test auc:  {'mean-roc_auc_score': 0.713228243515204, 'mean-balanced_accuracy_score': 0.661032683957013}
Validation auc:  {'mean-roc_auc_score': 0.7547775297749807, 'mean-balanced_accuracy_score': 0.6857672018196799}


## Model #5: Graph Attention Networks (GAT) Model via MolGraphConv features

### Grid Search for Model #5: GAT

In [None]:
# Took 14 min
def model_builder(**model_params):
  dropout = model_params['dropout']
  batch_size = model_params['batch_size']
  learning_rate = model_params['learning_rate']
  model = dc.models.GATModel(12, batch_size=batch_size, mode='classification', dropout=dropout,learning_rate=learning_rate,random_state=2)
  return model

params = {
    'dropout':[0,0.2,0.4],
    'batch_size':[50,100],
    'learning_rate':[0.0005,0.001,0.005]
    }
optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_gat, best_hyperparams_gat, all_results_gat = optimizer.hyperparam_search(params, train_dataset4, test_dataset4, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_gat)
all_results_gat

Timestamp: 2023-03-23 17:29:39
{'dropout': 0, 'batch_size': 50, 'learning_rate': 0.005}


{'_batch_size_50_dropout_0_learning_rate_0.000500': 0.7107419870024286,
 '_batch_size_50_dropout_0_learning_rate_0.001000': 0.6804692779528773,
 '_batch_size_50_dropout_0_learning_rate_0.005000': 0.7167684702619966,
 '_batch_size_100_dropout_0_learning_rate_0.000500': 0.7065308800175396,
 '_batch_size_100_dropout_0_learning_rate_0.001000': 0.6933929052658355,
 '_batch_size_100_dropout_0_learning_rate_0.005000': 0.6997381982099814,
 '_batch_size_50_dropout_0.200000_learning_rate_0.000500': 0.679635092141817,
 '_batch_size_50_dropout_0.200000_learning_rate_0.001000': 0.7003141139655656,
 '_batch_size_50_dropout_0.200000_learning_rate_0.005000': 0.6874852610768244,
 '_batch_size_100_dropout_0.200000_learning_rate_0.000500': 0.6841818488148764,
 '_batch_size_100_dropout_0.200000_learning_rate_0.001000': 0.6917325597275488,
 '_batch_size_100_dropout_0.200000_learning_rate_0.005000': 0.7027826667406467,
 '_batch_size_50_dropout_0.400000_learning_rate_0.000500': 0.6806624469662097,
 '_batch_s

### Best GAT Model: save model on G:drive

In [None]:
start_time = timeit.default_timer()

model_gat = dc.models.GATModel(12, mode='classification', batch_size=50, learning_rate=0.005, dropout=0.0, random_state=2,
                               model_dir = "./someDirectory/someFolder")
model_gat.fit(train_dataset4, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

print("GAT Model: ")
score_tox_model(model_gat, train_dataset4, test_dataset4, valid_dataset4, transformers4)

Fit time for this fitting at 50 epochs:  282.15603828299936
Timestamp: 2023-03-23 17:42:48
GAT Model: 
<deepchem.models.torch_models.gat.GATModel object at 0x7f04e2272040>
Train auc:  {'mean-roc_auc_score': 0.878376491996364, 'mean-balanced_accuracy_score': 0.8042491565662258}
Test auc:  {'mean-roc_auc_score': 0.7152939212214829, 'mean-balanced_accuracy_score': 0.652446431314727}
Validation auc:  {'mean-roc_auc_score': 0.7553473159156949, 'mean-balanced_accuracy_score': 0.6840669322700438}


##Model #6: AttentiveFPModel

### Load, featurize and transform data: Break SMILES to MolGraphConv features WITH EDGES

In [None]:
#Use edges for featurizer, took 1min
tox21_tasks4b, tox21_datasets4b, transformers4b = dc.molnet.load_tox21(featurizer=dc.feat.MolGraphConvFeaturizer(use_edges=True))
train_dataset4b, valid_dataset4b, test_dataset4b = tox21_datasets4b
train_dataset4b

  return np.asarray(features)


<DiskDataset X.shape: (6249,), y.shape: (6249, 12), w.shape: (6249, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Grid Search for Model #6: AFP

In [None]:
#Took 23 min
def model_builder(**model_params):
  dropout = model_params['dropout']
  batch_size = model_params['batch_size']
  learning_rate = model_params['learning_rate']
  model = dc.models.AttentiveFPModel(12, batch_size=batch_size,mode='classification',dropout=dropout,learning_rate=learning_rate,random_state=2)
  return model

params = {
    'dropout':[0,0.2,0.4],
    'batch_size':[25,50,100],
    'learning_rate':[0.0005,0.001]
    }
optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_afp, best_hyperparams_afp, all_results_afp = optimizer.hyperparam_search(params, train_dataset4b, test_dataset4b, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_afp)
all_results_afp

Timestamp: 2023-03-23 18:06:57
{'dropout': 0.4, 'batch_size': 100, 'learning_rate': 0.0005}


{'_batch_size_25_dropout_0_learning_rate_0.000500': 0.7177609512562583,
 '_batch_size_25_dropout_0_learning_rate_0.001000': 0.7220399345910384,
 '_batch_size_50_dropout_0_learning_rate_0.000500': 0.7204011381644152,
 '_batch_size_50_dropout_0_learning_rate_0.001000': 0.725402344919396,
 '_batch_size_100_dropout_0_learning_rate_0.000500': 0.7282751662475663,
 '_batch_size_100_dropout_0_learning_rate_0.001000': 0.7265430474468193,
 '_batch_size_25_dropout_0.200000_learning_rate_0.000500': 0.7286629298175983,
 '_batch_size_25_dropout_0.200000_learning_rate_0.001000': 0.7171444043348928,
 '_batch_size_50_dropout_0.200000_learning_rate_0.000500': 0.7293089917738461,
 '_batch_size_50_dropout_0.200000_learning_rate_0.001000': 0.7039482860772509,
 '_batch_size_100_dropout_0.200000_learning_rate_0.000500': 0.728722369409045,
 '_batch_size_100_dropout_0.200000_learning_rate_0.001000': 0.723364995625765,
 '_batch_size_25_dropout_0.400000_learning_rate_0.000500': 0.7214136200330667,
 '_batch_size_

### Best AttentiveFP Model: saved on G:drive

In [None]:
start_time = timeit.default_timer()

model_afp = dc.models.AttentiveFPModel(n_tasks=12, mode='classification', dropout=0.4, batch_size=100, learning_rate=0.0005, random_state=2,
                                       model_dir = "./someDirectory/someFolder")
model_afp.fit(train_dataset4b, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

print("AFP Model: ")
score_tox_model(model_afp, train_dataset4b, test_dataset4b, valid_dataset4b, transformers4b)

Fit time for this fitting at 50 epochs:  268.19820892700045
Timestamp: 2023-03-23 18:14:09
AFP Model: 
<deepchem.models.torch_models.attentivefp.AttentiveFPModel object at 0x7f04e87ffac0>
Train auc:  {'mean-roc_auc_score': 0.9597742738378705, 'mean-balanced_accuracy_score': 0.9132702614288238}
Test auc:  {'mean-roc_auc_score': 0.7349533419945021, 'mean-balanced_accuracy_score': 0.6790515926855938}
Validation auc:  {'mean-roc_auc_score': 0.7416443872723001, 'mean-balanced_accuracy_score': 0.6925993286369022}


## Model #7: Weave Model 


### Load, featurize and transform data using WeaveFeaturizer

In [None]:
#Took 1 min
tox21_tasks5, tox21_datasets5, transformers5 = dc.molnet.load_tox21(featurizer=dc.feat.WeaveFeaturizer())
train_dataset5, valid_dataset5, test_dataset5 = tox21_datasets5
train_dataset5



<DiskDataset X.shape: (6264,), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

### Note: we have to set batch_normalize_kwargs={'trainable': False}, else grid search would return NaNs 

In [None]:
#29 min
def model_builder(**model_params):
  dropout = model_params['dropout']
  batch_size = model_params['batch_size']
  learning_rate = model_params['learning_rate']
  model = dc.models.WeaveModel(12, mode='classification',
                               batch_normalize_kwargs={'trainable': False}, batch_size=batch_size,
                               dropout=dropout,learning_rate=learning_rate,random_state=2)
  return model

params = {
    'dropout':[0,0.2],
    'batch_size':[50,100],
    'learning_rate':[0.0005,0.001]
    }
optimizer = dc.hyper.GridHyperparamOpt(model_builder)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification")

best_model_weave, best_hyperparams_weave, all_results_weave = optimizer.hyperparam_search(params, train_dataset5, test_dataset5, metric)

print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))
print(best_hyperparams_weave)
all_results_weave



Timestamp: 2023-03-25 15:44:39
{'dropout': 0.2, 'batch_size': 50, 'learning_rate': 0.0005}


{'_batch_size_50_dropout_0_learning_rate_0.000500': 0.5,
 '_batch_size_50_dropout_0_learning_rate_0.001000': 0.5,
 '_batch_size_100_dropout_0_learning_rate_0.000500': 0.5313844642547073,
 '_batch_size_100_dropout_0_learning_rate_0.001000': 0.5,
 '_batch_size_50_dropout_0.200000_learning_rate_0.000500': 0.542247644387318,
 '_batch_size_50_dropout_0.200000_learning_rate_0.001000': 0.5,
 '_batch_size_100_dropout_0.200000_learning_rate_0.000500': 0.5,
 '_batch_size_100_dropout_0.200000_learning_rate_0.001000': 0.5}

In [None]:
#18 min
import timeit
start_time = timeit.default_timer()

model_weave = dc.models.WeaveModel(n_tasks=12, mode='classification', dropout=0.2, learning_rate=0.0005, 
                                   batch_size = 50, batch_normalize_kwargs={'trainable': False}, 
                                   random_state=2, model_dir = "./someDirectory/someFolder")
model_weave.fit(train_dataset5, nb_epoch=50)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting at 50 epochs: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))



Fit time for this fitting at 50 epochs:  1086.8763577020004
Timestamp: 2023-03-25 16:06:57


In [None]:
score_tox_model(model_weave, train_dataset5, test_dataset5, valid_dataset5, transformers5)

<deepchem.models.graph_models.WeaveModel object at 0x7f3b3e9b1370>
Train auc:  {'mean-roc_auc_score': 0.5, 'mean-balanced_accuracy_score': 0.5}
Test auc:  {'mean-roc_auc_score': 0.5, 'mean-balanced_accuracy_score': 0.5}
Validation auc:  {'mean-roc_auc_score': 0.5, 'mean-balanced_accuracy_score': 0.5}


## Train a Dummy Classifer and 2 Machine Learning models from Sci-kit Learn for Comparison
## Using RDKit descriptors as features for training

### Import libraries

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

### Define scoring function for sklearn models

In [None]:
def score_sklearn_model(best_model, X_train, y_train, X_test, y_test, X_valid, y_valid):
  y_train_pred = best_model.predict(X_train)
  y_test_pred = best_model.predict(X_test)
  y_valid_pred = best_model.predict(X_valid)

  train_auc_score = roc_auc_score(y_train, y_train_pred)
  train_balanced_accuracy_score= accuracy_score(y_train, y_train_pred)

  test_auc_score = roc_auc_score(y_test, y_test_pred)
  test_balanced_accuracy_score= accuracy_score(y_test, y_test_pred)

  valid_auc_score = roc_auc_score(y_valid, y_valid_pred)
  valid_balanced_accuracy_score= accuracy_score(y_valid, y_valid_pred)

  print(best_model)
  print("Train AUC score: ", train_auc_score)
  print("Train accuracy score: ", train_balanced_accuracy_score)
  print("Test AUC score: ", test_auc_score)
  print("Test accuracy score: ", test_balanced_accuracy_score)
  print("Valid AUC score: ", valid_auc_score)
  print("Valid accuracy score: ", valid_balanced_accuracy_score)

## Train a Dummy Classifier

In [None]:
#convert train_dataset2.X back to numpy array
X_train_rdd=pd.DataFrame(train_dataset2_f.X).to_numpy()
X_test_rdd=pd.DataFrame(test_dataset2_f.X).to_numpy()
X_valid_rdd=pd.DataFrame(valid_dataset2_f.X).to_numpy()

y_train_rdd = train_dataset2_f.y
y_test_rdd = test_dataset2_f.y
y_valid_rdd = valid_dataset2_f.y

### Grid Search for Dummy Classifier

In [None]:
grid_values={'strategy':['most_frequent', 'prior', 'stratified', 'uniform', 'constant']}

dummy_clf = DummyClassifier(random_state=2,constant=1)
dummy_grid = GridSearchCV(dummy_clf, param_grid = grid_values, cv=3, scoring='roc_auc', n_jobs=-1)
dummy_grid.fit(X_train_rdd, y_train_rdd)

print(dummy_grid.best_score_)
print(dummy_grid.best_estimator_)
print(dummy_grid.best_params_)

dummy_params =dummy_grid.cv_results_['params'] 
dummy_mean_scores =dummy_grid.cv_results_['mean_test_score']

df_grid_dummy=pd.DataFrame(
    {'params': dummy_params,
     'roc_auc':dummy_mean_scores
    })
df_grid_dummy

0.5041999235219068
DummyClassifier(constant=1, random_state=2, strategy='stratified')
{'strategy': 'stratified'}


3 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/dummy.py", line 203, in fit
    raise ValueError(
ValueError: Constant target value should have shape (12, 1).



Unnamed: 0,params,roc_auc
0,{'strategy': 'most_frequent'},0.5
1,{'strategy': 'prior'},0.5
2,{'strategy': 'stratified'},0.5042
3,{'strategy': 'uniform'},0.5
4,{'strategy': 'constant'},


### Best Dummy Classifier

In [None]:
start_time = timeit.default_timer()

dummy_bestmodel = DummyClassifier(constant=1, random_state=2, strategy='stratified')
dummy_bestmodel_fitted = dummy_bestmodel.fit(X_train_rdd, y_train_rdd)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

score_sklearn_model(dummy_bestmodel_fitted, X_train_rdd, y_train_rdd, X_test_rdd, y_test_rdd, X_valid_rdd, y_valid_rdd)

Fit time for this fitting:  0.005295974002365256
Timestamp: 2023-03-23 20:39:51
DummyClassifier(constant=1, random_state=2, strategy='stratified')
Train AUC score:  0.4983335844435275
Train accuracy score:  0.3173690932311622
Test AUC score:  0.49936291438076025
Test accuracy score:  0.2869897959183674
Valid AUC score:  0.5025353893113206
Valid accuracy score:  0.29246487867177523


## Try a Random Forest Classifier

### Grid Search for Random Forest

In [None]:
grid_values={'n_estimators': [100, 200, 300],
             'max_features': ['sqrt','log2', None],
             'max_depth':[10, 50, 100]}

rfc= RandomForestClassifier(random_state=2)
rfc_grid = GridSearchCV(rfc, param_grid = grid_values, cv=3, scoring='roc_auc', n_jobs=-1)
rfc_grid.fit(X_train_rdd, y_train_rdd)

In [None]:
print(rfc_grid.best_score_)
print(rfc_grid.best_estimator_)
print(rfc_grid.best_params_)

rfc_params =rfc_grid.cv_results_['params'] 
rfc_mean_scores =rfc_grid.cv_results_['mean_test_score']

df_grid_rfc=pd.DataFrame(
    {'params': rfc_params,
     'roc_auc': rfc_mean_scores
    })
df_grid_rfc

0.7565303213315273
RandomForestClassifier(max_depth=10, n_estimators=300, random_state=2)
{'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 300}


Unnamed: 0,params,roc_auc
0,"{'max_depth': 10, 'max_features': 'sqrt', 'n_e...",0.750804
1,"{'max_depth': 10, 'max_features': 'sqrt', 'n_e...",0.754635
2,"{'max_depth': 10, 'max_features': 'sqrt', 'n_e...",0.75653
3,"{'max_depth': 10, 'max_features': 'log2', 'n_e...",0.741837
4,"{'max_depth': 10, 'max_features': 'log2', 'n_e...",0.749395
5,"{'max_depth': 10, 'max_features': 'log2', 'n_e...",0.753006
6,"{'max_depth': 10, 'max_features': None, 'n_est...",0.726987
7,"{'max_depth': 10, 'max_features': None, 'n_est...",0.732805
8,"{'max_depth': 10, 'max_features': None, 'n_est...",0.733958
9,"{'max_depth': 50, 'max_features': 'sqrt', 'n_e...",0.737478


### Best Random Forest model

In [None]:
start_time = timeit.default_timer()

rfc= RandomForestClassifier(max_depth=10, n_estimators=300, max_features = 'sqrt', random_state=2)
rfc_bestmodel_fitted = rfc.fit(X_train_rdd, y_train_rdd)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

score_sklearn_model(rfc_bestmodel_fitted, X_train_rdd, y_train_rdd, X_test_rdd, y_test_rdd, X_valid_rdd, y_valid_rdd)

Fit time for this fitting:  12.03827597399868
Timestamp: 2023-03-23 20:40:52
RandomForestClassifier(max_depth=10, n_estimators=300, random_state=2)
Train AUC score:  0.6525616580422157
Train accuracy score:  0.7008301404853129
Test AUC score:  0.5344320534853019
Test accuracy score:  0.5637755102040817
Valid AUC score:  0.542362105803081
Valid accuracy score:  0.5696040868454662


## Try K Nearest Neighbor (KNN)

### Grid Search for KNN

In [None]:
grid_params = {'n_neighbors':[3, 5, 7],
               'weights':['uniform', 'distance'],
               'metric':['euclidean','manhattan']}

knn_grid = GridSearchCV(KNeighborsClassifier(), grid_params, verbose=1, cv=3, scoring='roc_auc')
knn_grid.fit(X_train_rdd, y_train_rdd)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
print(knn_grid.best_score_)
print(knn_grid.best_estimator_)
print(knn_grid.best_params_)

knn_params =knn_grid.cv_results_['params'] 
knn_mean_scores =knn_grid.cv_results_['mean_test_score']

df_grid_knn=pd.DataFrame(
    {'params': knn_params,
     'roc_auc': knn_mean_scores
    })
df_grid_knn

0.6551528942064407
KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')
{'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}


Unnamed: 0,params,roc_auc
0,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.606032
1,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.607623
2,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.62218
3,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.624383
4,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.633738
5,"{'metric': 'euclidean', 'n_neighbors': 7, 'wei...",0.63642
6,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",0.620686
7,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",0.621612
8,"{'metric': 'manhattan', 'n_neighbors': 5, 'wei...",0.639259
9,"{'metric': 'manhattan', 'n_neighbors': 5, 'wei...",0.640857


### Best KNN model

In [None]:
start_time = timeit.default_timer()

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')
knn_bestmodel_fitted = knn.fit(X_train_rdd, y_train_rdd)

elapsed = timeit.default_timer() - start_time
print("Fit time for this fitting: ", elapsed)
print("Timestamp:", date_now, datetime.now(pytz.timezone('America/Vancouver')).strftime("%H:%M:%S"))

score_sklearn_model(knn_bestmodel_fitted, X_train_rdd, y_train_rdd, X_test_rdd, y_test_rdd, X_valid_rdd, y_valid_rdd)

Fit time for this fitting:  0.014612596998631489
Timestamp: 2023-03-23 21:10:38
KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')
Train AUC score:  0.9946913340280212
Train accuracy score:  0.9947318007662835
Test AUC score:  0.5566939941599195
Test accuracy score:  0.5318877551020408
Valid AUC score:  0.5735519124306631
Valid accuracy score:  0.5363984674329502


# End of Notebook