# FM Dataset

The FM dataset is whole animal toxicity endpoint, and represents the acute toxicity testing results against the fathead minnow


reference: https://setac.onlinelibrary.wiley.com/doi/full/10.1002/etc.5620190225


endpoint values are toxicity to Fat Minnow, -log10 of Conc.(umol/L)

## Generate rdkit continuous descriptors, splitting dataset, and descriptor preprocessing

In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
import os
currentDirectory = os.getcwd()
d = os.path.join(currentDirectory, "Datasets","FM_0801.csv")
dataset = pd.read_csv(d, index_col = 0)

molecules = [Chem.MolFromSmiles(mol) for mol in dataset.SMILES]

calculator = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors.descList])
X = pd.DataFrame([list(calculator.CalcDescriptors(mol)) for mol in molecules],
                     index=dataset.index,
                     columns=list(calculator.GetDescriptorNames()))

train_set_X, test_set_X = train_test_split(X, test_size=0.2, random_state=42)
train_set_y = dataset.loc[train_set_X.index]['endpoint'].values
test_set_y = dataset.loc[test_set_X.index]['endpoint'].values

In [2]:
from sklearn import pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
pipeline = pipeline.Pipeline([
        ('scaling', MinMaxScaler()),
        ('std_scaler', StandardScaler()),
    ])
train_X_prepared = pipeline.fit_transform(train_set_X)
test_X_prepared = pipeline.transform(test_set_X)

## Random forest

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2],
    'n_estimators': [5, 100, 150, 200, 250, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [5]:
# Fit the grid search to the data
grid_search.fit(train_X_prepared, train_set_y)
grid_search.best_params_

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed:  8.5min finished


{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 250}

In [9]:
from sklearn.metrics import r2_score
import numpy as np
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    MAE = np.mean(errors)
    r2 = r2_score(test_labels, predictions)
    print('Model Performance')
    print('MAE: {:0.2f}.'.format(MAE))
    print('r2 = {:0.2f}.'.format(r2))
    
    return MAE

In [10]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_X_prepared, test_set_y)

Model Performance
MAE: 0.67.
r2 = 0.61.


In [11]:
best_grid = grid_search.best_estimator_
from sklearn.externals import joblib
joblib.dump(best_grid, "FM_rf_model_0806.pkl")
#my_model_loaded = joblib.load("my_model.pkl") 



['FM_rf_model_0806.pkl']

# SVM 

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# Create the parameter grid based on the results of random search 
param_grid = {
    'kernel': ['rbf'],
    'gamma': [1e-2, 1e-3],
    'C': [1,10]}
# Create a based model
svm = SVR()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svm, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [13]:
# Fit the grid search to the data
grid_search.fit(train_X_prepared, train_set_y)
grid_search.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:    1.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.2s finished


{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

In [14]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_X_prepared, test_set_y)

Model Performance
MAE: 0.70.
r2 = 0.51.


In [15]:
best_grid = grid_search.best_estimator_
from sklearn.externals import joblib
joblib.dump(best_grid, "FM_svm_model_0806.pkl")

['FM_svm_model_0806.pkl']