In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, text
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Load Data

In [3]:
# Connect to AWS database
database_url = f'postgresql://postgres:purifai2022@purifai.ceoinb9nwfxg.us-west-1.rds.amazonaws.com/postgres'
engine = sqlalchemy.create_engine(database_url)
connection = engine.connect()

In [4]:
# Establish connection with engine object
with engine.connect() as conn:
    spe_analysis = conn.execute("SELECT * FROM outcomes INNER JOIN structures ON outcomes.structure_id = structures.structure_id WHERE spe_successful = 'true';")

In [5]:
# Set columns names and data contents
columns = [x for x in spe_analysis.keys()]
data = [x for x in spe_analysis]

# Create DF
spe_analysis_df = pd.DataFrame(data, columns = columns)
spe_analysis_df = spe_analysis_df.loc[:,~spe_analysis_df.columns.duplicated()].copy()
print(spe_analysis_df.shape)
spe_analysis_df.head()

(1080, 65)


Unnamed: 0,sample_id,structure_id,preferred_lcms_method,spe_method,method,spe_successful,crashed_out,sample_status,sample_current_status,termination_cause,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,00YLL22-042-002,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,00YLL22-042-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,00YLL22-042-004,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,00YLL22-042-005,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,00YLL22-042-008,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,2,0,1,1,5


In [6]:
# Remove columns not used for ML model
df = spe_analysis_df.drop(columns = ["sample_id", 
                               "preferred_lcms_method",
                               "method",
                               "spe_successful",
                               "crashed_out",
                               "sample_status",
                               "sample_current_status",
                               "termination_cause",
                               "termination_step",
                               "termination_details",
                               "reaction_scale",
                               "selected_fractions",
                               "volume_collected",
                               "total_fractions_collected",
                               "recovered_sample_dry_mass",
                               "percent_yield",
                               "percent_purity",
                               "purification_comments"])

df.head()

Unnamed: 0,structure_id,spe_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,MCX,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,MCX,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,MCX,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,MCX,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,MCX,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


In [7]:
# Check for duplicates
df.duplicated().sum()

22

In [8]:
# Drop duplicates
df = df.drop_duplicates()
print(df.shape)
df.head()

(1058, 47)


Unnamed: 0,structure_id,spe_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,MCX,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,MCX,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,MCX,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,MCX,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,MCX,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


# Define Features and Target and Split and Scale Data

In [9]:
# Create features
X = df.drop(columns = ["spe_method", "structure_id"])

# Create target
y = df["spe_method"]

In [10]:
# Check balance of target values
y.value_counts()

MCX    873
HLB    185
Name: spe_method, dtype: int64

In [11]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [12]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Best Parameter Search

### Balanced Random Forest Classifier

In [16]:
# Parameter grid for Balanced Random Forest
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ["auto", int, float, None]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {"n_estimators": n_estimators,
               "max_features": max_features,
               "max_depth": max_depth,
               "min_samples_split": min_samples_split,
               "min_samples_leaf": min_samples_leaf,
               "bootstrap": bootstrap}
dict(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'max_features': ['auto', int, float, None],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [17]:
# Use random grid to search for best hyperparameters
# Create base model to tune
brf = BalancedRandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, search across 100 different combinations, and use all available cores
random_search = RandomizedSearchCV(estimator = brf, 
                                param_distributions = random_grid, 
                                n_iter = 100, 
                                scoring = "balanced_accuracy",
                                cv = 3, 
                                verbose = 3, 
                                n_jobs = -1,
                                random_state = 1)

# Fit random search
random_search.fit(X_train_scaled, y_train)
print(random_search.best_params_)
print(random_search.best_estimator_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
BalancedRandomForestClassifier(bootstrap=False, max_depth=100,
                               min_samples_split=5, n_estimators=400)


In [13]:
# Create function to evaluate model performance
def evaluate(model, X_test_scaled, y_test):
    y_pred = model.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["Actual HLB", "Actual MCX"], columns = ["Predicted HLB", "Predicted MCX"])
    ic_report = classification_report_imbalanced(y_test, y_pred)

    # Display model performance metrics
    print(f"Balanced Accuracy Score: {ba_score}\n\n")
    print(f"Confusion Matrix:")
    display(cm_df)
    print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")
    return ba_score

In [14]:
# Performance of basic model
base_model = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
base_model.fit(X_train_scaled, y_train)
base_performance = evaluate(base_model, X_test_scaled, y_test)

Balanced Accuracy Score: 0.9009433962264151


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,48,5
Actual MCX,22,190




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.69      0.91      0.90      0.78      0.90      0.81        53
        MCX       0.97      0.90      0.91      0.93      0.90      0.81       212

avg / total       0.92      0.90      0.90      0.90      0.90      0.81       265



In [15]:
# Performance of model with best random parameters - previous best random parameter result that performed better than most recent result
best_random = BalancedRandomForestClassifier(n_estimators = 100,
                                             min_samples_split = 2,
                                             min_samples_leaf = 4,
                                             max_features = "auto",
                                             max_depth = 40,
                                             bootstrap = False,
                                             random_state = 1)
best_random.fit(X_train_scaled, y_train)
random_performance = evaluate(best_random, X_test_scaled, y_test)

Balanced Accuracy Score: 0.9080188679245282


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,49,4
Actual MCX,23,189




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.92      0.89      0.78      0.91      0.83        53
        MCX       0.98      0.89      0.92      0.93      0.91      0.82       212

avg / total       0.92      0.90      0.92      0.90      0.91      0.82       265



In [16]:
# Balanced accuracy score improvement with best random parameters
random_ba_improvement = "{:0.2f}%".format( 100 * (random_performance - base_performance) / base_performance)
print(f"Improvement of model with best random parameters: {random_ba_improvement}")

Improvement of model with best random parameters: 0.79%


In [22]:
# Create parameter grid based on results of random search
param_grid = {"bootstrap": [False],
              "max_depth": [10, 20, 30, 40, 50, 60, 70],
              "max_features": ["auto"],
              "min_samples_leaf": [3, 4, 5, 6],
              "min_samples_split": [1, 2, 3, 4],
              "n_estimators": [50, 100, 150, 200, 250, 300]}

# Create base model to tune
brf = BalancedRandomForestClassifier()

# Instantiate grid search model
grid_search = GridSearchCV(estimator = brf, 
                           param_grid = param_grid, 
                           cv = 3,
                           scoring = "balanced_accuracy",
                           n_jobs = -1,
                           verbose = 3)

# Fit grid search
grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 672 candidates, totalling 2016 fits
[CV 3/3] END bootstrap=False, max_depth=70, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=900;, score=0.825 total time=   2.7s
[CV 3/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1000;, score=0.818 total time=   1.7s
[CV 2/3] END bootstrap=True, max_depth=110, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=700;, score=0.848 total time=   1.2s
[CV 3/3] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1000;, score=0.832 total time=   1.6s
[CV 2/3] END bootstrap=False, max_depth=50, max_features=<class 'float'>, min_samples_leaf=4, min_samples_split=2, n_estimators=1000;, score=nan total time=   0.3s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=<class 'float'>, min_samples_leaf=1, min_samples_split=10, n_estimators=400;, score=nan total

[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s[CV 1/3] END bootstrap=False, max_depth=70, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=900;, score=0.857 total time=   2.7s
[CV 1/3] END bootstrap=True, max_depth=70, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=0.880 total time=   1.3s
[CV 3/3] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=0.834 total time=   1.5s
[CV 2/3] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1000;, score=0.843 total time=   1.6s
[CV 1/3] END bootstrap=False, max_depth=50, max_features=<class 'float'>, min_samples_leaf=4, min_samples_split=2, n_estimators=1000;, score=nan total time=   0.3s
[CV 3/3] END bootstrap=False, max_depth=50, max_features=<class 'f

[CV 3/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.839 total time=   0.2s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.845 total time=   0.3s
[CV 1/3] END bootstrap=True, max_depth=60, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100;, score=0.868 total time=   0.2s
[CV 3/3] END bootstrap=True, max_depth=60, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100;, score=0.820 total time=   0.2s
[CV 2/3] END bootstrap=True, max_depth=110, max_features=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200;, score=0.836 total time=   0.5s
[CV 2/3] END bootstrap=True, max_depth=90, max_features=<class 'int'>, min_samples_leaf=2, min_samples_split=5, n_estimators=200;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=True, max_depth=100, max_features=auto, min_sampl

[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=250;, score=0.848 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=300;, score=0.866 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=50;, score=0.832 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=100;, score=0.830 total time=   0.2s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=250;, score=0.846 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=250;, score=0.843 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_le

[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=2, n_estimators=50;, score=0.887 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=200;, score=nan total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=300;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=300;, score=nan total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=250;, score=0.868 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=250;, score=0.845 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, 

[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=4, min_samples_split=3, n_estimators=150;, score=0.830 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=200;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.839 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100;, score=0.859 total time=   0.2s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=50;, score=0.873 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=50;, score=0.839 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=3

[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=50;, score=0.816 total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=1, n_estimators=150;, score=nan total time=   0.0s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=1, n_estimators=150;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=1, n_estimators=250;, score=nan total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=5, min_s

[CV 1/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300;, score=0.871 total time=   0.5s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=200;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=200;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=2, n_estimators=50;, score=0.816 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, mi

[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=250;, score=0.832 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300;, score=0.864 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=150;, score=nan total time=   0.0s
[CV 2/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=150;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=3, mi

[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=200;, score=0.825 total time=   0.3s
[CV 3/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=250;, score=0.830 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=300;, score=0.857 total time=   0.5s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=50;, score=0.839 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=50;, score=0.836 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=4, n_estimators=150;, score=0.862 total time=   0.2s
[CV 2/3] END bootstrap=False, max_depth=10, max_features=auto, min_samples_lea

[CV 2/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300;, score=0.841 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=300;, score=0.830 total time=   0.5s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=250;, score=nan total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=1, n_estimators=250;, score=nan total time=   0.1s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=50;, score=0.845 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=2, n_estimators=100;, score=0.862 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3

[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=150;, score=0.852 total time=   0.2s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=150;, score=0.843 total time=   0.2s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_

[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.845 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.836 total time=   0.2s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=50;, score=0.850 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=50;, score=0.843 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=100;, score=0.839 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=100;, score=0.834 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_lea

[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=50;, score=0.830 total time=   0.1s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=100;, score=0.839 total time=   0.2s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=200;, score=0.841 total time=   0.3s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=200;, score=0.834 total time=   0.3s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=250;, score=0.846 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=250;, score=0.843 total time=   0.4s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_le

[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=50;, score=0.848 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=50;, score=0.836 total time=   0.1s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.845 total time=   0.3s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.845 total time=   0.3s
[CV 1/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=250;, score=0.853 total time=   0.4s
[CV 2/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=250;, score=0.843 total time=   0.4s
[CV 3/3] END bootstrap=False, max_depth=30, max_features=auto, min_samples_lea

[CV 2/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=100;, score=0.845 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=100;, score=0.836 total time=   0.2s
[CV 3/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=250;, score=0.834 total time=   0.4s
[CV 1/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=5, min_samples_split=3, n_estimators=300;, score=0.852 total time=   0.5s
[CV 3/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=50;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=6, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 1/3] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=6

[CV 1/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=200;, score=0.866 total time=   0.3s{'bootstrap': False, 'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
BalancedRandomForestClassifier(bootstrap=False, max_depth=70,
                               min_samples_leaf=4, n_estimators=50)


In [19]:
# Performance of model with best grid parameters
best_grid = BalancedRandomForestClassifier(n_estimators = 50,
                                             min_samples_split = 2,
                                             min_samples_leaf = 4,
                                             max_features = "auto",
                                             max_depth = 70,
                                             bootstrap = False,
                                             random_state = 1)
best_grid.fit(X_train_scaled, y_train)
grid_performance = evaluate(best_grid, X_test_scaled, y_test)

Balanced Accuracy Score: 0.8985849056603774


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,49,4
Actual MCX,27,185




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.64      0.92      0.87      0.76      0.90      0.81        53
        MCX       0.98      0.87      0.92      0.92      0.90      0.80       212

avg / total       0.91      0.88      0.91      0.89      0.90      0.80       265



In [20]:
# Balanced accuracy score improvement with best grid parameters
grid_ba_improvement = "{:0.2f}%".format( 100 * (grid_performance - base_performance) / base_performance)
print(f"Improvement of model with best grid parameters: {grid_ba_improvement}")

Improvement of model with best grid parameters: -0.26%


In [21]:
# Create DF comparing performance of base model to performance after random/grid search
brf_compare = {"Name": "Balanced Random Forest",
               "Base model Balanced Accuracy": base_performance,
               "Grid model Balanced Accuracy": random_performance,
               "Improvement": random_ba_improvement}
df_brf = pd.DataFrame(brf_compare, index = [0])
df_brf

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Balanced Random Forest,0.900943,0.908019,0.79%


In [22]:
# Save DF as CSV
df_brf.to_csv("df_brf.csv", index = False)

In [23]:
# Save scaler
import joblib
scaler_filename = "spe_scaler.sav"
joblib.dump(X_scaler, "spe_scaler.sav")

['spe_scaler.sav']

In [24]:
# Save model
filename = "spe_brf_model.sav"
joblib.dump(best_random, "spe_brf_model.sav")

['spe_brf_model.sav']

In [25]:
# Test successful saving
loaded_model = joblib.load("spe_brf_model.sav")
scaler = joblib.load("spe_scaler.sav")

X_test_scaled = scaler.transform(X_test)
predict = loaded_model.predict(X_test_scaled)
ba_score_loaded = balanced_accuracy_score(y_test, predict)
print(ba_score_loaded)

0.9080188679245282


In [26]:
# Determine feature importance for model using best parameters
# List features sorted in descending order by feature importance
importances = best_random.feature_importances_
feature_importance_df = pd.DataFrame(sorted(zip(importances, X.columns), reverse = True), columns = ["Importance", "Feature"])
feature_importance_df.head()

Unnamed: 0,Importance,Feature
0,0.089847,SMR_VSA10
1,0.075715,FractionCSP3
2,0.07346,TPSA
3,0.04973,MolMR
4,0.039251,NumValenceElectrons


In [27]:
# Save feature importance as CSV
feature_importance_df.to_csv("spe_brf_feature_importance.csv", index = False)