In [1]:
import os
from glob import glob
import json
from pprint import pprint
import random
import pickle
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

### Define any variables that need to persist throughout

Currently rerunning from the requirements cell below when changing datasets... should update this so you can get all available for later cells

In [2]:
model_data = {
    'mmvi_binary' : {
        'dataframes' : {},
        'modeling' : {},
        'training' : {}
    },
    'knn_binary' : {
        'dataframes' : {},
        'modeling' : {},
        'training' : {}
    },
    # Uncomment below if you want multilabel
    'mmvi_multilabel' : {
        'dataframes' : {},
        'modeling' : {},
        'training' : {}
    },
    'knn_multilabel' : {
        'dataframes' : {},
        'modeling' : {},
        'training' : {}
    }
}

rep_data = model_data.copy()

### Define Some Requirements for the Fitting

Here, we define the version (date, in YYYYMMDD format) of the dataset we want to use for modeling, the imputation method for missing values ('mmvi' or 'knn'), and whether we are building a 'binary' or 'multilabel' model 

In [3]:
current_date = datetime.today().strftime('%Y%m%d')
data_date = '20230828'
smote = True
predict_replication = True

if predict_replication:
    
    # Hold out specific data points to test predictions on (in this case, the replication experiments)
    data_to_hold_out = [
        {
            'doi' : '10.1007/s10854-013-1374-0',
            'recipe_ids' : [3, 4, 5, 6]
        },
        {
            'doi' : '10.3390/ma12091444',
            'recipe_ids' : [258, 259, 260]
        }
    ]
    
    run_dir = 'for_replication_prediction'
    
else:
    data_to_hold_out = []
    run_dir = 'general'

### Load Data

Switch out "mmvi" for "knn" for different imputation methods. Note the date of the file that you are importing.

In [4]:
def create_df_modeling_fields(c, i, p):
    
    # Get dataframe and fill NA values
    if c == 'master':
        df = pd.read_csv(f'./data/bfo_df_modeling_{i}_{data_date}.csv')
    elif c == 'replication':
        df = pd.read_csv(f'./data/bfo_test_for_prediction_df_modeling_{i}_{data_date}.csv')
    else:
        df = pd.read_csv(f'./data/bfo_{c}_df_modeling_{i}_{data_date}.csv')
    
    df = df.fillna(0)
    
    # Define features dataframe
    features_df = df.drop(
        [
            'Unnamed: 0', 
            'recipe_id', 
            'impurity_code', 
            'fe_rich_indicator', 
            'bi_rich_indicator'
        ], 
        axis=1
    )
    
    features = features_df.to_numpy()
    
    # Define features list
    features_list = list(features_df.columns)
    
    # Define labels
    if p == 'binary':
        labels = df['impurity_code']

    elif pred_output == 'multilabel':
        
        g = df[['fe_rich_indicator', 'bi_rich_indicator']]
        columns = g.columns
        labels = g.astype(int).dot(columns).replace({
            "" : 0,
            "fe_rich_indicator" : 1,
            "bi_rich_indicator" : 2,
            "fe_rich_indicatorbi_rich_indicator" : 3
        }).to_numpy()
    
    return (
        df, 
        {
            'features' : features,
            'features_list' : features_list,
            'labels' : labels
        }
    )

In [5]:
for imputation_method, pred_output in [(key.split('_')[0], key.split('_')[1]) for key in model_data.keys()]:
    for coverage in ['lit']: #, 'suggested']:
        df, modeling_fields = create_df_modeling_fields(coverage, imputation_method, pred_output)
        model_data[f'{imputation_method}_{pred_output}']['dataframes'][coverage] = df
        model_data[f'{imputation_method}_{pred_output}']['modeling'][coverage] = modeling_fields

### Implement Synthesis Minority Oversampling Technique (SMOTE)

This is only applicable for binary or multiclass prediction, but not with multilabel prediction

In [6]:
def get_smote_features(Xm, Y):
    over_sample = SMOTE(random_state=512)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=512)
    columns = X_train.columns

    return over_sample.fit_resample(X_train, y_train)

    print('length of oversampled data: ', len(os_data_X))
    print('number of pure syntheses in oversampled data: ', len(os_data_y[os_data_y==0]))
    print('number of impure syntheses in oversampled data: ', len(os_data_y[os_data_y==1]))

### Define an Evaluation Function

This will output accuracy, precision, recall, and F1 score for a given model's ability to predict on the test set

In [7]:
def evaluate(model, test_features, test_labels, pred_output):
    test_labels = np.array(test_labels) + 1
    predictions = model.predict(test_features) + 1
    
    if pred_output == 'multilabel':
        test_labels[test_labels > 2] = 2
        predictions[predictions > 2] = 2
    
    errors = abs(predictions - (test_labels))
    
    report = classification_report(test_labels, predictions, output_dict=True)
    
    return report

### Define Simple Prediction Function
Get predictions for a batch of specific data points (in this case the replication attempts) 

In [8]:
def predict(model, test_features, test_labels, pred_output):
    test_labels = test_labels + 1
    predictions = model.predict(test_features) + 1
    
    if pred_output == 'multilabel':
        test_labels[test_labels > 2] = 2
        predictions[predictions > 2] = 2
    
    errors = abs(predictions - (test_labels))
    
    report = classification_report(test_labels, predictions, output_dict=True)
    
    return report

### Run trainings and get evaluation

Here, we are running 5 folds with 10 total repeats, getting 50 total models with 10 final evaluation metrics to average and take standard deviation

First, set up models and CV frameworks

In [9]:
### Include an option to track the metrics with increasing training set size. 


## Check out https://stackoverflow.com/questions/42228735/scikit-learn-gridsearchcv-with-multiple-repetitions
## for this code and discussion

base_tree = DecisionTreeClassifier()

param_dict = {
    "criterion" : ['gini', 'entropy'],
    "splitter" : ['best', 'random'],
    "max_depth" : range(4,7),
    "min_samples_split" : range(2,4),
    "min_samples_leaf" : range(1,3),
    "random_state" : [2**r for r in range(7, 13)]
}

scoring = {
    "F1" : make_scorer(f1_score, average="micro"),
    "Precision" : make_scorer(precision_score, average="micro"), 
    "Recall" : make_scorer(recall_score, average="micro"),
    "Accuracy" : make_scorer(accuracy_score)
}

cv = RepeatedStratifiedKFold(
    n_splits=5, 
    n_repeats=10
)

clf = GridSearchCV(
    estimator=base_tree, 
    param_grid=param_dict, 
    scoring=scoring,
    cv=cv,
    refit="F1",
    error_score="raise",
    verbose=0
)

Conduct training size analyses, with different random seeds for error bars

In [10]:
test_add_samples = False

In [16]:
def train_and_evaluate(
    features, 
    labels, 
    test_features, 
    test_labels, 
    pred_output,
    save=True
):
    clf.fit(features, labels)

    best_metrics = evaluate(clf.best_estimator_, test_features, test_labels, pred_output)

    best_f1 = best_metrics['macro avg']['f1-score']

    if f'{len(features)}' in model_data[f'{imputation_method}_{pred_output}']['training'].keys():
        model_data[f'{imputation_method}_{pred_output}']['training'][f'{len(features)}'].append(best_f1)
    else:
        model_data[f'{imputation_method}_{pred_output}']['training'][f'{len(features)}'] = [best_f1]
    
    if save:
        with open(f'./models/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}/best_{coverage}_{split_seed}_{max_training_samples}_{best_f1:.2f}.pkl', 'wb') as fp:
            pickle.dump(clf.best_estimator_, fp)

        with open(f'./data/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}/best_{coverage}_{split_seed}_{max_training_samples}_{best_f1:.2f}.pkl', 'w') as fp:
            json.dump(best_metrics, fp)
            
    return

In [17]:
test_size = 0.2
for coverage in ['lit']: #, 'suggested']:
    for imputation_method, pred_output in [(key.split('_')[0], key.split('_')[1]) for key in model_data.keys()]:
        
        iter_data = model_data[f'{imputation_method}_{pred_output}']['modeling'][coverage]
        features_list = iter_data['features_list']
        
        if coverage in ['master', 'suggested'] and test_add_samples:
            
            add_list_of_features = []
            add_list_of_labels = []
            
            list_of_features = list(iter_data['features'])
            list_of_labels = list(iter_data['labels'])
                       
            for idx in range(len(iter_data['features']) - 340):
                add_list_of_features.append(list_of_features.pop(-1))
                add_list_of_labels.append(list_of_labels.pop(-1))             

            add_features = np.array(add_list_of_features)
            add_labels = np.array(add_list_of_labels)
        
            features = iter_data['features'][:340]
            labels = iter_data['labels'][:340]
            
        else:
            features = iter_data['features']
            labels = iter_data['labels']
            
        if data_to_hold_out:
            held_out_features = np.empty(features.shape)
            held_out_labels = pd.DataFrame()
            
            indices_to_hold_out = [i-1 for d in data_to_hold_out for i in d['recipe_ids']]

            test_size = (test_size*len(features) - len(indices_to_hold_out)) / (len(features) - len(indices_to_hold_out))

            held_out_features = features[indices_to_hold_out] 

            features = np.delete(features, indices_to_hold_out, 0)

            labels = pd.DataFrame(labels)
            held_out_labels = labels.iloc[indices_to_hold_out]
            labels = labels.drop(index=indices_to_hold_out)
        
        else:
            held_out_features = None
            held_out_labels = None
        
        for i, split_seed in enumerate([2**r for r in range(7, 8)]):

            (
                train_features, 
                test_features, 
                train_labels, 
                test_labels
            ) = train_test_split(
                features, 
                labels, 
                test_size=test_size, 
                stratify=labels, # Include this to ensure that label balance in test set is proportionate to the full dataset 
                random_state=split_seed
            )
            
            print()
            
            if data_to_hold_out:
                if held_out_features.any():
                    test_features = np.vstack([test_features, held_out_features])

                if held_out_labels.any()[0]: # Quite weird that the indexing is needed... for some reason held_out_labels.any() is returned as another series
                    test_labels = np.concatenate([test_labels, held_out_labels])
                
            if not os.path.exists(f'./models/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}'):
                os.makedirs(f'./models/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}')
                print(f'Created directory ./models/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}')

            if not os.path.exists(f'./data/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}/'):
                os.makedirs(f'./data/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}/')
                print(f'Created directory ./data/{run_dir}/{current_date}/{imputation_method}_{pred_output}/{split_seed}/')
                
            if not os.path.exists(f'./data/{run_dir}/dt_learning_curves/{current_date}/'):
                os.makedirs(f'./data/{run_dir}/dt_learning_curves/{current_date}/')
                print(f'Created directory ./data/{run_dir}/dt_learning_curves/{current_date}/')

            for train_size in np.linspace(0.1, 1, 12):

                print(f'Learning Progress... Max Size: {int(np.ceil(train_size*100))}% of training data | Trial {i+1}/{len([2**r for r in range(7, 13)])}', end='\r')

                max_training_samples = int(np.ceil(train_size*len(train_features)))

                trial_train_features = train_features[:max_training_samples]
                trial_train_labels = train_labels[:max_training_samples]
                
                train_and_evaluate(
                    trial_train_features, 
                    trial_train_labels, 
                    test_features, 
                    test_labels,
                    pred_output
                )
                
                # If we are testing additional, suggested samples in training set,
                # then do one more training and evaluation with the additional features
                # and labels added to the training set
                if train_size == 1 and test_add_samples:
                    trial_train_add_features = np.concatenate((trial_train_features, add_features))
                    trial_train_add_labels = np.concatenate((trial_train_labels, add_labels))
                    
                    train_and_evaluate(trial_train_add_features, trial_train_add_labels, test_features, test_labels)

            saved_features = model_data[f'{imputation_method}_{pred_output}']['modeling']
            saved_training_metrics = model_data[f'{imputation_method}_{pred_output}']['training']
            
            with open(f'./data/{run_dir}/dt_learning_curves/{current_date}/{coverage}_thru_{imputation_method}_{pred_output}_{split_seed}.json', 'w') as fp:
                json.dump({
                    k : {
                        'modeling' : {
                            l : list(model_data[k]['modeling'][l]) for 
                            l in model_data[k]['modeling']
                        }, 
                        'training' : model_data[k]['training']
                    } for k in model_data.keys()}, fp)

            print('\n\n')


Learning Progress... Max Size: 10% of training data | Trial 1/6



Learning Progress... Max Size: 100% of training data | Trial 1/6



Learning Progress... Max Size: 68% of training data | Trial 1/6

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Learning Progress... Max Size: 100% of training data | Trial 1/6



Created directory ./models/for_replication_prediction/20230828/mmvi_multilabel/128
Created directory ./data/for_replication_prediction/20230828/mmvi_multilabel/128/
Learning Progress... Max Size: 10% of training data | Trial 1/6

ValueError: Found input variables with inconsistent numbers of samples: [49, 56]

### Run predictions over the replication experiments

If available, run predictions over suggested experiments to replicate published results

In [5]:
# First, grab the best performing models from each 
os.chdir('/Users/kevcruse96/PycharmProjects/bfo-impurityphase-analysis/models/20230526/')

for k in rep_data.keys():
    best_models = []
    for seed in [2**r for r in range(7, 13)]:
        max_f1 = 0
        best_split = None
        for file in list(glob(f'./{k}/{seed}/*.pkl')):
            f1 = int(file.split('.')[2])
            if f1 > max_f1:
                max_f1 = f1
                best_split = file.split('_')[4]
        best_models.append(f'./{k}/{seed}/best_lit_{seed}_{best_split}_0.{max_f1}.pkl')
    
    rep_data[k]['modeling']['best_models'] = best_models

In [7]:
from sklearn import tree
import pydotplus

coverage = 'replication'
data_date = 20230526

os.chdir('/Users/kevcruse96/PycharmProjects/bfo-impurityphase-analysis/')
mmvi_rep_df = pd.read_csv('./data/bfo_test_for_prediction_df_modeling_mmvi_20230526.csv')
knn_rep_df = pd.read_csv('./data/bfo_test_for_prediction_df_modeling_knn_20230526.csv')   


for imputation_method, pred_output in [(key.split('_')[0], key.split('_')[1]) for key in rep_data.keys()]:
    df, modeling_fields = create_df_modeling_fields(coverage, imputation_method, pred_output)
    rep_data[f'{imputation_method}_{pred_output}']['dataframes'][coverage] = df
    rep_data[f'{imputation_method}_{pred_output}']['modeling'][coverage] = modeling_fields

os.chdir('/Users/kevcruse96/PycharmProjects/bfo-impurityphase-analysis/models/20230526/')

importances = []

for k in rep_data.keys():
    print(k)
    for m in rep_data[k]['modeling']['best_models']:
        model = pickle.load(open(m, 'rb'))
        importances.append(model.feature_importances_)
        prediction = model.predict(rep_data[k]['modeling']['replication']['features'])
        print(prediction)
        
        for idx in range(0, prediction.shape[0]):

            dot_data = tree.export_graphviz(model, out_file=None,
                                            feature_names=rep_data[k]['modeling']['replication']['features_list'],
                                            class_names=['phase pure', 'phase impure', 'phase impure', 'phase_impure'],
                                            filled=True, rounded=True,
                                            special_characters=True)
            graph = pydotplus.graph_from_dot_data(dot_data)

            # empty all nodes, i.e.set color to white and number of samples to zero
            for node in graph.get_node_list():
                if node.get_attributes().get('label') is None:
                    continue
                if 'samples = ' in node.get_attributes()['label']:
                    labels = node.get_attributes()['label'].split('<br/>')
                    for i, label in enumerate(labels):
                        if label.startswith('samples = '):
                            labels[i] = 'samples = 0'
                    node.set('label', '<br/>'.join(labels))
                    node.set_fillcolor('white')

            samples = rep_data[k]['modeling']['replication']['features'][idx:idx+1]
            decision_paths = model.decision_path(samples)

            for decision_path in decision_paths:
                for n, node_value in enumerate(decision_path.toarray()[0]):
                    if node_value == 0:
                        continue
                    node = graph.get_node(str(n))[0]            
                    node.set_fillcolor('green')
                    labels = node.get_attributes()['label'].split('<br/>')
                    for i, label in enumerate(labels):
                        if label.startswith('samples = '):
                            labels[i] = 'samples = {}'.format(int(label.split('=')[1]) + 1)

                    node.set('label', '<br/>'.join(labels))

            filename = f'./{k}/exp{idx}_{m.split("_")[4]}_{m.split(".")[2]}_tree.png'
            graph.write_png(filename)

mmvi_binary
[0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 1 1 1 1 1 0 0 0 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
knn_binary
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
mmvi_multilabel
[0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
[0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
[0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[2 2 2 0 2 2 2 2 2 2 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
knn_multilabel
[1 1 1 0 0 0 0 1 1 1 0 0 0 0 1]
[1 1 1 0 0 0 0 1 1 1 0 0 0 0 1]
[0 0 0 0 1 1 1 1 1 1 0 0 0 1 1]
[2 2 2 0 2 2 2 2 2 2 0 0 0 0 0]
[0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]
[0 0 0 0 2 2 2 2 2 2 0 0 0 0 0]


In [8]:
feature_importances_tuples = zip(np.mean(importances, axis=0), rep_data[k]['modeling']['replication']['features_list'])
sorted_feature_importances_tuples = sorted(feature_importances_tuples, key=lambda x:x[0])

pprint(sorted_feature_importances_tuples)

[(0.0, 'low_coating_rpm'),
 (0.0, 'n2_atm'),
 (0.0, 'chem_pca-c3'),
 (0.0, 'chem_pca-c8'),
 (0.0, 'chem_pca-c10'),
 (0.0, 'chem_pca-c13'),
 (0.0, 'chem_pca-c20'),
 (0.0, 'chem_pca-c23'),
 (0.000473388785517213, 'age_temp_degC'),
 (0.0007008829004847653, 'chem_pca-c14'),
 (0.0008375802189505473, 'chem_pca-c9'),
 (0.0010582637453769258, 'separate_hydrolysis'),
 (0.0012918449619900513, 'chem_pca-c7'),
 (0.0014865395775896493, 'chem_pca-c15'),
 (0.0017902703161378254, 'chem_pca-c27'),
 (0.0022825728996719833, 'low_coating_time_sec'),
 (0.003576392613955483, 'chem_pca-c2'),
 (0.0035836270552499273, 'final_prebake_degC'),
 (0.003974878230918387, 'chem_pca-c17'),
 (0.005495200616404229, 'chem_pca-c30'),
 (0.0055680053432665124, 'chem_pca-c4'),
 (0.005568755431486494, 'o2_atm'),
 (0.005814839157354975, 'chem_pca-c26'),
 (0.005989263391977876, 'dry_degC'),
 (0.006287185303527304, 'final_prebake_time_min'),
 (0.007070391235210061, 'layer_prebake_degC'),
 (0.007988523971650462, 'chem_pca-c19'),
 

### Visualize a specific tree

In [None]:
print(best_f1)
# Visualize best decision tree

# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
# Export the image to a dot file
export_graphviz(clf.best_estimator_, out_file = f'{current_date}_best_tree_lit_{seed}_{imputation_method}_{pred_output}.dot', feature_names = lit_feature_list, class_names = ['phase pure', 'fe-rich impurity', 'bi-rich impurity', 'both impurities'], rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file(f'{current_date}_best_tree_lit_{seed}_{imputation_method}_{pred_output}.dot')
# Write graph to a png file
graph.write_png(f'{current_date}_best_tree_lit_{seed}_{imputation_method}_{pred_output}.png')

## Scratch

In [None]:
# Evaluation function



### Define a Function for Training

Function should optionally train only using the literature dataset or both literature and suggested experiments along with the performance differences between the two models

In [None]:
def train(
    X,
    y,
    criterion='entropy', 
    max_depth=7, #12 
    min_samples_split=4, 
    min_samples_leaf=2, 
    seed=512
):
    # Since you're already stratifying, maybe just implement a k-folds CV framework? 
    
    X_train, test_features, y_train, test_labels = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

    if pred_output=='binary' and smote:
        X, y = get_smote_features(X_train, y_train)
        
    base_tree = DecisionTreeClassifier(
        criterion=criterion,
        splitter="best",
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=seed,
        max_features=None,
    )
    
    base_tree.fit(X_train, y_train)
    base_eval = evaluate(base_tree, test_features, test_labels)
    
    pprint(base_eval)
    print()

    return base_tree, base_eval

In [None]:
accuracy_improvements = []
f1_improvements = []

for seed in [2**j for j in range(1,100)]:
    print("Training from purely literature")
    print("===============================")
    lit_acc, lit_f1 = train_diffs(lit_features, lit_labels, seed=seed)
    print()
    print("Training with Added Data")
    print("========================")
    acc, f1 = train_diffs(features, labels, seed=seed)
    
    accuracy_improvements.append(acc - lit_acc)
    f1_improvements.append(f1 - lit_f1)
    print('\\\\\\\\\\\\\\\\\\\\\\\\\\\\')
    print('\\\\\\\\\\\\\\\\\\\\\\\\\\\\')

In [None]:
print("average improvements to accuracy: ", np.mean(accuracy_improvements))
print("average improvements to f1: ", np.mean(f1_improvements))

### Grid Search for Hyperparameters

In [None]:
param_dict = {
    "criterion" : ['gini', 'entropy'],
    "max_depth" : range(2,10),
    "min_samples_split" : range(2,10),
    "min_samples_leaf" : range(2,5)
}

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3, random_state=0)

grid = GridSearchCV(
    grid_tree,
    param_grid=param_dict,
    cv=10,
    verbose=1,
)

grid.fit(train_features, train_labels)
print(grid.best_estimator_.get_params())

## Miscellaneous

In [None]:
with open('./models/20220825/best_100_0.71.pkl', 'rb') as fp:
    saved_model = pickle.load(fp)

In [None]:
for feat_name, imp in zip(feature_list, saved_model.feature_importances_.tolist()):
    print(feat_name, imp)