In [1]:
import json
from pprint import pprint
import random
from collections import Counter
import pandas as pd
from material_parser.core.material_parser import MaterialParserBuilder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load data

df = pd.read_csv('./data/bfo_df_codified_20211123.csv')
# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# reproduces the same row order, and therefore the same train/test splits
# and metrics further down. 512 is the seed used by the other cells.
df = df.sample(frac=1, random_state=512).reset_index(drop=True)

# WARNING: EDIT THIS (among other things)
# Every missing value is replaced with 0, which makes "not reported"
# indistinguishable from a genuine zero in the features.
df = df.fillna(0)

# 'impurity_code' is the prediction target; 'recipe_id' is excluded from the
# predictors as well.
features_df = df.drop(['impurity_code', 'recipe_id'], axis=1)
labels = df['impurity_code']

# Keep the column names so tree plots / importances can be labelled later.
feature_list = list(features_df.columns)
print(feature_list)
features = features_df.to_numpy()
labels = labels.to_numpy()

['nitrate_precs', 'bi_fe_ratio', 'separate_hydrolysis', '2_methoxyethanol', 'ethylene_glycol', 'acetic_acid', 'citric_acid', 'acetic_anhydride', 'acetylacetone', 'precursor_concentration', 'pH', 'stirring_time_hr', 'stirring_temp_degC', 'age_days', 'age_temp_degC', 'low_coating_time_sec', 'low_coating_rpm', 'high_coating_time_sec', 'high_coating_rpm', 'dry_time_min', 'dry_degC', 'layer_prebake_time_min', 'layer_prebake_degC', 'layer_annealing_time_min', 'layer_annealing_degC', 'final_prebake_time_min', 'final_prebake_degC', 'final_annealing_time_hr', 'final_annealing_degC', 'air_atm', 'o2_atm', 'n2_atm', 'thin_film_thickness_nm']


In [3]:
# Implement SMOTE (Synthetic Minority Oversampling Technique)
from imblearn.over_sampling import SMOTE

# Use the same predictor columns as the model feature matrix: 'recipe_id' is
# dropped together with the label so SMOTE does not interpolate it as if it
# were a synthesis parameter.
# NOTE(review): assumes 'recipe_id' is a row identifier -- confirm.
X = df.drop(['impurity_code', 'recipe_id'], axis=1)
# Boolean column mask keeps y as a one-column DataFrame (not a Series); the
# prints below index it by column name.
y = df.loc[:, df.columns == 'impurity_code']

over_sample = SMOTE(random_state=512)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=512)
columns = X_train.columns

# Only the training split is resampled; X_test / y_test are left untouched.
os_data_X, os_data_y = over_sample.fit_resample(X_train, y_train)

os_data_X = pd.DataFrame(data=os_data_X, columns=columns)

print('length of oversampled data: ', len(os_data_X))
print('number of pure syntheses in oversampled data: ', len(os_data_y[os_data_y['impurity_code']==0]))
print('number of impure syntheses in oversampled data: ', len(os_data_y[os_data_y['impurity_code']==1]))

length of oversampled data:  424
number of pure syntheses in oversampled data:  212
number of impure syntheses in oversampled data:  212


In [4]:
# Evaluation function

def evaluate(model, test_features, test_labels):
    """Print binary-classification metrics for a fitted model and return its accuracy.

    Labels are treated as binary with 1 as the positive class and 0 as the
    negative class.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Feature matrix passed straight to ``model.predict``.
    test_labels : array-like
        True labels. A 1-D array, a column vector or a single-column
        DataFrame are all accepted; the values are flattened before use.

    Returns
    -------
    float
        Accuracy in percent, i.e. ``100 * (correct predictions / samples)``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    # Flatten both sides so an (n, 1) label column cannot broadcast against
    # the (n,) prediction vector into an (n, n) matrix.
    actual = np.asarray(test_labels).ravel()
    predicted = np.asarray(model.predict(test_features)).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            'evaluate(): got {} predictions for {} labels'.format(predicted.size, actual.size)
        )

    # Confusion-matrix cells, with 1 = positive and 0 = negative.
    tp = np.count_nonzero((predicted == 1) & (actual == 1))
    tn = np.count_nonzero((predicted == 0) & (actual == 0))
    fp = np.count_nonzero((predicted == 1) & (actual == 0))
    fn = np.count_nonzero((predicted == 0) & (actual == 1))

    # Each ratio falls back to 0 when its denominator would be zero.
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    if precision or recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0

    # Fraction of misclassified samples; accuracy is its complement. The
    # previous "100 - MAPE" formula divided the error by a shifted label and
    # over-reported accuracy.
    error_rate = float(np.mean(predicted != actual))
    accuracy = 100 * (1 - error_rate)

    print('Model Performance')
    print('Average error: {:0.4f}.'.format(error_rate))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('\n')
    # Scale the 0-1 ratios to match the '%' suffix in the messages.
    print("Precision: {:0.2f}%".format(100 * precision))
    print("Recall: {:0.2f}%".format(100 * recall))
    print("F1 score: {:0.2f}%".format(100 * f1))

    return accuracy

In [5]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=512,
)

# Baseline model: a single depth-limited decision tree with hand-picked settings.
tree = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features=None,
    random_state=512,
)
tree.fit(train_features, train_labels)

# --- Visualize the fitted tree -------------------------------------------
from sklearn.tree import export_graphviz
import pydot

# Dump the tree to Graphviz .dot format (class 0 -> 'pure', class 1 -> 'impure') ...
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    class_names=['pure', 'impure'],
    rounded=True,
    precision=1,
)
# ... then load the .dot file back and render it as a PNG.
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

# --- Feature importances -------------------------------------------------
importances = list(tree.feature_importances_)
# (feature name, importance rounded to 2 decimals), most important first.
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list, importances)),
    key=lambda entry: entry[1],
    reverse=True,
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# --- Score the baseline on the held-out split ----------------------------
print("Base model evaluation:")
print("####")
base_accuracy = evaluate(tree, test_features, test_labels)

Variable: precursor_concentration Importance: 0.16
Variable: bi_fe_ratio          Importance: 0.12
Variable: layer_annealing_degC Importance: 0.12
Variable: final_annealing_degC Importance: 0.11
Variable: stirring_temp_degC   Importance: 0.09
Variable: pH                   Importance: 0.08
Variable: thin_film_thickness_nm Importance: 0.06
Variable: n2_atm               Importance: 0.05
Variable: acetic_acid          Importance: 0.03
Variable: acetic_anhydride     Importance: 0.03
Variable: stirring_time_hr     Importance: 0.03
Variable: low_coating_rpm      Importance: 0.03
Variable: age_days             Importance: 0.02
Variable: layer_prebake_degC   Importance: 0.02
Variable: citric_acid          Importance: 0.01
Variable: dry_time_min         Importance: 0.01
Variable: dry_degC             Importance: 0.01
Variable: nitrate_precs        Importance: 0.0
Variable: separate_hydrolysis  Importance: 0.0
Variable: 2_methoxyethanol     Importance: 0.0
Variable: ethylene_glycol      Importa