Notebook used to train RF models from the training data.

Comments throughout explain where changes can be made to change what's being trained!

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import matplotlib.pyplot as plt 
import scipy.stats as stat
import numpy as np 
#import forestsci
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from collections import Counter

import sys
sys.path.append("..")
import Utility.model_utils as model_utils

In [2]:
# Runtime Variables
number_trees = 64
seed = 72
min_leaf = 3
accuracy_array = []

# Data Input
filename = '../Results/training_data.csv'

# If saving model, set save to True and give meaningful name
model_name = 'RF_C-MC_I-P.sav'
save = True


# Control what features are used by the model
label="Labels"
features=['area',
#        'equivalent_diameter', 
#        'orientation', 
    'major_axis_length',
    'minor_axis_length',
    'perimeter',
#        'min_intensity',
#        'mean_intensity',
        'max_intensity',
    'solidity',
    'major_axis_length/minor_axis_length',
    'perimeter/major_axis_length',
    'perimeter/minor_axis_length',
          'feret_diameter_max',
      'moments_hu-0',
      'moments_hu-1',
      'moments_hu-2',
      'moments_hu-3',
     # 'moments_hu-4',
      'moments_hu-5',
      #'moments_hu-6',
#    'eccentricity'
         ]


In [26]:
best_f1 = 0
best_model = None
for seed in np.arange(100):
    # Read in data w/ features
    df = pd.read_csv(filename)

    # Modify Data (Remove illegal data OR set different runtime)

    
    # Change how data is grouped
    df['Labels'].replace('Poorly Segmented', 'Incomplete', inplace=True) # Group Poorly Segmented and Incomplete as a single class
    df['Labels'].replace('Multiple Crystal', 'Crystal', inplace=True) # Group Multiple Crystal and Crystal as a single class
    #df['Labels'].replace('Crystal', 'Incomplete', inplace=True)
    #df = df[df['Labels'] != "Incomplete"] # Remove any data labeled as "Incomplete"
    #df = df[df['Labels'] != "Crystal"]
    
    # Missed Labels: Purge any bad data that the model cannot run on
    df['Labels'].replace('', np.nan, inplace=True) 
    df.dropna(subset=['Labels'], inplace=True)
    # Numerical errors (divide by 0)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for feature in features:
        df.dropna(subset=[feature],inplace=True)

    # Split Data
    X=df[features]
    y=df[label]
    
    print(Counter(y))

    # Encode
    ohe = OneHotEncoder(sparse=False)
    y = ohe.fit_transform(df[[label]])

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    # Create RandomForestRegressor
    rfr=RandomForestRegressor(n_estimators = number_trees,min_samples_leaf=min_leaf, oob_score=False)
    rfr.fit(X_train,y_train)

    # Accuracy Checks
    y_pred=rfr.predict(X_test)
    y_pred = np.round(y_pred)
    
    accuracy = metrics.f1_score(y_test,y_pred,average='macro')
    accuracy_array.append(accuracy)
    #accuracy = metrics.accuracy_score(y_test,y_pred)
    print(f'{seed} Seed Completed')
    model_utils.success_of_guess(y_pred,y_test,ohe)
    print(f'Running Average F1: {np.average(accuracy_array)}, OOB Score: N/A')
    
    if accuracy > best_f1:
        best_model = rfr
        best_f1 = accuracy

Counter({'Incomplete': 210, 'Poorly Segmented': 203})
0 Seed Completed
Labels_Incomplete -> Precision = 0.7692307692307693, Recall = 0.6382978723404256, F1 = 0.6976744186046512
Labels_Poorly Segmented -> Precision = 0.6136363636363636, Recall = 0.75, F1 = 0.6749999999999999
Macro F=0.6863372093023256
Run Accuracy : 0.6867469879518072
Running Average F1: 0.7523644356933465, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
1 Seed Completed
Labels_Incomplete -> Precision = 0.7435897435897436, Recall = 0.5918367346938775, F1 = 0.6590909090909091
Labels_Poorly Segmented -> Precision = 0.5454545454545454, Recall = 0.7058823529411765, F1 = 0.6153846153846153
Macro F=0.6372377622377622
Run Accuracy : 0.6385542168674698
Running Average F1: 0.7522151144566985, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
2 Seed Completed
Labels_Incomplete -> Precision = 0.78125, Recall = 0.5319148936170213, F1 = 0.6329113924050632
Labels_Poorly Segmented -> Precision =

21 Seed Completed
Labels_Incomplete -> Precision = 0.6857142857142857, Recall = 0.6153846153846154, F1 = 0.6486486486486486
Labels_Poorly Segmented -> Precision = 0.6875, Recall = 0.75, F1 = 0.717391304347826
Macro F=0.6830199764982373
Run Accuracy : 0.6867469879518072
Running Average F1: 0.7498565172626906, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
22 Seed Completed
Labels_Incomplete -> Precision = 0.6666666666666666, Recall = 0.6666666666666666, F1 = 0.6666666666666666
Labels_Poorly Segmented -> Precision = 0.7446808510638298, Recall = 0.7446808510638298, F1 = 0.7446808510638298
Macro F=0.7056737588652482
Run Accuracy : 0.7108433734939759
Running Average F1: 0.7498007309515828, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
23 Seed Completed
Labels_Incomplete -> Precision = 0.7058823529411765, Recall = 0.6, F1 = 0.6486486486486486
Labels_Poorly Segmented -> Precision = 0.673469387755102, Recall = 0.7674418604651163, F1 = 0.717391304347

43 Seed Completed
Labels_Incomplete -> Precision = 0.6470588235294118, Recall = 0.5789473684210527, F1 = 0.6111111111111113
Labels_Poorly Segmented -> Precision = 0.673469387755102, Recall = 0.7333333333333333, F1 = 0.702127659574468
Macro F=0.6566193853427896
Run Accuracy : 0.6626506024096386
Running Average F1: 0.7475931515402675, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
44 Seed Completed
Labels_Incomplete -> Precision = 0.5757575757575758, Recall = 0.5277777777777778, F1 = 0.5507246376811594
Labels_Poorly Segmented -> Precision = 0.66, Recall = 0.7021276595744681, F1 = 0.6804123711340208
Macro F=0.61556850440759
Run Accuracy : 0.6265060240963856
Running Average F1: 0.7474309590990725, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
45 Seed Completed
Labels_Incomplete -> Precision = 0.7073170731707317, Recall = 0.6590909090909091, F1 = 0.6823529411764706
Labels_Poorly Segmented -> Precision = 0.6428571428571429, Recall = 0.692307692307

65 Seed Completed
Labels_Incomplete -> Precision = 0.6666666666666666, Recall = 0.7555555555555555, F1 = 0.7083333333333334
Labels_Poorly Segmented -> Precision = 0.65625, Recall = 0.5526315789473685, F1 = 0.6
Macro F=0.6541666666666667
Run Accuracy : 0.6626506024096386
Running Average F1: 0.7447244206074022, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
66 Seed Completed
Labels_Incomplete -> Precision = 0.717948717948718, Recall = 0.6363636363636364, F1 = 0.674698795180723
Labels_Poorly Segmented -> Precision = 0.6363636363636364, Recall = 0.717948717948718, F1 = 0.674698795180723
Macro F=0.674698795180723
Run Accuracy : 0.6746987951807228
Running Average F1: 0.7446406578975617, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
67 Seed Completed
Labels_Incomplete -> Precision = 0.6388888888888888, Recall = 0.6216216216216216, F1 = 0.6301369863013699
Labels_Poorly Segmented -> Precision = 0.7021276595744681, Recall = 0.717391304347826, F1 = 0.7

87 Seed Completed
Labels_Incomplete -> Precision = 0.6756756756756757, Recall = 0.5555555555555556, F1 = 0.6097560975609757
Labels_Poorly Segmented -> Precision = 0.5652173913043478, Recall = 0.6842105263157895, F1 = 0.6190476190476191
Macro F=0.6144018583042974
Run Accuracy : 0.6144578313253012
Running Average F1: 0.741740845847502, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
88 Seed Completed
Labels_Incomplete -> Precision = 0.5641025641025641, Recall = 0.5789473684210527, F1 = 0.5714285714285715
Labels_Poorly Segmented -> Precision = 0.6363636363636364, Recall = 0.6222222222222222, F1 = 0.6292134831460675
Macro F=0.6003210272873195
Run Accuracy : 0.6024096385542169
Running Average F1: 0.741576020884145, OOB Score: N/A
Counter({'Incomplete': 210, 'Poorly Segmented': 203})
89 Seed Completed
Labels_Incomplete -> Precision = 0.7058823529411765, Recall = 0.5454545454545454, F1 = 0.6153846153846153
Labels_Poorly Segmented -> Precision = 0.5918367346938775, Recall 

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'Best Model F1: {best_f1}')
#print(best_model.oob_score_)
y_pred_best = np.round(best_model.predict(X_test))
print(f'R2: {best_model.score(X_test,y_test)}')
print(f'Accuracy: {accuracy_score(y_test,y_pred_best)}')
print(ohe.feature_names_in_)
count_fails = np.abs(y_pred_best-y_test)
ohe.get_feature_names_out(['Labels'])
print(np.sum(count_fails)/2,np.shape(count_fails)[0])

Best Model F1: 0.7709513435003632
R2: 0.6587471625528327
Accuracy: 0.9397590361445783
['Labels']
5.0 83


In [None]:
# Feature Importances

from sklearn.inspection import permutation_importance
import time

start_time = time.time()
result = permutation_importance(best_model, X_test, y_test, n_repeats=20, random_state=seed, n_jobs=4)
elapsed_time = time.time() - start_time

print(f'Time elapsed: {elapsed_time}')
forest_importances = pd.Series(result.importances_mean, index=features)

In [None]:
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
import pickle
if save:
    pickle.dump(best_model, open(model_name, 'wb'))