Notebook used to train RF models from the training data.

Comments throughout explain where changes can be made to change what's being trained!

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import pandas as pd 
import matplotlib.pyplot as plt 
import scipy.stats as stat
import numpy as np 
#import forestsci
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from collections import Counter

import sys
sys.path.append("..")
import Utility.model_utils as model_utils

In [2]:
# Runtime Variables
number_trees = 64
seed = 72
min_leaf = 3
accuracy_array = []

# Data Input
filename = '../Results/training_data.csv'

# If saving model, set save to True and give meaningful name
model_name = 'RF_I_P.sav'
save = False


# Control what features are used by the model
label="Labels"
features=['area',
#        'equivalent_diameter', 
#        'orientation', 
    'major_axis_length',
    'minor_axis_length',
    'perimeter',
#        'min_intensity',
#        'mean_intensity',
        'max_intensity',
    'solidity',
    'major_axis_length/minor_axis_length',
    'perimeter/major_axis_length',
    'perimeter/minor_axis_length',
          'feret_diameter_max',
      'moments_hu-0',
      'moments_hu-1',
      'moments_hu-2',
      'moments_hu-3',
     # 'moments_hu-4',
      'moments_hu-5',
      #'moments_hu-6',
#    'eccentricity'
         ]


In [9]:
best_f1 = 0
best_model = None
for seed in np.arange(100):
    # Read in data w/ features
    df = pd.read_csv(filename)

    # Modify Data (Remove illegal data OR set different runtime)

    
    # Change how data is grouped
    df['Labels'].replace('Poorly Segmented', 'Incomplete', inplace=True) # Group Poorly Segmented and Incomplete as a single class
    #df['Labels'].replace('Multiple Crystal', 'Crystal', inplace=True) # Group Multiple Crystal and Crystal as a single class
    df['Labels'].replace('Crystal', 'Incomplete', inplace=True)
    #df = df[df['Labels'] != "Incomplete"] # Remove any data labeled as "Incomplete"
    #df = df[df['Labels'] != "Crystal"]
    
    # Missed Labels: Purge any bad data that the model cannot run on
    df['Labels'].replace('', np.nan, inplace=True) 
    df.dropna(subset=['Labels'], inplace=True)
    # Numerical errors (divide by 0)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for feature in features:
        df.dropna(subset=[feature],inplace=True)

    # Split Data
    X=df[features]
    y=df[label]
    
    print(Counter(y))

    # Encode
    ohe = OneHotEncoder(sparse=False)
    y = ohe.fit_transform(df[[label]])

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    # Create RandomForestRegressor
    rfr=RandomForestRegressor(n_estimators = number_trees,min_samples_leaf=min_leaf, oob_score=False)
    rfr.fit(X_train,y_train)

    # Accuracy Checks
    y_pred=rfr.predict(X_test)
    y_pred = np.round(y_pred)
    
    accuracy = metrics.f1_score(y_test,y_pred,average='macro')
    accuracy_array.append(accuracy)
    #accuracy = metrics.accuracy_score(y_test,y_pred)
    print(f'{seed} Seed Completed')
    model_utils.success_of_guess(y_pred,y_test,ohe)
    print(f'Running Average F1: {np.average(accuracy_array)}, OOB Score: N/A')
    
    if accuracy > best_f1:
        best_model = rfr
        best_f1 = accuracy

Counter({'Crystal': 754, 'Incomplete': 413})
0 Seed Completed
Labels_Crystal -> Precision = 0.8289473684210527, Recall = 0.8571428571428571, F1 = 0.842809364548495
Labels_Incomplete -> Precision = 0.7439024390243902, Recall = 0.7011494252873564, F1 = 0.7218934911242605
Macro F=0.7823514278363777
Run Accuracy : 0.7991452991452992
Running Average F1: 0.721634184609701, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
1 Seed Completed
Labels_Crystal -> Precision = 0.8471337579617835, Recall = 0.8417721518987342, F1 = 0.8444444444444446
Labels_Incomplete -> Precision = 0.6753246753246753, Recall = 0.6842105263157895, F1 = 0.6797385620915033
Macro F=0.7620915032679739
Run Accuracy : 0.7905982905982906
Running Average F1: 0.72178347360844, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
2 Seed Completed
Labels_Crystal -> Precision = 0.8131868131868132, Recall = 0.9135802469135802, F1 = 0.8604651162790699
Labels_Incomplete -> Precision = 0.7307692307692307, Recall = 0.5

22 Seed Completed
Labels_Crystal -> Precision = 0.8238993710691824, Recall = 0.8562091503267973, F1 = 0.8397435897435896
Labels_Incomplete -> Precision = 0.7066666666666667, Recall = 0.654320987654321, F1 = 0.6794871794871794
Macro F=0.7596153846153846
Run Accuracy : 0.7863247863247863
Running Average F1: 0.7234716251261237, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
23 Seed Completed
Labels_Crystal -> Precision = 0.7514450867052023, Recall = 0.8904109589041096, F1 = 0.8150470219435736
Labels_Incomplete -> Precision = 0.7377049180327869, Recall = 0.5113636363636364, F1 = 0.6040268456375839
Macro F=0.7095369337905788
Run Accuracy : 0.7478632478632479
Running Average F1: 0.7234240664526236, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
24 Seed Completed
Labels_Crystal -> Precision = 0.7564102564102564, Recall = 0.8251748251748252, F1 = 0.7892976588628763
Labels_Incomplete -> Precision = 0.6794871794871795, Recall = 0.5824175824175825, F1 = 0.627218934911242

44 Seed Completed
Labels_Crystal -> Precision = 0.8095238095238095, Recall = 0.8831168831168831, F1 = 0.84472049689441
Labels_Incomplete -> Precision = 0.7272727272727273, Recall = 0.6, F1 = 0.6575342465753425
Macro F=0.7511273717348763
Run Accuracy : 0.7863247863247863
Running Average F1: 0.7251198451849786, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
45 Seed Completed
Labels_Crystal -> Precision = 0.8387096774193549, Recall = 0.8280254777070064, F1 = 0.8333333333333334
Labels_Incomplete -> Precision = 0.6582278481012658, Recall = 0.6753246753246753, F1 = 0.6666666666666666
Macro F=0.75
Run Accuracy : 0.7777777777777778
Running Average F1: 0.725198829803439, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
46 Seed Completed
Labels_Crystal -> Precision = 0.8068181818181818, Recall = 0.8765432098765432, F1 = 0.8402366863905325
Labels_Incomplete -> Precision = 0.6551724137931034, Recall = 0.5277777777777778, F1 = 0.5846153846153845
Macro F=0.7124260355029586
Ru

66 Seed Completed
Labels_Crystal -> Precision = 0.8258064516129032, Recall = 0.8205128205128205, F1 = 0.8231511254019293
Labels_Incomplete -> Precision = 0.6455696202531646, Recall = 0.6538461538461539, F1 = 0.6496815286624203
Macro F=0.7364163270321749
Run Accuracy : 0.7649572649572649
Running Average F1: 0.7264308677642146, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
67 Seed Completed
Labels_Crystal -> Precision = 0.8092105263157895, Recall = 0.825503355704698, F1 = 0.8172757475083058
Labels_Incomplete -> Precision = 0.6829268292682927, Recall = 0.6588235294117647, F1 = 0.6706586826347305
Macro F=0.7439672150715182
Run Accuracy : 0.7649572649572649
Running Average F1: 0.7264829044031087, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
68 Seed Completed
Labels_Crystal -> Precision = 0.821656050955414, Recall = 0.8716216216216216, F1 = 0.8459016393442623
Labels_Incomplete -> Precision = 0.7532467532467533, Recall = 0.6744186046511628, F1 = 0.7116564417177915

88 Seed Completed
Labels_Crystal -> Precision = 0.8484848484848485, Recall = 0.8588957055214724, F1 = 0.853658536585366
Labels_Incomplete -> Precision = 0.6666666666666666, Recall = 0.647887323943662, F1 = 0.6571428571428573
Macro F=0.7554006968641116
Run Accuracy : 0.7948717948717948
Running Average F1: 0.7278378371201103, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
89 Seed Completed
Labels_Crystal -> Precision = 0.7777777777777778, Recall = 0.84, F1 = 0.8076923076923077
Labels_Incomplete -> Precision = 0.6666666666666666, Recall = 0.5714285714285714, F1 = 0.6153846153846153
Macro F=0.7115384615384615
Run Accuracy : 0.7435897435897436
Running Average F1: 0.7277924349597157, OOB Score: N/A
Counter({'Crystal': 754, 'Incomplete': 413})
90 Seed Completed
Labels_Crystal -> Precision = 0.7911392405063291, Recall = 0.9057971014492754, F1 = 0.8445945945945946
Labels_Incomplete -> Precision = 0.8289473684210527, Recall = 0.65625, F1 = 0.7325581395348838
Macro F=0.78857636706473

In [8]:
print(f'Best Model F1: {best_f1}')
#print(best_model.oob_score_)
print(best_model.score(X_test,y_test))
print(ohe)
print(ohe.feature_names_in_)
ohe.get_feature_names_out(['Labels'])

Best Model F1: 0.7384650262641252
0.7298596621468477
OneHotEncoder(sparse=False)
['Labels']


array(['Labels_Crystal', 'Labels_Incomplete', 'Labels_Multiple Crystal'],
      dtype=object)

In [None]:
# Feature Importances

from sklearn.inspection import permutation_importance
import time

start_time = time.time()
result = permutation_importance(best_model, X_test, y_test, n_repeats=20, random_state=seed, n_jobs=4)
elapsed_time = time.time() - start_time

print(f'Time elapsed: {elapsed_time}')
forest_importances = pd.Series(result.importances_mean, index=features)

In [None]:
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
import pickle
if save:
    pickle.dump(best_model, open(model_name, 'wb'))