In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, RandomizedSearchCV,GridSearchCV

import pandas as pd
import numpy as np
np.random.seed(74)
import os
import glob

In [3]:
# Get data
training_folder = os.path.join("..","..","ProcessedData","Training_Data_20231106")

csv_list = glob.glob(os.path.join(training_folder,"*"))
df_list = [pd.read_csv(csv,index_col=0) for csv in csv_list]

# Process data
df_total = pd.concat(df_list)
df_total.reset_index(drop=True)
df_total.replace([np.inf, -np.inf], np.nan, inplace=True)
df_total.dropna(axis=0,inplace=True)
df_total.drop(labels=["label","Filename","Region"],axis=1,inplace=True)

# Split
X = df_total.drop(labels="Labels",axis=1)
y = df_total["Labels"]
print(X.columns)

Index(['area', 'equivalent_diameter', 'orientation', 'major_axis_length',
       'minor_axis_length', 'perimeter', 'min_intensity', 'mean_intensity',
       'max_intensity', 'solidity', 'eccentricity', 'centroid_local-0',
       'centroid_local-1', 'feret_diameter_max', 'moments-0-0', 'moments-0-1',
       'moments-0-2', 'moments-0-3', 'moments-1-0', 'moments-1-1',
       'moments-1-2', 'moments-1-3', 'moments-2-0', 'moments-2-1',
       'moments-2-2', 'moments-2-3', 'moments-3-0', 'moments-3-1',
       'moments-3-2', 'moments-3-3', 'moments_central-0-0',
       'moments_central-0-1', 'moments_central-0-2', 'moments_central-0-3',
       'moments_central-1-0', 'moments_central-1-1', 'moments_central-1-2',
       'moments_central-1-3', 'moments_central-2-0', 'moments_central-2-1',
       'moments_central-2-2', 'moments_central-2-3', 'moments_central-3-0',
       'moments_central-3-1', 'moments_central-3-2', 'moments_central-3-3',
       'moments_hu-0', 'moments_hu-1', 'moments_hu-2', 'mo

In [4]:
# Remove size- and intensity-aware features
features_oi = [
                'solidity',
                'eccentricity',            
                'moments_hu-0',
                'moments_hu-1',
                'moments_hu-2',
                'moments_hu-3',
                'moments_hu-5',
                'major_axis_length/minor_axis_length',
                'perimeter/major_axis_length',
                'perimeter/minor_axis_length',
                'facet_score'
                ]
X = X[features_oi]

In [5]:
# Train-test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=.7,stratify=y)


In [6]:
# Create grid search parameters
params = {"n_estimators": np.arange(16,240,16),
          "min_samples_leaf":np.arange(1,15,2)}

seeds = np.arange(5)

train_f1_arr = []
test_f1_arr = []

best_grid = None
for ii,seed in enumerate(seeds):
    clf = RandomForestClassifier()
    grid_search = RandomizedSearchCV(clf,params,n_iter=10,random_state=seed)
    grid_search = GridSearchCV(clf,param_grid=params,scoring="f1_macro")

    result = grid_search.fit(X_train,y_train)
    best_clf = result.best_estimator_ 

    pred_train = best_clf.predict(X_train)
    pred_test = best_clf.predict(X_test)

    train_f1 = metrics.f1_score(pred_train,y_train,average="macro")
    test_f1 = metrics.f1_score(pred_test,y_test,average="macro")
    
    if best_grid is None or test_f1 > np.max(test_f1_arr):
        best_grid = grid_search

    train_f1_arr.append(train_f1)
    test_f1_arr.append(test_f1)



In [7]:
best_grid.cv_results_

{'mean_fit_time': array([0.01946182, 0.03996944, 0.05661993, 0.07469769, 0.09334903,
        0.11196585, 0.13255067, 0.14839807, 0.16848869, 0.18596044,
        0.20655904, 0.22292485, 0.24376998, 0.26112785, 0.01843743,
        0.03729811, 0.05360827, 0.07008996, 0.08737869, 0.10518398,
        0.12453289, 0.1397336 , 0.15729818, 0.17590985, 0.19180589,
        0.21140203, 0.22761064, 0.2455771 , 0.01781888, 0.03403125,
        0.05045156, 0.06728878, 0.08579116, 0.10119147, 0.11686931,
        0.13472509, 0.14970732, 0.16611462, 0.18391633, 0.20017095,
        0.21999164, 0.23195152, 0.01836562, 0.03353162, 0.04887776,
        0.06491551, 0.08051925, 0.09649925, 0.11428032, 0.12828741,
        0.14489479, 0.16140485, 0.17657747, 0.19401979, 0.20759902,
        0.23373122, 0.01710496, 0.03177791, 0.0482163 , 0.06347289,
        0.07981286, 0.09594502, 0.10994153, 0.12439713, 0.14212561,
        0.15681429, 0.17175927, 0.19527574, 0.20552521, 0.21968489,
        0.01616921, 0.0312212 ,

In [10]:
df_results = pd.DataFrame(best_grid.cv_results_)
df_results


unique_leafs = df_results.param_min_samples_leaf.unique()
import matplotlib.pyplot as plt 
cm = plt.get_cmap("viridis")
color_list = [cm(ii/len(unique_leafs)) for ii in range(len(unique_leafs))]
for ii,leaf_oi in enumerate(unique_leafs):
    c = color_list[ii]
    df_subset = df_results[df_results.param_min_samples_leaf==leaf_oi]
    plt.scatter

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.019462,0.000231,0.001786,0.000040,1,16,"{'min_samples_leaf': 1, 'n_estimators': 16}",0.664384,0.674958,0.692109,0.679134,0.704268,0.682970,0.013867,96
1,0.039969,0.001216,0.002602,0.000168,1,32,"{'min_samples_leaf': 1, 'n_estimators': 32}",0.701237,0.686255,0.659102,0.677782,0.738236,0.692522,0.026605,92
2,0.056620,0.000287,0.003151,0.000113,1,48,"{'min_samples_leaf': 1, 'n_estimators': 48}",0.695892,0.690539,0.675557,0.709275,0.743896,0.703032,0.023112,44
3,0.074698,0.000503,0.003732,0.000046,1,64,"{'min_samples_leaf': 1, 'n_estimators': 64}",0.696811,0.689379,0.624573,0.697900,0.709070,0.683547,0.030151,95
4,0.093349,0.000315,0.004452,0.000082,1,80,"{'min_samples_leaf': 1, 'n_estimators': 80}",0.701529,0.712111,0.667286,0.708173,0.742719,0.706364,0.024118,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0.148255,0.000608,0.006874,0.000047,13,160,"{'min_samples_leaf': 13, 'n_estimators': 160}",0.702103,0.704336,0.663958,0.682819,0.731228,0.696889,0.022558,80
94,0.164789,0.002248,0.007770,0.000326,13,176,"{'min_samples_leaf': 13, 'n_estimators': 176}",0.718643,0.714586,0.679808,0.687685,0.743325,0.708809,0.022842,8
95,0.179711,0.001712,0.008013,0.000042,13,192,"{'min_samples_leaf': 13, 'n_estimators': 192}",0.707439,0.693118,0.679935,0.676390,0.754798,0.702336,0.028419,53
96,0.194317,0.002828,0.008693,0.000153,13,208,"{'min_samples_leaf': 13, 'n_estimators': 208}",0.691824,0.698717,0.651683,0.692421,0.749076,0.696744,0.031035,83
