In [56]:
# imports
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [57]:
### Load data
# Read every sheet of the workbook; sheet_name=None yields {sheet name: DataFrame}.
excel_file = "../Data/Cell_Data.xlsx"
sheets_dict = pd.read_excel(excel_file, sheet_name=None)

# The sheet names double as the class labels, e.g.
# ['L1', 'L2', 'L3', 'Monoblasts', 'Myeloblasts', 'Reactive Lymphs']
cell_types = list(sheets_dict)

# Tag each sheet with its label (in place) and collect the sheets in workbook order.
df_list = []
for cell_type in cell_types:
    sheets_dict[cell_type]["cell_type"] = cell_type
    df_list.append(sheets_dict[cell_type])

# Stack everything into a single frame with a fresh 0..n-1 index, then discard the
# columns that are not used as features: image name, area, and total image area.
combined_df = pd.concat(df_list, ignore_index=True).drop(
    columns=["Image", "Area (microns^2)", "TotalImageArea"]
)

In [58]:
# Sanity check: show the combined frame after the unused columns were dropped
# and the cell_type label column was added.
display(combined_df)

Unnamed: 0,Lacunarity,Total Length (microns),Endpoints,HGU (microns),Branchpoints,Box-Counting Fractal Dimension,Curvature_50.0,% High Density Matrix,Alignment,Branchpoints/Total Length,Endpoints/Total Length,Average Fiber Length,Average Fiber Thickness,cell_type
0,79.211,4005,69,58.043,367,1.175,26.812,0.587,0.11620,0.091635,0.017228,18.371560,14.656679,L1
1,110.888,3485,90,38.722,227,1.112,36.987,0.558,0.06317,0.065136,0.025825,21.987382,16.011478,L1
2,173.399,1880,43,43.721,59,1.024,30.577,0.542,0.09171,0.031383,0.022872,36.862745,28.829787,L1
3,96.193,3203,59,54.288,161,1.115,24.991,0.622,0.03657,0.050265,0.018420,29.118182,19.419294,L1
4,132.483,2414,54,44.704,82,1.088,26.482,0.575,0.05596,0.033969,0.022370,35.500000,23.819387,L1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,86.239,3993,99,40.333,80,1.110,34.130,0.655,0.03727,0.020035,0.024793,44.614525,16.403706,Reactive Lymphs
2596,124.097,2777,61,45.525,88,1.097,39.728,0.552,0.10080,0.031689,0.021966,37.275168,19.877566,Reactive Lymphs
2597,76.126,3913,65,60.200,126,1.148,33.045,0.745,0.07569,0.032200,0.016611,40.973822,19.039100,Reactive Lymphs
2598,75.398,4100,58,70.690,138,1.170,27.441,0.739,0.03438,0.033659,0.014146,41.836735,18.024390,Reactive Lymphs


In [59]:
# Build the modelling pipeline. Stages, in order:
#   1. "scaling"    - standardise the features with StandardScaler
#   2. "reduce_dim" - PCA; the number of components is chosen during model selection
#   3. "classify"   - placeholder; random forest or kNN is plugged in during model selection
# help from: https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py

pipeline_steps = [
    ("scaling", StandardScaler()),
    ("reduce_dim", PCA()),
    # "passthrough" defers the choice of classifier to the parameter grid
    ("classify", "passthrough"),
]
pipe = Pipeline(steps=pipeline_steps)

In [60]:
# define grid search parameters for model tuning and model selection
# generic
N_FEATURES_OPTIONS = [2, 4, 8, None]      # number of components to keep from PCA (None keeps all)
# Random Forest
RF_RANDOM_STATE = 1234                    # fixed seed (same value as the train/test split) so results are reproducible
n_estimators_OPTIONS = [10, 30, 100, 300]
max_depth_OPTIONS = [10, 25, 50, None]
max_features_OPTIONS = ["sqrt", "log2", None]
bootstrap_OPTIONS = [True, False]
min_samples_split_OPTIONS = [2, 3, 4]

# K Neighbors
p_OPTIONS = [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]   # power of the Minkowski distance
k_OPTIONS = [1, 2, 3, 5, 7, 11, 17]              # number of neighbours

# define the parameter grid
# Two sub-grids, one per candidate classifier. The "classify" entry replaces the
# pipeline's placeholder step; "classify__<name>" entries tune that classifier.
param_grid = [
    {
        "reduce_dim__n_components": N_FEATURES_OPTIONS,
        # Seed the forest: without random_state every fit draws different bootstrap
        # samples / feature subsets, so CV scores and the selected model would change
        # from run to run.
        "classify": [RandomForestClassifier(random_state=RF_RANDOM_STATE)],
        "classify__n_estimators": n_estimators_OPTIONS,
        "classify__max_depth": max_depth_OPTIONS,
        "classify__max_features": max_features_OPTIONS,
        "classify__bootstrap": bootstrap_OPTIONS,
        "classify__min_samples_split": min_samples_split_OPTIONS
    },
    {
        "reduce_dim__n_components": N_FEATURES_OPTIONS,
        "classify": [KNeighborsClassifier()],
        "classify__p": p_OPTIONS,
        "classify__n_neighbors": k_OPTIONS,
    },
]


In [None]:
# set up grid search on the pipeline using Stratified 5-fold Cross Validation
# NOTE: GridSearchCV only defaults to *stratified* folds when it recognises the estimator
# as a classifier. The pipeline's final step is the "passthrough" placeholder (the real
# classifier is injected by the parameter grid), so `pipe` is not recognised as one and the
# default would silently be a plain, non-stratified KFold. Request StratifiedKFold explicitly
# so every fold keeps the cell-type proportions.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
    # both metrics are recorded for every candidate; the best candidate by accuracy is refit
    scoring={'accuracy': 'accuracy', 'f1': 'f1_macro'},
    refit='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [62]:
# Hold out 30% of the rows for testing. The split is shuffled with a fixed seed and
# stratified on the label so both partitions keep the same cell-type proportions.
y_data = combined_df["cell_type"]
data_train, data_test = train_test_split(
    combined_df,
    test_size=0.3,
    random_state=1234,
    shuffle=True,
    stratify=y_data,
)

# Separate the label column from the feature columns in each partition.
X_train, y_train = data_train.drop(columns="cell_type"), data_train["cell_type"]
X_test, y_test = data_test.drop(columns="cell_type"), data_test["cell_type"]

display(y_train, X_train, y_test, X_test)

2549    Reactive Lymphs
762                  L2
2174        Myeloblasts
1440         Monoblasts
1810        Myeloblasts
             ...       
2289    Reactive Lymphs
2410    Reactive Lymphs
361                  L1
2107        Myeloblasts
2564    Reactive Lymphs
Name: cell_type, Length: 1820, dtype: object

Unnamed: 0,Lacunarity,Total Length (microns),Endpoints,HGU (microns),Branchpoints,Box-Counting Fractal Dimension,Curvature_50.0,% High Density Matrix,Alignment,Branchpoints/Total Length,Endpoints/Total Length,Average Fiber Length,Average Fiber Thickness
2549,94.993,3617,53,68.245,172,1.148,39.380,0.681,0.01811,0.047553,0.014653,32.151111,18.827758
762,99.346,4536,100,45.360,223,1.156,35.556,0.576,0.10290,0.049162,0.022046,28.086687,12.698413
2174,103.734,3538,67,52.806,131,1.126,42.563,0.752,0.05626,0.037027,0.018937,35.737374,21.254946
1440,91.428,4506,42,107.286,228,1.157,31.236,0.643,0.03481,0.050599,0.009321,33.377778,14.269862
1810,104.736,3200,83,38.554,196,1.142,44.777,0.586,0.11780,0.061250,0.025937,22.939068,18.312500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289,143.248,3796,59,64.339,239,1.152,27.090,0.387,0.18420,0.062961,0.015543,25.476510,10.194942
2410,90.005,3887,69,56.333,251,1.166,30.064,0.528,0.02817,0.064574,0.017751,24.293750,13.583741
361,116.914,2389,35,68.257,119,1.089,38.534,0.727,0.09762,0.049812,0.014650,31.025974,30.431143
2107,101.988,3431,85,40.365,136,1.096,31.066,0.686,0.06817,0.039639,0.024774,31.049774,19.994171


891                  L2
2055        Myeloblasts
1234                 L3
1268                 L3
1210                 L3
             ...       
1421         Monoblasts
292                  L1
2525    Reactive Lymphs
966                  L2
628                  L2
Name: cell_type, Length: 780, dtype: object

Unnamed: 0,Lacunarity,Total Length (microns),Endpoints,HGU (microns),Branchpoints,Box-Counting Fractal Dimension,Curvature_50.0,% High Density Matrix,Alignment,Branchpoints/Total Length,Endpoints/Total Length,Average Fiber Length,Average Fiber Thickness
891,84.008,3952,72,54.889,340,1.189,33.516,0.605,0.10200,0.086032,0.018219,19.184466,15.308704
2055,87.029,2997,49,61.163,130,1.143,43.000,0.814,0.07238,0.043377,0.016350,33.486034,27.160494
1234,89.435,6581,177,37.181,244,1.180,28.778,0.688,0.01024,0.037076,0.026896,31.263658,10.454338
1268,99.671,4680,76,61.579,327,1.159,37.176,0.621,0.07927,0.069872,0.016239,23.225806,13.269231
1210,100.400,5680,122,46.557,384,1.198,34.651,0.639,0.06617,0.067606,0.021479,22.450593,11.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421,83.161,5229,76,68.803,341,1.183,23.420,0.639,0.16670,0.065213,0.014534,25.079137,12.220310
292,77.085,3099,29,106.862,98,1.106,27.472,0.791,0.09664,0.031623,0.009358,48.803150,25.524363
2525,112.364,2522,32,78.812,75,1.095,21.509,0.647,0.11430,0.029738,0.012688,47.140187,25.654243
966,99.888,2950,50,59.000,125,1.115,46.868,0.658,0.06313,0.042373,0.016949,33.714286,22.305085


In [None]:
# Run the grid search. Only the training split is passed in, so the held-out
# test split is not used for model selection.
# Consider caching the fitted search if it is too expensive to re-run.

# Fit the GridSearchCV object
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1348 candidates, totalling 6740 fits
[CV] END classify=RandomForestClassifier(), classify__bootstrap=True, classify__max_depth=10, classify__max_features=sqrt, classify__min_samples_split=2, classify__n_estimators=10, reduce_dim__n_components=2; total time=   0.1s
[CV] END classify=RandomForestClassifier(), classify__bootstrap=True, classify__max_depth=10, classify__max_features=sqrt, classify__min_samples_split=2, classify__n_estimators=10, reduce_dim__n_components=2; total time=   0.0s
[CV] END classify=RandomForestClassifier(), classify__bootstrap=True, classify__max_depth=10, classify__max_features=sqrt, classify__min_samples_split=2, classify__n_estimators=10, reduce_dim__n_components=2; total time=   0.0s
[CV] END classify=RandomForestClassifier(), classify__bootstrap=True, classify__max_depth=10, classify__max_features=sqrt, classify__min_samples_split=2, classify__n_estimators=10, reduce_dim__n_components=2; total time=   0.0s
[CV] END classify=Rando