In [1]:
### imports
import numpy as np
import pandas as pd
from warnings import filterwarnings
from tqdm import tqdm
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, make_scorer
from sklearn.inspection import permutation_importance

In [2]:
### filter warnings from numpy that show up occasionally during cv, should not affect performance
filterwarnings("ignore", message="invalid value encountered in cast", category=RuntimeWarning)

In [3]:
### Load data
# Read every sheet of the workbook; sheet_name=None returns a dict of {sheet label: DataFrame}
excel_file = "../Data/Cell_Data.xlsx"
sheets_dict = pd.read_excel(excel_file, sheet_name=None)

# The sheet labels double as the cell type labels, e.g.
# ['L1', 'L2', 'L3', 'Monoblasts', 'Myeloblasts', 'Reactive Lymphs']
cell_types = list(sheets_dict.keys())

# Tag each sheet's rows with the sheet's own label so the class survives concatenation
for sheet_label, sheet_df in sheets_dict.items():
    sheet_df["cell_type"] = sheet_label

# Stack all sheets (in sheet order) into a single frame with a fresh running index
df_list = list(sheets_dict.values())
combined_df = pd.concat(df_list, ignore_index=True)

# Discard the image name, the area, and the total image area columns
combined_df = combined_df.drop(columns=["Image", "Area (microns^2)", "TotalImageArea"])

In [4]:
### summarize data
# Rich-render the combined table (pandas truncates long frames in the notebook output)
display(combined_df)
# List the distinct class labels present in the combined table
print("Cell types:", set(combined_df["cell_type"].unique()))

Unnamed: 0,Lacunarity,Total Length (microns),Endpoints,HGU (microns),Branchpoints,Box-Counting Fractal Dimension,Curvature_50.0,% High Density Matrix,Alignment,Branchpoints/Total Length,Endpoints/Total Length,Average Fiber Length,Average Fiber Thickness,cell_type
0,79.211,4005,69,58.043,367,1.175,26.812,0.587,0.11620,0.091635,0.017228,18.371560,14.656679,L1
1,110.888,3485,90,38.722,227,1.112,36.987,0.558,0.06317,0.065136,0.025825,21.987382,16.011478,L1
2,173.399,1880,43,43.721,59,1.024,30.577,0.542,0.09171,0.031383,0.022872,36.862745,28.829787,L1
3,96.193,3203,59,54.288,161,1.115,24.991,0.622,0.03657,0.050265,0.018420,29.118182,19.419294,L1
4,132.483,2414,54,44.704,82,1.088,26.482,0.575,0.05596,0.033969,0.022370,35.500000,23.819387,L1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,86.239,3993,99,40.333,80,1.110,34.130,0.655,0.03727,0.020035,0.024793,44.614525,16.403706,Reactive Lymphs
2596,124.097,2777,61,45.525,88,1.097,39.728,0.552,0.10080,0.031689,0.021966,37.275168,19.877566,Reactive Lymphs
2597,76.126,3913,65,60.200,126,1.148,33.045,0.745,0.07569,0.032200,0.016611,40.973822,19.039100,Reactive Lymphs
2598,75.398,4100,58,70.690,138,1.170,27.441,0.739,0.03438,0.033659,0.014146,41.836735,18.024390,Reactive Lymphs


Cell types: {'Reactive Lymphs', 'L2', 'L3', 'Monoblasts', 'L1', 'Myeloblasts'}


In [15]:
### Construct the data permutations
# Each classification job compares one or more blast classes against Reactive Lymphs (RL).
def _rows_with_rl(frame, *blast_labels):
    """Return the rows of `frame` whose cell_type is one of `blast_labels` or "Reactive Lymphs"."""
    wanted_labels = [*blast_labels, "Reactive Lymphs"]
    return frame[frame["cell_type"].isin(wanted_labels)]

# single blast class vs RL
l1_and_rl_df = _rows_with_rl(combined_df, "L1")
l2_and_rl_df = _rows_with_rl(combined_df, "L2")
l3_and_rl_df = _rows_with_rl(combined_df, "L3")
# L1 and L2 (kept as separate classes) vs RL
l1_and_l2_and_rl_df = _rows_with_rl(combined_df, "L1", "L2")
# Monoblasts vs RL, Myeloblasts vs RL
monoblast_and_rl_df = _rows_with_rl(combined_df, "Monoblasts")
myeloblast_and_rl_df = _rows_with_rl(combined_df, "Myeloblasts")

# every non-RL class collapsed into a single "Blasts" class (on a copy, combined_df is untouched)
combined_blasts_and_rl_df = combined_df.copy()
is_blast = combined_df["cell_type"] != "Reactive Lymphs"
combined_blasts_and_rl_df.loc[is_blast, "cell_type"] = "Blasts"

# map each job's human-readable label to its data frame (insertion order = processing order)
dict_of_df = {
    "All Cells": combined_df,
    "L1 and RL": l1_and_rl_df,
    "L2 and RL": l2_and_rl_df,
    "L3 and RL": l3_and_rl_df,
    "L1 and L2 and RL": l1_and_l2_and_rl_df,
    "Monoblast and RL": monoblast_and_rl_df,
    "Myeloblast and RL": myeloblast_and_rl_df,
    "All blasts and RL": combined_blasts_and_rl_df,
}
list_of_df_names = list(dict_of_df.keys())
list_of_df = list(dict_of_df.values())

In [16]:
### Define pipeline
# Stage 1: standardize the features with a StandardScaler
# Stage 2: reduce dimensionality with PCA (number of components is chosen during model selection)
# Stage 3: classify with either random forest or knn (also chosen during model selection)
# help from: https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py

# "passthrough" is a placeholder: the actual classifier is injected by the search's parameter space
pipeline_steps = [
    ("scaling", StandardScaler()),
    ("reduce_dim", PCA()),
    ("classify", "passthrough"),
]
pipe = Pipeline(steps=pipeline_steps)

In [17]:
### Define the parameter space to use during validation (model selection and hyperparameter optimization)
# Two sub-spaces, one per candidate classifier. Keys follow the pipeline's "<step>__<param>" convention.
# The random forest is seeded so that a given hyperparameter combination always yields the same
# fitted model, in line with the fixed seeds used for the train-test split and the cv folds.
param_dist = [
    {   # Random Forest                                                             # options for...
        "classify":                         [RandomForestClassifier(random_state=42)],  # model being trained (Random Forest)
        "reduce_dim__n_components":         [2, 4, 8, None],                        # number of components to keep from PCA
        "classify__n_estimators":           [30, 100, 300],                         # number of decision trees in the random forest
        "classify__max_depth":              [10, 25, 50, None],                     # maximum allowed depth of the decision trees
        "classify__max_features":           ["sqrt", "log2", None],                 # restriction on number of features considered per split
        "classify__bootstrap":              [True, False],                          # whether to bootstrap
        "classify__min_samples_split":      [2, 3, 4]                               # minimum number of samples required to split a node
    },
    {   # K nearest neighbors                                                       # options for...
        "classify":                         [KNeighborsClassifier()],               # model being trained (K nearest neighbors)
        "reduce_dim__n_components":         [2, 4, 8, None],                        # number of components to keep from PCA
        "classify__weights":                ["uniform", "distance"],                # vote weighting method
        "classify__p":                      [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5],     # exponent parameter for minkowski distance
        "classify__n_neighbors":            [3, 5, 7, 11, 15],                      # number of neighbors considered
    }
]

In [5]:
### Define custom model evaluation metrics
def custom_success_metrics(y_test, y_pred, type):
    """Return one balanced (macro-averaged) metric computed from the confusion matrix.

    Per-class true/false positives/negatives are derived one-vs-rest from the
    confusion matrix, the requested metric is evaluated for every class, and the
    per-class values are averaged while ignoring NaN entries (most likely caused
    by empty classes).

    Parameters:
        y_test: true labels.
        y_pred: predicted labels.
        type: one of "accuracy", "precision", "sensitivity", "specificity".
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        The macro-averaged metric, or 0.0 if the average itself is NaN.

    Raises:
        ValueError: if `type` is not one of the supported metric names.
    """
    matrix = confusion_matrix(y_test, y_pred)
    # one entry per class, index = class position in the confusion matrix
    true_pos = np.diag(matrix)
    false_pos = matrix.sum(axis=0) - true_pos    # predicted as the class, but is another
    false_neg = matrix.sum(axis=1) - true_pos    # is the class, but predicted as another
    true_neg = matrix.sum() - (false_pos + false_neg + true_pos)

    # 0/0 and x/0 are expected for empty classes; silence numpy's floating point warnings for them
    with np.errstate(divide='ignore', invalid='ignore'):
        if type == "accuracy":
            score = np.nanmean((true_pos + true_neg) / (true_pos + true_neg + false_neg + false_pos))
        elif type == "precision":
            score = np.nanmean(true_pos / (true_pos + false_pos))
        elif type == "sensitivity":
            score = np.nanmean(true_pos / (true_pos + false_neg))
        elif type == "specificity":
            score = np.nanmean(true_neg / (true_neg + false_pos))
        else:
            raise ValueError(f"Type \"{type}\" is not supported")

    # a NaN average is reported as the worst possible score
    if np.isnan(score):
        return 0.0
    return score

# Scorers evaluated on every proposed model during cross validation
# (used for model selection and hyperparameter optimization).
# The four confusion-matrix based metrics share one scoring function; make_scorer forwards
# the extra keyword (`type`) to custom_success_metrics on every call.
metrics_dict = {
    metric_name: make_scorer(custom_success_metrics, type=metric_name)
    for metric_name in ("accuracy", "precision", "sensitivity", "specificity")
}
metrics_dict.update({
    # use ovo and ovr for comparison
    "auc_roc_ovo": make_scorer(roc_auc_score, multi_class="ovo", response_method=["decision_function", "predict_proba"]),
    "auc_roc_ovr": make_scorer(roc_auc_score, multi_class="ovr", response_method=["decision_function", "predict_proba"]),
    "f1": "f1_macro",
})

# Also define by class for use in final testing
def custom_success_metrics_by_class(y_test, y_pred, type):
    """Return one metric evaluated separately for every class (no averaging).

    Per-class true/false positives/negatives are derived one-vs-rest from the
    confusion matrix; the result is a vector whose index corresponds to the
    class position in that matrix.

    Parameters:
        y_test: true labels.
        y_pred: predicted labels.
        type: one of "accuracy", "precision", "sensitivity", "specificity".
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        A numpy array with one value per class. Divisions by zero are not
        sanitized here, so entries may be NaN.

    Raises:
        ValueError: if `type` is not one of the supported metric names.
    """
    matrix = confusion_matrix(y_test, y_pred)
    # one entry per class, index = class position in the confusion matrix
    hits = np.diag(matrix)
    false_alarms = matrix.sum(axis=0) - hits    # predicted as the class, but is another
    misses = matrix.sum(axis=1) - hits          # is the class, but predicted as another
    correct_rejections = matrix.sum() - (false_alarms + misses + hits)

    if type == "accuracy":
        return (hits + correct_rejections) / (hits + correct_rejections + misses + false_alarms)
    if type == "precision":
        return hits / (hits + false_alarms)
    if type == "sensitivity":
        return hits / (hits + misses)
    if type == "specificity":
        return correct_rejections / (correct_rejections + false_alarms)
    raise ValueError(f"Type \"{type}\" is not supported")

In [6]:
### Define replicable train-test split
def ttsplit(df):
    """Split a data frame 70-30 into training and testing sets, stratified by cell type.

    The response is the "cell_type" column; every other column is a predictor.
    A fixed random state is used so that the exact same split can be recreated
    later from the same data frame. `df` itself is not modified.

    Parameters:
        df: data frame containing a "cell_type" column.

    Returns:
        The result of train_test_split: X_train, X_test, y_train, y_test.
    """
    labels = df["cell_type"]
    predictors = df.drop(columns="cell_type")
    return train_test_split(
        predictors,
        labels,
        test_size=0.3,
        random_state=1234,
        shuffle=True,
        stratify=labels,
    )

In [None]:
### How to unpickle models
# # (do this if you don't want to train the models yourself)
# with open("models_original.pickle", "rb") as file:
#     result_dicts = pickle.load(file)
# # Make sure the filename above is the pickle you want to load
# # (note that the training cell below writes "models.pickle"), and only unpickle files you trust.
# # To use this: uncomment this cell, comment out the training cell below, then run all.

In [20]:
### Model validation and training
# Perform random search for model selection and hyperparameter optimization.
# The search is run on the pipeline once for each permutation of the dataset stored in dict_of_df,
# and everything needed for later testing is collected in result_dicts[<permutation name>]:
#   'results'        cv results table, sorted by rank of mean cv accuracy
#   'best_params'    hyperparameters of the winning candidate
#   'best_score'     mean cv accuracy of the winning candidate
#   'best_estimator' winning pipeline, refit on the full training set
#   'train_set_X/y', 'test_set_X/y'  the exact split used
result_dicts = dict()
pbar = tqdm(dict_of_df.items())
for name, df in pbar:
    # Identify which permutation is being run
    pbar.set_description(f"Randomized Search CV on {name} data")
    # split the data (split can be replicated later, as a fixed random seed is used in ttsplit)
    X_train, X_test, y_train, y_test = ttsplit(df)
    # stratified 5-fold cross validation
    # (default for training classifiers, but specifying here to be explicit and provide replicable random state)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # random_state fixes WHICH 400 candidates are drawn from the parameter space; without it
    # every run evaluates a different set of candidates and the search cannot be replicated
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=400,
        scoring=metrics_dict,
        refit='accuracy',       # the best estimator is chosen by, and refit for, mean cv accuracy
        cv=cv,
        n_jobs=-2,              # all cores but one
        verbose=0,
        error_score='raise',    # fail loudly instead of silently scoring a broken candidate
        random_state=42,
    )
    # cross validate and then fit the data on the best estimator in terms of average cv-determined accuracy
    search.fit(X_train, y_train)
    # save the results, including the best estimator
    results = pd.DataFrame(search.cv_results_).sort_values(by="rank_test_accuracy")
    result_dicts[name] = {
        'results': results,
        'best_params': search.best_params_,
        'best_score': search.best_score_,
        'best_estimator': search.best_estimator_,
        'train_set_X': X_train,
        'train_set_y': y_train,
        'test_set_X': X_test,
        'test_set_y': y_test,
    }
# save results of the random search and cv in a spreadsheet, one sheet per data permutation
with pd.ExcelWriter("validation.xlsx", engine='openpyxl') as writer:
    for name, results_dict in result_dicts.items():
        results_dict['results'].to_excel(writer, sheet_name=name, index=False)
# also pickle the result_dicts to get them later if needed
with open("models.pickle", "wb") as file:
    pickle.dump(result_dicts, file)

  0%|          | 0/8 [00:00<?, ?it/s]

Randomized Search CV on All blasts and RL data: 100%|██████████| 8/8 [43:57<00:00, 329.66s/it]


In [None]:
### Model testing
# Test best performing models and report output in human readable format
# Iterate through each permutation of the data (all cells, L1 vs RL, etc)
# Save information in results.txt
# For each model, the report contains:
#   - confusion matrix on the test set
#   - performance metrics (per class and balanced average) and AUC
#   - permutation feature importances (training set, testing set, and their balanced average)
#   - model hyperparameters

# metrics dict for feature importance
metrics_dict_fi = {
    "accuracy": make_scorer(lambda ytest, ypred: custom_success_metrics(ytest, ypred, "accuracy")),
    "precision": make_scorer(lambda ytest, ypred: custom_success_metrics(ytest, ypred, "precision")),
    "sensitivity": make_scorer(lambda ytest, ypred: custom_success_metrics(ytest, ypred, "sensitivity")),
    "specificity": make_scorer(lambda ytest, ypred: custom_success_metrics(ytest, ypred, "specificity")),
}
with open("results.txt", "w") as file:
    # TODO: this first line looks like leftover placeholder text; replace with a real header if desired
    file.write("This is the text to write to the file.\n")
    for name, results_dict in result_dicts.items():
        # TEST MODEL
        # get the best model found for the permutation given judging by accuracy
        # these were manually confirmed in model validation to have acceptable values for the other evaluation metrics during cross validation
        best_estimator = results_dict['best_estimator']
        # identify whether this is a multi-class situation
        is_multi_class = len(best_estimator.classes_) > 2
        # use the stored split (useful if working from pickled models); it could also be recreated
        # with ttsplit(dict_of_df[name]) because ttsplit uses a fixed random seed
        # NOTE: the training data must come from the 'train_set_*' entries, otherwise the
        # "training set" feature importances below are silently computed on the test set
        X_train = results_dict['train_set_X']
        y_train = results_dict['train_set_y']
        X_test = results_dict['test_set_X']
        y_test = results_dict['test_set_y']
        # find model predicted values and probabilities on the testing data
        y_pred = best_estimator.predict(X_test)
        y_prob = best_estimator.predict_proba(X_test)

        # RECORD CONFUSION MATRIX
        cm = confusion_matrix(y_test, y_pred, normalize=None)
        # specify which data permutation we are using
        file.write(f'*{name}*\n')
        # record class order and the confusion matrix as calculated for the test data
        file.write(f'classes:{str(best_estimator.classes_)}\n')
        file.write(f'confusion matrix:\n{str(cm)}\n')

        # RECORD PERFORMANCE METRICS
        # evaluate and report model accuracy, precision, sensitivity, specificity
        metrics = ['accuracy', 'precision', 'sensitivity', 'specificity']
        for metric in metrics:
            met_vals = custom_success_metrics_by_class(y_test, y_pred, metric)
            file.write(f'{metric} by class: {str(met_vals)}\n')
            file.write(f'average {metric} (balanced, by class): {str(np.mean(met_vals))}\n')
        # evaluate Area Under the Curve
        # try both one-vs-one and one-vs-rest strategy for multi-class situations
        if is_multi_class:
            ovo = roc_auc_score(y_test, y_prob, multi_class='ovo')
            file.write(f'AUC (OVO): {str(ovo)}\n')
            ovr = roc_auc_score(y_test, y_prob, multi_class='ovr')
            file.write(f'AUC (OVR): {str(ovr)}\n')
        else:
            # binary case: score with the probability of the second class in classes_
            file.write(f'AUC: {str(roc_auc_score(y_test, y_prob[:,1]))}\n')

        # RECORD FEATURE IMPORTANCES
        # take the feature names from the data the model is actually scored on, so that the
        # response column is not listed as a feature and names line up with the importances
        features = list(X_test.columns)
        file.write(f"Features: {features}\n")
        # with a dict of scorers, permutation_importance returns one result per scorer name
        # find permutation-based feature importances using train set
        file.write("Permutation importance using training set:\n")
        pidicttrain = permutation_importance(best_estimator, X_train, y_train, scoring=metrics_dict_fi)
        for key, val in pidicttrain.items():
            towrite = dict(zip(features, val['importances_mean']))
            file.write(f"\tmean {key} importances: {str(towrite)}\n")
        # find permutation-based feature importances using test set
        file.write("Permutation importance using testing set:\n")
        pidicttest = permutation_importance(best_estimator, X_test, y_test, scoring=metrics_dict_fi)
        for key, val in pidicttest.items():
            towrite = dict(zip(features, val['importances_mean']))
            file.write(f"\tmean {key} importances: {str(towrite)}\n")
        # find averaged permutation-based feature importances
        # balanced average of evaluation on training and test set (2 sets x 4 metrics = 8 rows)
        file.write("Final feature importances (balanced average over all previously reported values):\n")
        feature_vals = np.stack([
            bunch[metric]['importances_mean']
            for bunch in (pidicttrain, pidicttest)
            for metric in metrics
        ])
        feature_means = np.mean(feature_vals, axis=0)
        feature_means_dict = dict(zip(features, feature_means))
        file.write("\t" + str(feature_means_dict) + "\n")
        file.write("Features in order of final importance:\n")
        feature_importance_list = sorted(feature_means_dict.items(), key=lambda pair: pair[1], reverse=True)
        for rank, (feature, importance) in enumerate(feature_importance_list, start=1):
            rank_label = f"{rank}."
            file.write(f"\t{rank_label:<5} {feature:<35} {importance}\n")

        # RECORD MODEL HYPERPARAMETERS
        file.write("Model details:\n")
        file.write(str(results_dict['best_params']) + '\n')
        file.write('\n\n')