In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    ConfusionMatrixDisplay,
    classification_report,
)
from xgboost import XGBClassifier, plot_importance

from graph_neuralmorpho.morphometrics.morphopy import MorphopyFeatures

In [None]:
# Root of the benchmark dataset and the sub-folder holding the precomputed
# MorphoPy feature tables.
data_path = Path("/Users/kyle/Library/CloudStorage/Box-Box/FoxLab/data-microscopy/neuromorpho-benchmark")
morphopy_features_path = data_path / "processed/morphopy_features"

# Project loader; exposes `.data` (feature table), `.neurons` and `.target`.
morphopy_features = MorphopyFeatures(morphopy_features_path)
neuron_name = morphopy_features.neurons
target = morphopy_features.target

# Exclude every column whose name mentions thickness, volume or surface
# (presumably the radius-dependent measures -- confirm against the loader).
all_columns = morphopy_features.data.columns
radius_measure_masks = all_columns.str.contains(r"thickness|volume|surface")
cols = all_columns[~radius_measure_masks]

# Work on a copy of the retained columns and discard any column containing NaN.
features = morphopy_features.data[cols].copy().dropna(axis=1)

In [None]:
# drop_idxs = np.where((target == 2) | (target == 4))[0]
# features.drop(drop_idxs, inplace=True)
# target.drop(drop_idxs, inplace=True)
# target = target.cat.remove_unused_categories()
# neuron_name.drop(drop_idxs, inplace=True)
# target.replace(morphopy_features.label_dict, inplace=True)

## K-fold cross-validation

## MorphoPy feature classification

In [None]:
from sklearn.model_selection import StratifiedKFold

# Hold out 20% of the samples as a test set, preserving the class proportions
# of `target` in both partitions; the seed pins the split.
holdout_options = dict(test_size=0.2, random_state=42, stratify=target)
X_train, X_test, y_train, y_test = train_test_split(features, target, **holdout_options)

# Seeded, shuffled 10-fold stratified splitter used for cross-validation.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score

# XGBoost classifier with default hyperparameters (tree_method="hist" left disabled).
bst = XGBClassifier()
# Hand the splitter itself to `cv` instead of a one-shot generator from
# kf.split(...): cross_val_score calls kf.split(X_train, y_train) internally,
# so the folds are the same, but the cell no longer depends on a generator
# that is exhausted after a single pass.
scores = cross_val_score(bst, X_train, y_train, cv=kf, scoring="accuracy")

# Report the per-fold accuracies as mean and spread so the result is visible.
print(f"MorphoPy {kf.get_n_splits()}-fold CV accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")

In [None]:
# Randomly permute the labels (seeded), then renumber the index from 0 so the
# shuffled labels are no longer tied to their original row labels.
# NOTE(review): `target_shuffled` is not used by the cells visible here --
# presumably intended as a label-permutation control; confirm.
permuted_target = target.sample(frac=1, random_state=42)
target_shuffled = permuted_target.reset_index(drop=True)

In [None]:
# Stratified 80/20 train/test split.
# NOTE: random_state is left disabled here, so the split (and therefore the
# reported accuracy) changes from run to run; re-enable it for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    # random_state=42,
    stratify=target,
)
# Fit an XGBoost classifier with default hyperparameters on the training split.
bst = XGBClassifier()
bst.fit(X_train, y_train)
# Predict labels for the held-out split.
preds = bst.predict(X_test)

# Use the builtin round() rather than the `.round()` method: accuracy_score may
# return a plain Python float (which has no `.round` attribute) depending on
# the scikit-learn version, while round() works for both float and numpy scalars.
accuracy = round(accuracy_score(y_test, preds), 4)
print(f"MorphoPy model accuracy: {accuracy}")

In [None]:
# Plot the feature importances of the classifier fitted in the previous cell.
plot_importance(bst)

In [None]:
# Fix the class order once and use it for both the matrix and the tick labels.
# Without an explicit `labels`, confusion_matrix only covers classes present in
# y_test/preds, which can disagree with np.unique(target) and misalign (or
# mismatch in length with) the displayed labels.
class_labels = np.unique(target)
conf_mat = confusion_matrix(y_test, preds, labels=class_labels)
# conf_mat = confusion_matrix(y_test, preds, labels=class_labels, normalize="true")
conf_disp = ConfusionMatrixDisplay(conf_mat, display_labels=class_labels)
conf_disp.plot()


In [None]:
# # take repeated samples of pyramidal cells and run classification
# rand_states = np.random.randint(0, 10000, size=1000)
# sample_state = {}

# for state in rand_states:
#     data = features.df_scaled.drop(columns="early_branch_path")
#     # grab subset of pyramidal neurons to match interneuron count
#     num_interneurons = metadata_labels.query("pyramidal == 0").shape[0]
#     pyramidal_names = metadata_labels.query("pyramidal == 1").sample(num_interneurons, random_state=state)[
#         "neuron_name"
#     ]
#     interneuron_names = metadata_labels.query("pyramidal == 0")["neuron_name"]
#     cell_names = pyramidal_names.to_list() + interneuron_names.to_list()
#     cell_idxs = np.where(features.neuron_names.isin(cell_names))[0]
#     data = data.iloc[cell_idxs]
#     target = metadata_labels[metadata_labels["neuron_name"].isin(cell_names)]["pyramidal"]

#         # split data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         data, target, test_size=0.2, random_state=42, stratify=target
#     )
#     # train model
#     bst = XGBClassifier(
#         # n_estimators=3,
#         # max_depth=100,
#         # eval_metric="accuracy"
#     )
#     # fit model
#     bst.fit(X_train, y_train)
#     # make predictions
#     preds = bst.predict(X_test)
#     sample_state[state] = accuracy_score(y_test, preds)