In [None]:
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
# from sklearn.manifold import TSNE
# from sklearn.decomposition import PCA
# from openTSNE import TSNE as oTSNE
# from umap import UMAP

In [None]:

## 1) read both texture and other feature tables, join

csv_feats_texture = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/texture_feats.csv'
# alternative input: texture features normalized per cell
# csv_feats_texture = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/texture_feats_normalized_per_cell.csv'
csv_feats_other = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/other_feats.csv'

# Inner-join the two per-object tables on their shared (file, label) key columns.
# The keys deliberately stay ordinary columns here: the metadata merge below joins
# on columns, and (file, label) is only turned into the index after that merge.
df = pd.read_csv(csv_feats_texture).merge(pd.read_csv(csv_feats_other), on=['file', 'label'])

# file name without directory and extension; used as key into the experiment overview
df['file_stem'] = df['file'].map(lambda f: Path(f).stem)

In [None]:
# experiment overview table (semicolon-separated): one row per image file
exp_overview_csv = '/scratch/hoerl/auto_sir_experiment_overview.csv'
overview_cols = ['file', 'treatment', 'replicate_technical', 'replicate_biological']
df_exp_overview = pd.read_csv(exp_overview_csv, sep=';')[overview_cols]

# replicate identifiers are handled as strings from here on
for replicate_col in ('replicate_technical', 'replicate_biological'):
    df_exp_overview[replicate_col] = df_exp_overview[replicate_col].map(str)

# attach treatment / replicate columns to every feature row (inner join on the file stem);
# the overview's own 'file' column is kept under the name 'file_duplicate'
df = df.merge(
    df_exp_overview,
    left_on='file_stem',
    right_on='file',
    suffixes=(None, '_duplicate'),
)

In [None]:
# use (file, label) as the row index again
# NOTE: not idempotent -- re-running this cell fails because the key columns are gone
df = df.set_index(keys=['file', 'label'])

df

In [None]:
# keep only biological replicate 1 (replicate ids were converted to strings above)
df = df.loc[df['replicate_biological'] == '1']

In [None]:
## 2) prepare features

# feature matrix: every texture column (name starts with 'tex') plus three extra columns
feature_cols = [col for col in df.columns if col.startswith('tex')]
feature_cols.extend(['other_mean_intensity', 'other_area', 'other_eccentricity'])
tex_values = df[feature_cols].values

# fill NaNs using SimpleImputer's default strategy
tex_values = SimpleImputer().fit_transform(tex_values)

# standardize every feature column
# NOTE(review): imputer and scaler are fitted on all rows, i.e. before any
# cross-validation split is made -- confirm this is acceptable for the evaluation.
scaler = StandardScaler().fit(tex_values)
tex_values = scaler.transform(tex_values)

# integer-encode the treatment labels
le = LabelEncoder().fit(df.treatment)
ys = le.transform(df.treatment)

# binarized ys for one-vs-rest scoring
binarizer = LabelBinarizer().fit(ys)
onehot_y = binarizer.transform(ys)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold, LeaveOneGroupOut

# Compare cross-validation strategies for the treatment classifier: for each strategy,
# collect out-of-fold class probabilities and draw the resulting precision-recall curve
# (annotated with overall accuracy and average precision) into one shared axes.

figsize = (8, 8)
fig, ax = plt.subplots(figsize=figsize)

# fixed seed: both the shuffled K-fold split and SVC's internal probability
# calibration are randomized, so without it every run gives different numbers
cv_seed = 0

cls = SVC(C=100, probability=True, random_state=cv_seed)

# group ids: one group per treatment x replicate combination
groups_technical = [
    c + '_' + r1 + '_' + r2
    for c, r1, r2 in zip(df.treatment, df.replicate_biological, df.replicate_technical)
]
groups_biological = [c + '_' + r for c, r in zip(df.treatment, df.replicate_biological)]

# (splitter, groups, description) triples
cv_strategies = (
    (StratifiedKFold(10, shuffle=True, random_state=cv_seed), None, '10-Fold Stratified CV'),
    (LeaveOneGroupOut(), groups_technical, 'Leave-One-Replicate-Out (Technical)'),
    (LeaveOneGroupOut(), groups_biological, 'Leave-One-Replicate-Out (Biological)')
)


def has_class_in_single_group(labels, groups):
    """Return True if at least one class has all of its samples in a single group.

    With leave-one-group-out, such a class disappears from the training data as soon
    as its only group is held out, so it can never be predicted correctly (and a
    two-class problem cannot be fitted at all). Returns False when `groups` is None.
    """
    if groups is None:
        return False
    groups_by_class = {}
    for label, group in zip(labels, groups):
        groups_by_class.setdefault(label, set()).add(group)
    return any(len(members) == 1 for members in groups_by_class.values())


for cv, groups, desc in cv_strategies:

    # e.g. the biological strategy degenerates to one group per treatment when the
    # data contain a single biological replicate
    if has_class_in_single_group(ys, groups):
        print(f'Skipping "{desc}": at least one class occurs in only one group and '
              'would be missing from training when that group is held out.')
        continue

    prob_pred = cross_val_predict(cls, tex_values, ys, cv=cv, groups=groups, n_jobs=-1, method='predict_proba')
    # probability columns are ordered by class, and ys are the encoded class indices
    pred = np.argmax(prob_pred, axis=1)

    # we have only 2 classes -> use probabilities for class 1 for further analysis
    # (keeps a single column, matching the one-column binarized labels)
    if prob_pred.shape[1] == 2:
        prob_pred = prob_pred[:, 1:2]

    # get accuracy, AP & PR curve (micro-averaged over all label/score entries)
    overall_acc = np.mean(pred == ys)
    y_true_flat = onehot_y.ravel()
    y_score_flat = prob_pred.ravel()
    pr, re, _ = precision_recall_curve(y_true_flat, y_score_flat)
    ap = average_precision_score(y_true_flat, y_score_flat)

    ax.plot(re, pr, label=f'{desc}\naccuracy={overall_acc:.3f}\nAP={ap:.3f}', lw=2)

# add "random guess" baseline: its precision equals the fraction of positive entries
# in the labels that were actually scored above. This is 1 / n_classes for the
# multi-class case and the class-1 prevalence for the two-class case.
baseline_pr = onehot_y.mean()

ax.plot([0, 1], [baseline_pr, baseline_pr], linestyle='dashed', color='firebrick', lw=2, label='random guess')

ax.set_xlim(0, 1)
ax.set_xlabel('Recall')
ax.set_ylim(0, 1)
ax.set_ylabel('Precision')
ax.legend();

In [None]:
# out-of-fold class predictions for the first CV strategy, plus their string labels

cv, groups, _ = cv_strategies[0]

pred = cross_val_predict(
    estimator=cls,
    X=tex_values,
    y=ys,
    groups=groups,
    cv=cv,
    n_jobs=-1,
)

# map encoded predictions back to the original treatment names
labs_pred = le.inverse_transform(pred)

In [None]:
# Confusion matrix with one row per (treatment, replicate) combination and one column
# per predicted class; rows are normalized to fractions and shown as a heatmap.

n_classes = np.max(ys) + 1
conf_mat = defaultdict(lambda: np.zeros(n_classes))

# which replicate column defines the rows
# replicates_ = df.replicate_biological
replicates_ = df.replicate_technical

# encode all predicted labels in one call instead of one le.transform call per row
pred_codes = le.transform(labs_pred)

# go through all predictions, increment corresponding row
for cond, repl, code in zip(df.treatment, replicates_, pred_codes):
    conf_mat[(cond, repl)][code] += 1

# sort rows once, by their (treatment, replicate) key only
rows = sorted(conf_mat.items(), key=lambda item: item[0])

# row labels: treatment, replicate and number of samples in that row
input_cls = [key + (f'N: {int(counts.sum())}',) for key, counts in rows]

# make matrix from dict, normalize per-row
mat = np.array([counts for _, counts in rows])
mat = mat / mat.sum(axis=1, keepdims=True)

# set before any export
plt.rcParams['pdf.fonttype'] = 42

# plot as heatmap
fig_cm, ax_cm = plt.subplots(figsize=(12, 8))
im = ax_cm.imshow(mat, cmap='Blues', aspect=0.2)
ax_cm.set_yticks(np.arange(len(input_cls)))
ax_cm.set_yticklabels([', '.join(c) for c in input_cls])
ax_cm.set_xticks(np.arange(n_classes))
ax_cm.set_xticklabels(le.inverse_transform(np.arange(n_classes)), rotation='vertical')

# add the colorbar first so that tight_layout can take it into account
fig_cm.colorbar(im, ax=ax_cm, shrink=.8)
fig_cm.tight_layout();