In [None]:
from functools import reduce

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
from pathlib import Path

## 1) read both texture and other feature tables, join

# NOTE: choose between globally normalized texture features and the per-cell
# normalized variant by (un)commenting the matching csv_feats_texture line.
# csv_feats_texture = '/Users/david/Desktop/IMR90_30112022/segmentation_cellpose_d120_ft06/texture_feats.csv'
# csv_feats_texture = '/Users/david/Desktop/IMR90_30112022/segmentation_cellpose_d120_ft06/texture_feats_normalized_per_cell.csv'
# csv_feats_other = '/Users/david/Desktop/IMR90_30112022/segmentation_cellpose_d120_ft06/other_feats.csv'

csv_feats_texture = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/texture_feats.csv'
# csv_feats_texture = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/texture_feats_normalized_per_cell.csv'
csv_feats_other = '/scratch/hoerl/20230507_imr90_stitching/20230507_imr90_ov_stitch_output/segmentation_cellpose_maxproj_d70_ft07/other_feats.csv'

# Inner-join the two feature tables on their shared ('file', 'label') key columns.
# The keys are deliberately kept as ordinary columns here: they are needed as
# columns for the file-stem lookup below and are set as the index in a later cell.
df = pd.read_csv(csv_feats_texture).merge(
    pd.read_csv(csv_feats_other),
    on=['file', 'label'],
)

# File name without directory and extension; used as join key against the
# experiment overview table.
df['file_stem'] = df['file'].map(lambda f: Path(f).stem)

In [None]:
exp_overview_csv = '/scratch/hoerl/auto_sir_experiment_overview.csv'

# Keep only the overview columns that are needed downstream.
overview_cols = ['file', 'treatment', 'replicate_technical', 'replicate_biological', 'overlapping_tiles']
df_exp_overview = pd.read_csv(exp_overview_csv, sep=';')[overview_cols]

# Replicate identifiers are converted to strings (they are later joined into
# text labels for plotting).
for replicate_col in ('replicate_technical', 'replicate_biological'):
    df_exp_overview[replicate_col] = df_exp_overview[replicate_col].apply(str)

# Attach the experiment metadata to every feature row by matching the feature
# table's file stem against the overview's 'file' column. The overview's own
# 'file' column collides with the feature table's and is renamed 'file_duplicate'.
df = df.merge(
    df_exp_overview,
    left_on='file_stem',
    right_on='file',
    suffixes=(None, '_duplicate'),
)

# remove non-overlapping overviews (run 3f2f7d32d280ce05293143834aa15a08)
# NOTE(review): assumes 'overlapping_tiles' was parsed as a boolean column - confirm.
has_overlapping_tiles = df['overlapping_tiles']
df = df[has_overlapping_tiles]

# set ('file', 'label') as index again
df = df.set_index(['file', 'label'])

# only use single replicate
# df = df[df['replicate_biological'] == '1']

In [None]:
# Show the merged and filtered feature table (features + experiment metadata,
# indexed by ('file', 'label')) as a sanity check.
df

In [None]:
# Reference classes used to train the classifier; every other treatment is
# only scored by it afterwards.
oldyoung_classes = ['young', 'old']

# Row mask: True where the treatment is one of the reference classes.
is_oldyoung = df['treatment'].isin(oldyoung_classes)

df_oldyoung = df[is_oldyoung]
df_treated = df[~is_oldyoung]

In [None]:
## 2) prepare features

# Feature set: every column whose name starts with 'tex' (texture features)
# plus mean intensity, area and eccentricity.
feature_cols = [c for c in df.columns if c.startswith('tex')] + ['other_mean_intensity', 'other_area', 'other_eccentricity']

tex_values_oldyoung = df_oldyoung[feature_cols].values
tex_values_treated = df_treated[feature_cols].values

# We have NaNs -> impute.
# The imputer is fitted on the old/young (training) data only and then applied
# unchanged to the treated data, exactly like the scaler below. Fitting a second
# imputer on the treated data would fill gaps with different statistics than the
# classifier was trained on, and could even drop a different set of all-NaN
# columns, leaving the two arrays with mismatched widths.
imputer = SimpleImputer()
tex_values_oldyoung = imputer.fit_transform(tex_values_oldyoung)
tex_values_treated = imputer.transform(tex_values_treated)

# Normalize features: scaling parameters come from old/young only and are
# reused for the treated data.
scaler = StandardScaler()
tex_values_oldyoung = scaler.fit_transform(tex_values_oldyoung)
tex_values_treated = scaler.transform(tex_values_treated)

# Encode the 'young'/'old' treatment labels as integers for the classifier.
le_oldyoung = LabelEncoder()
y_oldyoung = le_oldyoung.fit_transform(df_oldyoung.treatment)

In [None]:
# Fixed seed so that the CV fold assignment and the SVC's internal probability
# calibration are identical on every Restart & Run All.
RANDOM_SEED = 42

# SVC, C<1 -> stronger regularization than the default
cls = SVC(C=0.1, probability=True, class_weight='balanced', random_state=RANDOM_SEED)

# cls = LogisticRegression(max_iter=1000, class_weight='balanced')

# CV scoring to assess classifier performance on old/young.
# NOTE: imputation and scaling were fitted on the full old/young set before
# this split, so the estimate below may be slightly optimistic.
cv = StratifiedKFold(5, shuffle=True, random_state=RANDOM_SEED)
y_oldyoung_pred = cross_val_predict(cls, tex_values_oldyoung, y_oldyoung, n_jobs=-1, cv=cv)

# Fraction of old/young cells whose out-of-fold prediction matches the true label.
np.mean(y_oldyoung_pred == y_oldyoung)

In [None]:
# Fit again on the whole old/young dataset: cross_val_predict above only fitted
# clones, so `cls` itself is still unfitted at this point. The fitted model is
# used below to score the treated cells. Trailing ';' suppresses the estimator repr.
cls.fit(tex_values_oldyoung, y_oldyoung);

In [None]:
# Integer codes the label encoder assigned to 'young' and 'old', in the order
# given by oldyoung_classes. They are used below to pick the matching
# predict_proba columns.
young_old_indices = le_oldyoung.transform(oldyoung_classes)
young_old_indices

In [None]:
# Class probabilities for every treated cell; one column per class code.
y_treated_pred = cls.predict_proba(tex_values_treated)

# Column positions of the 'young' and 'old' probabilities.
idx_young, idx_old = young_old_indices[0], young_old_indices[1]

# Per-cell prediction table: experiment metadata (renamed) plus the two
# predicted probabilities, keeping the index of df_treated.
df_pred = (
    df_treated[['treatment', 'replicate_biological', 'replicate_technical']]
    .rename(columns={
        'treatment': 'cell_class',
        'replicate_biological': 'replicate_bio',
        'replicate_technical': 'replicate_tech',
    })
    .assign(
        prob_young=y_treated_pred[:, idx_young],
        prob_old=y_treated_pred[:, idx_old],
    )
)

In [None]:
import seaborn as sns

## group by technical or biological replicates
# df_grouped = df_pred.groupby(['cell_class', 'replicate_tech'])[['prob_young', 'prob_old']]
df_grouped = df_pred.groupby(['cell_class', 'replicate_bio'])[['prob_young', 'prob_old']]

# average predicted probability per group
df_confmat = df_grouped.mean()

# add group size to the index -> used for the row labels in the plot
df_confmat['count'] = [f'N={c}' for c in df_grouped.count().iloc[:, 0].values]
df_confmat = df_confmat.set_index('count', append=True)

plt.figure(figsize=(6,8))
ax = sns.heatmap(df_confmat, cmap='Blues', annot=True, vmin=0, vmax=1)

# Heatmap cell i spans [i, i+1] on both axes, so a label has to sit at i + 0.5
# to be centered on its row/column (integer positions land on the cell edges).
row_centers = [i + 0.5 for i in range(len(df_confmat))]
col_centers = [i + 0.5 for i in range(len(oldyoung_classes))]

# Row label: "<cell_class>, <replicate>, N=<count>" built from the index levels.
ax.set_yticks(row_centers)
ax.set_yticklabels([', '.join(levels) for levels in df_confmat.index])

# Column labels: class names in the same order as the prob_young/prob_old columns.
ax.set_xticks(col_centers)
ax.set_xticklabels(oldyoung_classes)

# save; font type 42 embeds TrueType fonts in the PDF
plt.rc('pdf', fonttype=42)
plt.savefig('/home/hoerl/ageing_dna_texture_figure_parts/confusionmatrix_oldyoung-classification_confocal.pdf')

In [None]:
# Show the table behind the heatmap: mean predicted probabilities per
# (cell_class, replicate_bio, count) group.
df_confmat