In [None]:
import os
import re
import json
from operator import add
from functools import reduce

from scipy.optimize import leastsq
from dateutil import parser
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt

## load unsorted dataset

In [None]:
# Input table: exactly one `dataset_csv` assignment should be active;
# the commented ones are alternative input tables.
# dataset_csv = '/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_all_replicatenorm.csv'
# dataset_csv = '/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_all_replicatenorm_confocalblur.csv'
dataset_csv = '/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_all_imagenorm.csv'
# dataset_csv = '/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_all_imagenorm_confocalblur.csv'

# dataset_csv = '/scratch/hoerl/auto_sir_dna_comp/20220829_glcm-long_all_replicatenorm.csv'

df = pd.read_csv(dataset_csv)

## table cleaning

We first unify some dataset/replicate naming

In [None]:
def correct_len(s):
    """Fix incorrectly formatted dates: 8-character strings pass through, others get '2020' -> '20200'."""
    if len(s) == 8:
        return s
    return s.replace('2020', '20200')


def _prep_date(path):
    """Preparation date: first '_'-separated token of the third-to-last path component."""
    prep_dir = path.split('/')[-3]
    return correct_len(prep_dir.split('_')[0])


def _replicate_id(path):
    """'rep' + first character (underscores removed) after the last 'rep' in the second-to-last path component."""
    after_rep = re.split('rep', path.split('/')[-2])[-1]
    return 'rep' + after_rep.replace('_', '')[:1]


def _condition_name(path):
    """Treatment/condition: everything after the date token of the third-to-last path component."""
    prep_dir = path.split('/')[-3]
    return '_'.join(prep_dir.split('_')[1:])


# parse preparation date from filename
dates = df['filename'].apply(_prep_date)

# (biological replicate) preparation date + replicate id -> technical replicate identifier
replicate = dates + '_' + df['filename'].apply(_replicate_id)

# hacky solution: add 'w' suffix to 6-well samples
# (second character after the first 'rep' in the full path is 'w')
is_six_well = df['filename'].str.split('rep', expand=True)[1].str[1] == 'w'
replicate += np.where(is_six_well, 'w', '')

# get cell_class (treatment/condition) from filename
cell_class = df['filename'].apply(_condition_name)

# unify some condition names
cell_class = cell_class.str.replace('day', 'd_ICM_young')
condition_renames = {
    'IMR90_young_untr': 'IMR90_young_untreated',
    'IMR90_old': 'IMR90_untreated_old',
    'IMR90_6d_ICM': 'IMR90_6d_ICM_young',
    'IMR90_young': 'IMR90_young_untreated',
}
for old_name, new_name in condition_renames.items():
    cell_class[cell_class == old_name] = new_name

# condition := biological replicate id
df['condition'] = dates + '_' + cell_class
df['replicate'] = replicate
df['cell_class'] = cell_class

# add foreground mean_value
# newer versions of GLCM normalize intensity to 0-1 instead of old versions that did 0-255
# can be detected through presence of 'perc_{low/high}_image' column
rescale_factor = 255.0 if 'perc_low_image' not in df.columns else 1.0

# map the normalized mean back onto the [perc_low, perc_high] range
intensity_range = df['perc_high'] - df['perc_low']
df['fg_mean'] = df['intensity_mu'] / rescale_factor * intensity_range + df['perc_low']

In [None]:
# distinct condition names after unification
df['cell_class'].unique()

## load manual sorting from JSON

Sorting JSON files should contain lists of manually classified examples and follow this format:
```
{
   "good": [
       [filename, hdf5_dataset_name],
       ...
   ],
   "bad": [
       ...
   ]
}
```

In [None]:
# manual good/bad sorting, keyed by label
sorting_json_path = '/scratch/hoerl/auto_sir_dna_comp/sorting20211115_resorted.json'
with open(sorting_json_path, 'r') as sorting_file:
    sorting_dict = json.load(sorting_file)

def get_classification_from_dict(row, sorting_dict):
    """Look up the manual label of one table row.

    The lookup key is ``[basename of row.filename with '.h5' removed, row.dataset_name]``.

    Parameters
    ----------
    row : object with ``filename`` and ``dataset_name`` attributes (e.g. a DataFrame row)
    sorting_dict : dict with 'good' and 'bad' entries, each a list of ``[filename, dataset_name]`` pairs

    Returns
    -------
    str
        'good' or 'bad' if the key is listed under that label ('good' is checked first),
        otherwise 'unclassified'.
    """
    basename = os.path.basename(row.filename)
    key = [basename.replace('.h5', ''), row.dataset_name]

    for label in ('good', 'bad'):
        if key in sorting_dict[label]:
            return label
    return 'unclassified'

# label every row as 'good' / 'bad' / 'unclassified' from the manual sorting
df['classification_manual'] = df.apply(get_classification_from_dict, axis=1, args=(sorting_dict,))

In [None]:
# number of manually sorted examples per label
for label in sorting_dict:
    print(label, len(sorting_dict[label]))

### get features for classification from data

In [None]:
# columns holding metadata or labels rather than features
non_feature_columns = ['filename', 'dataset_name', 'condition', 'replicate', 'cell_class',
                       'classification_manual', 'classification_auto']

# get only feature column values
# errors='ignore': 'classification_auto' only exists once the inference cell further down has run;
# dropping it here keeps this cell re-runnable afterwards (its string values would break the imputer)
Xs = df.drop(columns=non_feature_columns, errors='ignore').values

# we have some NaNs, impute them
Xs = Imputer().fit_transform(Xs)
# scale
Xs = StandardScaler().fit_transform(Xs)

# only get Xs for which we have a manual classification
has_goodbad = df.classification_manual.isin(['good', 'bad']).values
Xs_goodbad = Xs[has_goodbad]

# encode good/bad as 1/0
# NOTE: we manually fit to ensure good=1
le_goodbad = LabelEncoder().fit(['bad', 'good'])
ys_goodbad = le_goodbad.transform(df.classification_manual.values[has_goodbad])

### init classifier and get total accuracy

In [None]:
# fixed seed: without it the forest differs on every run, and so do the accuracy,
# the PR curve and the probability threshold derived from it further down
model = RandomForestClassifier(random_state=0)

# get classifier performance via cross validation (mean accuracy over the folds)
np.mean(cross_val_score(model, Xs_goodbad, ys_goodbad))

### PR-curve and optimal threshold selection

We select the probability threshold so that we achieve a target precision.

In [None]:
# out-of-fold probabilities for the manually sorted rows; column 1 is the 'good' class
probs = cross_val_predict(model, Xs_goodbad, ys_goodbad, method='predict_proba')
prec, rec, thresh = precision_recall_curve(ys_goodbad, probs[:, 1])

# get lowest threshold with precision over target of 0.95
prec_target = 0.95
# prec/rec have one more entry than thresh: the final point has no threshold.
# Exclude it so that idx is always a valid index into thresh as well.
above_target = prec[:-1] > prec_target
if not above_target.any():
    raise ValueError(f'no probability threshold reaches a precision above {prec_target}')
idx = int(np.argmax(above_target))

plt.plot(rec, prec)
plt.annotate(f'precision: {prec[idx]}\nrecall: {rec[idx]}\n@thresh: {thresh[idx]}', [rec[idx], prec[idx]],
             arrowprops=dict(facecolor='black', shrink=0.05), xytext=(0.2, 0.85));
plt.xlabel('Recall');
plt.ylabel('Precision');
plt.title('PR-curve: good/bad classification');

### infer quality on the rest of the table

In [None]:
# default: pick probability threshold to reach target precision
prob_tresh = thresh[idx]
# prob_tresh = 0.5

# fit good/bad model on whole train set
# (model_goodbad is the same object as model, so this fits model in place)
model_goodbad = model
model_goodbad.fit(Xs_goodbad, ys_goodbad)

# get prediction for all rows (including the manually sorted ones)
# '>=' matches precision_recall_curve, which reports precision for score >= threshold;
# a strict '>' would discard every row whose probability equals the chosen threshold
ys_pred = model_goodbad.predict_proba(Xs)[:, 1] >= prob_tresh
df['classification_auto'] = le_goodbad.inverse_transform(ys_pred.astype(int))

# subset of good datapoints
# .copy(): later cells add columns to df_good; a plain slice of df would trigger SettingWithCopyWarning
df_good = df[df['classification_auto'] == 'good'].copy()

In [None]:
# feature importance, highest first
# Drop every metadata/label column so the remaining names are exactly the model inputs.
label_and_meta_columns = ['filename', 'dataset_name', 'condition', 'replicate', 'cell_class',
                          'classification_manual', 'classification_auto']
feature_names = df.drop(columns=label_and_meta_columns, errors='ignore').columns

# guard against silently pairing importances with the wrong names
assert len(feature_names) == len(model_goodbad.feature_importances_), \
    'feature names do not match the number of model inputs'

sorted(zip(model_goodbad.feature_importances_, feature_names), reverse=True)

## Save dataframe
Output of this is used in ```ananlysis_v2.ipynb```

In [None]:
# save the good examples to CSV
# NOTE(review): the file name says 'glcm-long ... replicatenorm', while the active input in the
# load cell is '20220816_glcm_all_imagenorm.csv' - confirm the name matches the loaded table
good_csv_path = '/scratch/hoerl/auto_sir_dna_comp/20220829_glcm-long_good95_replicatenorm.csv'
df_good.to_csv(good_csv_path, index=False)

In [None]:
# save whole df with classification, e.g. to look at examples
# df.to_csv('/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_all_imagenorm_withcls.csv')

## Sorting statistics
optionally only for images older than a given date

In [None]:
# table to summarise; the commented lines restrict it by preparation date
df_datfiltered = df
# df_datfiltered = df[df.replicate.apply(lambda s: parser.parse(s.split('_')[0])) < parser.parse('20201201')]
# df_datfiltered = df[df.replicate.apply(lambda s: parser.parse(s.split('_')[0])) > parser.parse('20210801')]

groupby_rows = ['cell_class', 'condition']
# groupby_rows = ['cell_class']

# per-group description of the automatic label, separately for each label;
# every row in a subset carries the same label, so 'freq' is the row count of the group
is_bad = df_datfiltered['classification_auto'] == 'bad'
is_good = df_datfiltered['classification_auto'] == 'good'
badsum = df_datfiltered[is_bad].groupby(groupby_rows)['classification_auto'].describe()
goodsum = df_datfiltered[is_good].groupby(groupby_rows)['classification_auto'].describe()

# groups missing from one of the two subsets get a count of 0
summary = pd.DataFrame({'bad': badsum['freq'], 'good': goodsum['freq']}).fillna(value=0)
summary['total'] = summary['good'] + summary['bad']
summary

## Intensity histograms

In [None]:
# general plot parameters, shared by the histogram cells below
figsize = (15, 10)  # figure size passed to matplotlib
xlim_for_plot = (0, 750)  # x-axis limits; also the range over which histogram bins are placed
bins_for_plot = 50  # number of histogram bins

# optional vertical red marker line at a fixed fg_mean value
plot_vline = False
vline_location = 50

In [None]:
# 1) Histograms per cell_class

fig = plt.figure(figsize=figsize)
axs = df_good.hist(column='fg_mean', by=['cell_class'], ax=fig.gca(), sharex=True,
                   bins=bins_for_plot, density=True)

# same x-range on every panel
for ax in axs.flat:
    ax.set_xlim(xlim_for_plot)

# optional marker line
if plot_vline:
    for ax in axs.flat:
        ax.axvline(vline_location, color='red')

In [None]:
# 2) histograms grouped by replicate

# one panel per cell_class, three panels per row; the grid grows with the number of classes
# instead of assuming that they fit into (and leave exactly one gap in) a 2x3 layout
class_groups = list(df_good.groupby('cell_class'))
n_cols = 3
n_rows = max(1, int(np.ceil(len(class_groups) / n_cols)))
fig, axs = plt.subplots(n_rows, n_cols, sharex=True, figsize=figsize, squeeze=False)

# bins_for_plot bins need bins_for_plot + 1 edges (same binning as the outline plot)
bin_edges = np.linspace(*xlim_for_plot, bins_for_plot + 1)

for (i, dfi), ax in zip(class_groups, axs.flat):
    # overlay one semi-transparent histogram per replicate
    for rep, dfj in dfi.groupby('replicate'):
        ax.hist(dfj.fg_mean.values, density=True, alpha=0.5, bins=bin_edges, label=rep)
    ax.set_title(i)
    ax.set_xlim(xlim_for_plot)
    if plot_vline:
        ax.axvline(vline_location, color='red')
    ax.legend()

# hide only the panels that did not receive a cell_class
for ax in axs.ravel()[len(class_groups):]:
    ax.set_visible(False)

In [None]:
# 3) plot histogram outlines

# one panel per cell_class, three panels per row; the grid grows with the number of classes
# instead of assuming that they fit into (and leave exactly one gap in) a 2x3 layout
class_groups = list(df_good.groupby('cell_class'))
n_cols = 3
n_rows = max(1, int(np.ceil(len(class_groups) / n_cols)))
fig, axs = plt.subplots(n_rows, n_cols, sharex=True, figsize=figsize, squeeze=False)

# bins_for_plot bins need bins_for_plot + 1 edges
bin_edges = np.linspace(*xlim_for_plot, bins_for_plot + 1)

for (i, dfi), ax in zip(class_groups, axs.flat):
    for rep, dfj in dfi.groupby('replicate'):
        h, bins = np.histogram(dfj.fg_mean.values, density=True, bins=bin_edges)
        # draw the density as a line through the bin centres
        ax.plot((bins[1:] + bins[:-1]) / 2, h, label=rep)
    ax.set_title(i)
    ax.set_xlim(xlim_for_plot)
    if plot_vline:
        ax.axvline(vline_location, color='red')
    ax.legend()

# hide only the panels that did not receive a cell_class
for ax in axs.ravel()[len(class_groups):]:
    ax.set_visible(False)

# Optional: Subset of data based on intensity

We also tried subsequent steps only on subsets of the data with similar intensities.

The following cells can be used either to create a subset of the data whose mean foreground intensity is close to a predefined target value (option 1), or to find the target intensity for which the selected subset deviates least from it across the whole dataset (option 2).

## 1) Pick subsample closest to target value

In [None]:
n_per_class = 150
target_value = 50

# absolute distance of each row's foreground mean to the target;
# computed on a new frame so that df_good itself is not modified
df_with_diff = df_good.assign(diff=(df_good.fg_mean - target_value).abs())

# for each (biological) replicate, pick the datapoints with the smallest difference to target_value
dfis = [dfi.sort_values('diff').iloc[:n_per_class] for _, dfi in df_with_diff.groupby('condition')]

# 'diff' was only needed for ranking; drop it by keyword (positional axis was removed in pandas 2.0)
df_selected = pd.concat(dfis).drop(columns=['diff'])

In [None]:
# write the intensity-matched subset to CSV (without the index)
selected_csv_path = 'C:/Users/hoerl/Downloads/20210419_glcm_all_extrafeats_selected50intensity.csv'
df_selected.to_csv(selected_csv_path, index=False)

## 2) pick subsample and target value with minimal total differences

In [None]:
# distinct condition names among the good rows
classes = df_good['cell_class'].unique()
# number of closest rows kept per cell_class
n_best = 250

def get_sum_diff(target_intensity, df_good, n_keep=None):
    """Select, per cell_class, the rows closest to a target intensity and sum their deviations.

    Parameters
    ----------
    target_intensity : scalar (or 1-element array, as passed by the optimizer)
        Intensity that ``fg_mean`` is compared against.
    df_good : pandas.DataFrame
        Table with at least the columns 'cell_class' and 'fg_mean'. It is not modified.
    n_keep : int, optional
        Number of closest rows kept per cell_class. Defaults to the notebook-level ``n_best``.

    Returns
    -------
    sum_dev : float
        Sum of ``|fg_mean - target_intensity|`` over all kept rows (0.0 for an empty table).
    dfs : dict
        Maps each cell_class to its kept rows, sorted by deviation, with a fresh 0..k-1 index
        and an extra 'diff' column.
    """
    if n_keep is None:
        n_keep = n_best  # notebook-level default, looked up at call time

    # new frame with the deviation column; leaves the caller's table untouched
    ranked = df_good.assign(diff=np.abs(df_good.fg_mean - target_intensity))

    dfs = {}
#     for idx, dfi in ranked.groupby(['cell_class', 'replicate']):
    # group by a plain column name so the keys are scalars (a one-element list
    # yields 1-tuple keys on pandas >= 2)
    for idx, dfi in ranked.groupby('cell_class'):
        dfs[idx] = dfi.sort_values('diff').head(n_keep).reset_index(drop=True)

    # built-in sum handles the no-groups case, where reduce() without an initial value raises
    sum_dev = float(sum(np.sum(df_best['diff']) for df_best in dfs.values()))
    return sum_dev, dfs

# single parameter version: only the summed deviation, evaluated on df_good
def f(ti):
    return get_sum_diff(ti, df_good)[0]

# optimize target_intensity, starting from 50
# NOTE(review): leastsq minimises the square of the value returned by f - confirm that it
# converges reliably on this objective
fit_result = leastsq(f, 50)
optimal_ti = fit_result[0]

# per-class selections at the optimised target intensity
dfs = get_sum_diff(optimal_ti, df_good)[1]
optimal_ti

In [None]:
# stack the per-class selections into one table
# pd.concat replaces the chained DataFrame.append, and drop() takes the axis by keyword:
# both DataFrame.append and the positional axis argument were removed in pandas 2.0
df_selected = pd.concat(list(dfs.values())).drop(columns=['diff'])
df_selected.to_csv('/Users/david/Downloads/20210707_glcm_good_selected_intensity_percondition.csv', index=False)