In [None]:
from functools import reduce
import random
from itertools import product
from concurrent.futures import ThreadPoolExecutor

import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.base import clone

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

In [None]:
# Load the per-cell feature table (one row per cell, see the replicate-size filter below).
# Earlier exports, kept for reference:
# df = pd.read_csv('/scratch/hoerl/auto_sir_dna_comp/20220125_glcm_good_lithreshold_smallblur.csv')
# df = pd.read_csv('/scratch/hoerl/auto_sir_dna_comp/20211115_glcm_good_lithreshold_smallblur.csv')
# df = pd.read_csv('/scratch/hoerl/auto_sir_dna_comp/20210719_glcm_good_all_confocalblur.csv')
df = pd.read_csv('/scratch/hoerl/auto_sir_dna_comp/20220816_glcm_good_replicatenorm_confocalblur.csv')

# rename 'IMR90_young' to 'IMR90_young_untreated'
df.cell_class = df.cell_class.replace(['IMR90_young'], 'IMR90_young_untreated')

# remove the "young" cells, as we do not have enough replicates with high mean intensity
# df = df[~ df.cell_class.isin(['IMR90_young_untreated'])]

# remove 3d, 9d -> we do not have enough biological replicates
# df = df[~ df.cell_class.isin(['IMR90_3d_ICM_young', 'IMR90_9d_ICM_young'])]

# keep only replicates above a certain mean fg intensity
# df = pd.concat([dfi for _, dfi in df.groupby(['cell_class', 'replicate']) if dfi.fg_mean.mean() > 100])

# give the replicates from last batch from 6-well extra 'w' suffix
# idx = df.filename.str.contains('data') & df.filename.str.contains('well')
# df.loc[idx, 'replicate'] += 'w'

# Optional: filter by foreground brightness
# df = df[df.fg_mean > 100]

# keep only (cell_class, replicate) groups with MORE than min_num_cells cells
min_num_cells = 10
replicate_groups = [
    dfi for _, dfi in df.groupby(['cell_class', 'replicate'])
    if len(dfi) > min_num_cells
]
if not replicate_groups:
    raise ValueError(f'no (cell_class, replicate) group has more than {min_num_cells} cells')

# pd.concat replaces the removed DataFrame.append; like append it keeps the
# original row index and stacks the groups in sorted group-key order
df = pd.concat(replicate_groups)

In [None]:
# number of cells and mean foreground intensity per (cell_class, condition)
df.groupby(['cell_class', 'condition'])['fg_mean'].describe().loc[:, ['count', 'mean']]

In [None]:
# cell_class -> array of the replicates available for that class.
# Group by the plain column name, not a one-element list: with a list, newer
# pandas yields 1-tuple keys ('class',) instead of 'class', and the
# (cell_class, replicate) pairs built from these keys would never match.
d = {k: dfi.replicate.unique() for k, dfi in df.groupby('cell_class')}

# get all possible ways of leaving one replicate per cell_class out
# (one tuple per combination, ordered like d.keys())
combos = list(product(*d.values()))
len(combos)

In [None]:
def run_split(df, combo, cls=SVC()):
    '''
    Split df by holding out the given (cell_class, replicate) groups as
    validation set, train a classifier on the rest and return its validation score.

    Parameters
    ----------
    df : DataFrame with at least the columns 'cell_class', 'replicate' and the
        auxiliary columns listed in columns_to_drop; all remaining columns are
        used as features.
    combo : iterable of (cell_class, replicate) pairs to hold out.
    cls : estimator template; it is cloned, so the passed object is never fitted.

    Returns
    -------
    Result of cls.score() on the held-out rows.

    Raises
    ------
    ValueError if the split leaves the training or the validation set empty.
    '''

    held_out = {tuple(pair) for pair in combo}

    # one pass over the groups: each goes either to validation or to training
    val_parts, train_parts = [], []
    for key, dfi in df.groupby(['cell_class', 'replicate']):
        (val_parts if key in held_out else train_parts).append(dfi)

    if not val_parts or not train_parts:
        raise ValueError('split needs at least one validation and one training group')

    df_val = pd.concat(val_parts)
    df_train = pd.concat(train_parts)

    # cell_class is target, encoded as numeric labels (encoder fitted on train only)
    le = LabelEncoder()
    y_train = le.fit_transform(df_train.cell_class)
    y_val = le.transform(df_val.cell_class)

    # everything that is not a feature: paths, labels, good/bad classification, auxiliary columns
    columns_to_drop = ['dataset_name', 'filename', 'classification_manual', 'classification_auto', 'replicate',
                       'cell_class', 'condition', 'img_height', 'img_width', 'mask_area',
#                       'intensity_mu','fg_mean', 'intensity_sigma',
                       'perc_high', 'perc_low'
                      ]  # + [c for c in df.columns if not 'LBP' in c ]

    x_train = df_train.drop(columns=columns_to_drop).values
    x_val = df_val.drop(columns=columns_to_drop).values

    # impute NaNs and normalize; both transformers are fitted on the training
    # data only and then applied unchanged to the validation data, so no
    # validation statistics leak in and both arrays keep the same columns
    imputer = SimpleImputer()
    x_train = imputer.fit_transform(x_train)
    x_val = imputer.transform(x_val)

    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_val = sc.transform(x_val)

    # train a fresh copy of the classifier and evaluate it
    model = clone(cls)
    model.fit(x_train, y_train)
    return model.score(x_val, y_val)

In [None]:
# how many random splits to try
n_repeat = 200

# classifier to use
cls = RandomForestClassifier(300)

# draw distinct leave-out combinations; random.sample raises ValueError when
# asked for more items than exist, so never request more than len(combos)
combo_sample = random.sample(combos, min(n_repeat, len(combos)))

# evaluate the sampled splits in a thread pool
with ThreadPoolExecutor() as tpe:

    # pair every left-out replicate with its cell_class -> list of (cell_class, replicate)
    futures = [
        tpe.submit(run_split, df, list(zip(d.keys(), left_out)), cls)
        for left_out in combo_sample
    ]

    # get results: scores, in the same order as combo_sample
    scores = [f.result() for f in tqdm.tqdm(futures)]

np.mean(scores), np.std(scores)

In [None]:
# histogram (25 bins) of the validation scores collected in `scores`
plt.hist(scores, bins=25)

In [None]:
# one column per cell_class holding the replicate that was left out in each
# sampled split, plus the score that split achieved
left_out_by_class = np.array(combo_sample).T
score_df = pd.DataFrame.from_dict(
    {cell_class: left_out for cell_class, left_out in zip(d.keys(), left_out_by_class)}
)
score_df['score'] = scores

# median score per left-out replicate, separately for every cell_class
for cell_class in d.keys():
    print(score_df.groupby(cell_class)['score'].median())

In [None]:
# sampled splits ordered from lowest to highest score
score_df.sort_values(by='score')

In [None]:
# median foreground intensity of each (cell_class, replicate) group
df.groupby(['cell_class', 'replicate'])['fg_mean'].median()

In [None]:
# list all columns of the loaded table (features and auxiliary columns)
df.columns

In [None]:
from functools import reduce
from operator import add

from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from sklearn.preprocessing import OneHotEncoder

# a cross-validation group is one replicate of one cell_class; the target is cell_class
group_columns = ['cell_class', 'replicate']
class_columns = ['cell_class']

cls = SVC(C=100.0, class_weight='balanced', probability=True)
# cls = SVC(C=100.0, probability=True)
# cls = RandomForestClassifier(n_estimators=300, class_weight='balanced')
# cls = RandomForestClassifier()

# NOTE: over/undersampling to balance classes did not really help...
# cls = make_pipeline(
#     RandomOverSampler(),
#     cls
# )

# columns to drop from features:
# filepaths, classes, good/bad cls & auxiliary features
columns_to_drop = ['dataset_name', 'filename', 'classification_manual', 'classification_auto', 'replicate',
                   'cell_class', 'condition',
                   'img_height', 'img_width', 'mask_area',
                   'num_blank_rows', 'num_blank_cols',
#                    'intensity_mu', 'intensity_sigma',
                   'perc_high', 'perc_low', 'fg_mean',
                   'perc_high_image', 'perc_low_image'
                  ]

# raw feature matrix (may contain NaNs, not scaled)
X_raw = df.drop(columns=columns_to_drop).values

# imputed + standardized on ALL rows; only meant for fitting on the complete
# data set afterwards, not for cross-validation
X = SimpleImputer().fit_transform(X_raw)
sc = StandardScaler()
X = sc.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df[class_columns].values.ravel())

# integer id per (cell_class, replicate) group; unlike concatenating the two
# values this cannot confuse different pairs and works for non-string replicates
groups = df.groupby(group_columns).ngroup().values

# leave-one-replicate-out predictions: imputer and scaler sit inside the
# pipeline so they are re-fitted on the training part of every fold and the
# held-out replicate never influences its own preprocessing
cv_model = make_pipeline(SimpleImputer(), StandardScaler(), cls)
prob = cross_val_predict(cv_model, X_raw, y, cv=LeaveOneGroupOut(), groups=groups, n_jobs=16, method='predict_proba')
pred = np.argmax(prob, axis=1)

In [None]:
# fraction of cells whose cross-validated prediction equals the true class
np.mean(pred == y)

In [None]:
# fit the classifier on the complete (imputed, scaled) data set and show the
# per-class weights it ended up using
cls.fit(X, y)
cls.class_weight_

In [None]:
# show the hyper-parameters of the current classifier
cls.get_params()

In [None]:
from collections import defaultdict

# rows of the matrix: one per distinct combination of these columns
label_columns = ['cell_class', 'condition']
# label_columns = ['cell_class']

# row label -> count of predictions per class (one slot per encoded class)
conf_mat = defaultdict(lambda : np.zeros(np.max(y) + 1))

# go through all predictions, increment the predicted class in the row of the sample
for row_key, lab_pred in zip(map(tuple, df[label_columns].values), pred):
    conf_mat[row_key][lab_pred] += 1

# rows in sorted label order; the tick label also shows the number of samples
sorted_rows = sorted(conf_mat.items())
input_cls = [key + (f'N: {int(counts.sum())}' ,) for key, counts in sorted_rows]

# make matrix from dict, normalize per-row
mat = np.array([counts for _, counts in sorted_rows])
mat = mat / mat.sum(axis=1, keepdims=True)

# plot as heatmap: true label group (rows) vs. predicted class (columns)
class_ids = np.arange(np.max(y) + 1)
plt.figure(figsize=(12,8))
plt.imshow(mat, cmap='Blues', aspect=0.2)
plt.yticks(ticks=np.arange(len(input_cls)), labels=[', '.join(c) for c in input_cls]);
plt.xticks(ticks=class_ids, labels=le.inverse_transform(class_ids), rotation='vertical');


# fonttype 42 = TrueType, keeps text editable in exported PDFs
plt.rcParams['pdf.fonttype'] = 42
plt.tight_layout()

plt.colorbar(shrink=.8)

In [None]:
from collections import defaultdict

# rows of the matrix: one per distinct combination of these columns
label_columns = ['cell_class', 'condition']
# label_columns = ['cell_class']

# row label -> summed predicted class probabilities of all samples in that row
conf_mat = defaultdict(lambda : np.zeros(np.max(y) + 1))
# row label -> number of samples; counted explicitly because the summed
# probabilities are floats and int() of e.g. 41.999999 would report 41
n_samples = defaultdict(int)

# go through all predictions, add the probability vector to the corresponding row
for prob_, *grp in zip(prob, *df[label_columns].values.T,):
    conf_mat[tuple(grp)] += prob_
    n_samples[tuple(grp)] += 1

# get sorted label + number of samples
row_keys = sorted(conf_mat)
input_cls = [key + (f'N: {n_samples[key]}' ,) for key in row_keys]

# make matrix from dict, normalize per-row
mat = np.array([conf_mat[key] for key in row_keys])
mat = mat / np.sum(mat, axis=1).reshape((-1,1))

# plot as heatmap: true label group (rows) vs. mean predicted probability per class (columns)
plt.figure(figsize=(12,8))
plt.imshow(mat, cmap='Blues', aspect=0.2)
plt.yticks(ticks=np.arange(len(input_cls)), labels=[', '.join(c) for c in input_cls]);
plt.xticks(ticks=np.arange(np.max(y) + 1), labels=le.inverse_transform(np.arange(np.max(y) + 1)), rotation='vertical');


# fonttype 42 = TrueType, keeps text editable in exported PDFs
plt.rcParams['pdf.fonttype'] = 42
plt.tight_layout()

plt.colorbar(shrink=.8)

In [None]:
from scipy.special import comb

# number of distinct 5-element subsets of 28 items ("28 choose 5")
comb(28, 5)

In [None]:
from itertools import combinations, islice
from math import comb


def nth(iterable, n, default=None):
    """
    Return the item at position n of iterable (0-based), or default if the
    iterable has fewer than n + 1 items. An iterator is advanced past that item.
    """
    # skip the first n items; the first thing left (if any) is the answer
    for item in islice(iterable, n, None):
        return item
    return default

def sample_iterable(iterable, n, length):
    """
    Draw n items at distinct random positions from an iterable of known length.

    iterable may be a one-shot iterator: it is walked exactly once, up to the
    largest sampled position, so every sampled index is an absolute position
    (stepping through an iterator repeatedly would make later indices relative
    to wherever the previous lookup stopped).

    Returns the items in the order the positions were drawn; a position beyond
    the real end of the iterable yields None.
    Raises ValueError (from random.sample) if n > length.
    """
    idxs = random.sample(range(length), n)
    wanted = set(idxs)
    last = max(idxs, default=-1)

    # single pass, stopping right after the last position we need
    picked = {i: item for i, item in enumerate(islice(iterable, last + 1)) if i in wanted}
    return [picked.get(i) for i in idxs]

# draw 2 random pairs of per-cell probability vectors; the length passed in must
# be the number of 2-combinations that the iterator actually yields
sample_iterable(combinations(prob, 2), 2, comb(len(prob), 2))

In [None]:
import random
from collections import defaultdict

def sample_combinations(N, k, m):
    """
    Randomly sample up to m distinct sets of indices for picking k elements
    from a length-N list/array.

    Each result is a sorted tuple of k distinct indices in range(N). If fewer
    than m such combinations exist (including none at all when k > N), all of
    them are returned instead of looping forever. The order of the returned
    list is arbitrary.
    """
    from itertools import combinations
    from math import comb

    if m <= 0:
        return []

    n_possible = comb(N, k)  # 0 when k > N

    # When we need a large share of all combinations, rejection sampling would
    # stall (or never end if m > n_possible): enumerate them instead. At most
    # 2 * m tuples are materialized here.
    if 2 * m >= n_possible:
        every_combo = list(combinations(range(N), k))
        if m >= n_possible:
            return every_combo
        return random.sample(every_combo, m)

    # sparse case: draw until m distinct index sets were collected
    res = set()
    while len(res) < m:
        res.add(tuple(sorted(random.sample(range(N), k))))
    return list(res)


# pool the predictions of k_images random cells from the same group, repeated
# for m_combos random index sets per group
k_images = 5
m_combos = 10000

# rows of the matrix: one per distinct combination of these columns
label_columns = ['cell_class', 'replicate']
# label_columns = ['cell_class']

# row label -> how often each class won the pooled prediction
conf_mat = defaultdict(lambda : np.zeros(np.max(y) + 1))

# attach the cross-validated class probabilities to a copy of the table
prob_columns = [f'prob{i}' for i in range(prob.shape[1])]
df_ = df.copy()
df_[prob_columns] = prob

for c, dfi in df_.groupby(label_columns):
    probs_i = dfi[prob_columns].values

    for s in sample_combinations(len(probs_i), k_images, m_combos):
        # sum the probability vectors of the sampled cells, most probable class wins
        pooled = probs_i[list(s)].sum(axis=0)
        conf_mat[tuple(c)][np.argmax(pooled)] += 1

# rows in sorted label order; the tick label also shows the number of pooled predictions
sorted_rows = sorted(conf_mat.items())
input_cls = [key + (f'N: {int(counts.sum())}' ,) for key, counts in sorted_rows]

# make matrix from dict, normalize per-row
mat = np.array([counts for _, counts in sorted_rows])
mat = mat / mat.sum(axis=1, keepdims=True)

# plot as heatmap: label group (rows) vs. pooled predicted class (columns)
class_ids = np.arange(np.max(y) + 1)
plt.figure(figsize=(12,8))
plt.imshow(mat, cmap='Blues', aspect=0.2)
plt.yticks(ticks=np.arange(len(input_cls)), labels=[', '.join(c) for c in input_cls]);
plt.xticks(ticks=class_ids, labels=le.inverse_transform(class_ids), rotation='vertical');


# fonttype 42 = TrueType, keeps text editable in exported PDFs
plt.rcParams['pdf.fonttype'] = 42
plt.tight_layout()

plt.colorbar(shrink=.8)

In [None]:
# per (cell_class, replicate): share of pooled predictions that hit the true class
for (cl, rep), labs in conf_mat.items():
    true_idx = le.transform([cl])[0]
    print(cl, rep, labs[true_idx] / labs.sum())

In [None]:
# (cell_class, replicate) -> validation score when exactly that replicate is held out
res = {}

replicates = [gr for gr, _ in df.groupby(group_columns)]
for i, replicate in enumerate(replicates):

    # use one replicate as val, rest as train
    is_val = (df[group_columns] == replicate).all(axis=1)
    df_val = df[is_val]
    df_train = df[~is_val]

#     print(len(df_train), len(df_val), len(df), df_train.cell_class.unique())

    # cell_class is target, encoded as numeric labels (encoder fitted on train only)
    le = LabelEncoder()
    y_train = le.fit_transform(df_train.cell_class)
    y_val = le.transform(df_val.cell_class)

    # features = everything except the auxiliary columns in columns_to_drop
    x_train = df_train.drop(columns=columns_to_drop).values
    x_val = df_val.drop(columns=columns_to_drop).values

    # impute NaNs and normalize; both transformers are fitted on the training
    # data only and then applied unchanged to the held-out replicate
    imputer = SimpleImputer()
    x_train = imputer.fit_transform(x_train)
    x_val = imputer.transform(x_val)

    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_val = sc.transform(x_val)

    # train new cls and val
    cls = clone(cls)
    cls.fit(x_train, y_train)
    score = cls.score(x_val, y_val)

    res[replicate] = score

    print(f'({i+1}/{len(replicates)}): {replicate}')

res

In [None]:
# per-replicate scores, in the order the replicates were evaluated
scores = list(res.values())

# number of cells in every replicate, same order as `replicates`
cells_per_group = df.groupby(group_columns).size()
repl_size = [cells_per_group.loc[replicate] for replicate in replicates]

# mean score weighted by replicate size (= overall fraction of correctly classified cells)
weights = np.array(repl_size)
(np.array(scores) * weights).sum() / weights.sum()