In [None]:
import os
from collections import defaultdict

import numpy as np
from scipy.stats import spearmanr, kendalltau, pearsonr
from scipy.cluster import hierarchy
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize, LabelEncoder, robust_scale
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

In [None]:
# util functions for ellipses around clusters
# modified from calmutils.localization.util

def get_ellipse_params(cov, n_sdev=1):
    r"""
    Get matplotlib ellipse parameters for the n-s.d. contour of a 2d-Gaussian.

    Parameters
    ----------
    cov: array-like
        2x2 covariance matrix (symmetric)
    n_sdev: float
        number of s.d.s at which to draw ellipse

    Returns
    -------
    a: float
        full length of the major axis (use as the Ellipse width)
    b: float
        full length of the minor axis (use as the Ellipse height)
    alpha: float \in [-180, 180]
        counterclockwise rotation of the major axis relative to the
        x-axis, in degrees
    """
    # a covariance matrix is symmetric: eigh guarantees real eigenvalues and
    # orthonormal eigenvectors, which the general-purpose eig does not
    w, v = np.linalg.eigh(np.asarray(cov, dtype=float))

    # eigh sorts ascending; put the major axis first so that (a, b, alpha)
    # is deterministic for a given covariance
    order = np.argsort(w)[::-1]
    w, v = w[order], v[:, order]

    # (near-)degenerate point clouds can produce tiny negative eigenvalues
    # through round-off, which would turn into NaN under the square root
    w = np.clip(w, 0, None)

    # factor 2: matplotlib expects full axis lengths, not semi-axes
    lens = 2 * n_sdev * np.sqrt(w)
    alpha = _deg_angle(v[:, 0])
    return (float(lens[0]), float(lens[1]), float(alpha))


def _deg_angle(a):
    """
    angle between vector a and x-axis, in degrees
    """
    return np.degrees(np.arctan2(a[1], a[0]))

In [None]:
# load CellProfiler output: one CSV with per-object measurements (obj_file)
# and one CSV with per-image metadata (img_file)

# object tables used previously, kept for reference:
# obj_file = '/Users/david/Desktop/ageing_examples_tiff/MyExpt_IdentifyPrimaryObjects.csv'
# obj_file = '/Users/david/Desktop/examples_tiff_50intensity_8bit/MyExpt_FULLSize_ALL_IdentifyPrimaryObjects.csv'
# obj_file = '/Users/david/Downloads/examples_tiff_300new_8bit/run2/MyExpt_FULLSize_ALL_IdentifyPrimaryObjects.csv'

# image tables used previously, kept for reference:
# img_file = '/Users/david/Desktop/ageing_examples_tiff/MyExpt_Image.csv'
# img_file = '/Users/david/Desktop/examples_tiff_50intensity_8bit/MyExpt_FULLSize_ALL_Image.csv'
# img_file = '/Users/david/Downloads/examples_tiff_300new_8bit/run2/MyExpt_FULLSize_ALL_Image.csv'

# NOTE: absolute, machine-specific paths -- adjust before running elsewhere
obj_file = '/Users/hoerl/Desktop/examples_tiff_50intensity_8bit/MyExpt_FULLSize_ALL_IdentifyPrimaryObjects.csv'
img_file = '/Users/hoerl/Desktop/examples_tiff_50intensity_8bit/MyExpt_FULLSize_ALL_Image.csv'

obj_df, img_df = (pd.read_csv(csv_path) for csv_path in (obj_file, img_file))

In [None]:
# fix win pathnames: turn backslashes into forward slashes so that the
# '/'-based splitting of PathName_DNA used for the class names also works for
# tables exported on Windows.
# The pandas string accessor is used instead of np.vectorize because it also
# copes with an empty table and with missing values.
img_df['PathName_DNA'] = img_df['PathName_DNA'].str.replace('\\', '/', regex=False)

# list the available measurement columns (last expression -> shown as cell output)
obj_df.columns

In [None]:
# get features from object df:
# four measurement families are taken completely, AreaShape is taken without
# the NormalizedMoment / EulerNumber columns
feat_names = [
    c for c in obj_df.columns
    if c.startswith(('RadialDistribution', 'Intensity', 'Granularity', 'Texture'))
    or (c.startswith('AreaShape') and 'NormalizedMoment' not in c and 'EulerNumber' not in c)
]
#feat_names = [c for c in obj_df.columns if c.startswith('Texture')]

# scale features
# (robust_scale first is probably not necessary, but prevents warning in normal scale)
tex_values = scale(robust_scale(obj_df[feat_names].values))

# positional row in img_df for every object (ImageNumber - 1)
img_rows = obj_df.ImageNumber.values - 1

# get image path/name from img_df
#rep_id = [p.rsplit('/')[-1][:4] for p in img_df.URL_DNA.values[obj_df.ImageNumber.values - 1]]
# replicate id: second '_'-separated token of the image file name
rep_id = [url.rsplit('/')[-1].split('_')[1] for url in img_df.URL_DNA.values[img_rows]]
# per-object label '<last folder of PathName_DNA>_<replicate id>'
path_names = [
    folder.rsplit('/')[-1] + '_' + rep
    for folder, rep in zip(img_df.PathName_DNA.values[img_rows], rep_id)
]
# TODO: make sure this works with win and unix paths
path_names_all = path_names
# get number of objects for each class (label without the replicate suffix)
np.unique([p.rsplit('_', 1)[0] for p in path_names], return_counts=True)
#path_names

In [None]:
# classes (folder names) to analyse; selections used previously:
# selected_classes = ['2020622_IMR90_untreated_old', '2020625_IMR90_3d_ICM_young', '2020629_IMR90_6d_ICM_young', '2020702_IMR90_9d_ICM_young', '2020705_IMR90_young_untreated']
# selected_classes = ['2020622_IMR90_untreated_old', '2020705_IMR90_young_untreated']
# selected_classes = ['2020622_IMR90_untreated_old', '20201208_IMR90_3day', '20201214_IMR90_9day']
# selected_classes = ['20201208_IMR90_3day', '20201214_IMR90_9day']
selected_classes = ['2020622_IMR90_untreated_old', '2020705_IMR90_young_untreated', '20210326_IMR90_young_untr', '20210402_IMR90_old']

# NOTE: this cell overwrites tex_values / path_names in place, so the feature
# extraction cell has to be re-run before this one is run a second time

# keep only the objects whose class is selected
in_selection = np.array([p.rsplit('_', 1)[0] in selected_classes for p in path_names], dtype=bool)
tex_values = tex_values[in_selection]
path_names = [p for p, keep in zip(path_names, in_selection) if keep]

# get train/test split
tex_values, tex_values_test, path_names, path_names_test = train_test_split(
    tex_values, path_names, test_size=0.2, random_state=1337
)

# order both subsets by label so that replicates form contiguous row blocks
train_order = np.argsort(path_names)
tex_values = tex_values[train_order]
path_names = np.sort(path_names)

test_order = np.argsort(path_names_test)
tex_values_test = tex_values_test[test_order]
path_names_test = np.sort(path_names_test)

# TODO: re-sort
obj_df.columns.size

In [None]:
# check feature correlation
# following https://scikit-learn.org/dev/auto_examples/inspection/plot_permutation_importance_multicollinear.html

# Spearman rank correlation between all feature columns, then Ward linkage
# computed on the rows of the correlation matrix
corr = spearmanr(tex_values).correlation
corr_linkage = hierarchy.ward(corr)

# set to True to show the dendrogram next to the re-ordered correlation matrix
do_plot = False
if do_plot:
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 12))
    dendro = hierarchy.dendrogram(corr_linkage, labels=feat_names, ax=ax1, leaf_rotation=90)
    leaf_order = dendro['leaves']
    leaf_labels = dendro['ivl']
    dendro_idx = np.arange(len(leaf_labels))

    # ax1.set_ylim(0,70)
    # ax1.axhline(1)

    # correlation matrix with rows and columns in dendrogram leaf order
    ax2.imshow(corr[np.ix_(leaf_order, leaf_order)])
    ax2.set_xticks(dendro_idx)
    ax2.set_yticks(dendro_idx)
    ax2.set_xticklabels(leaf_labels, rotation='vertical')
    ax2.set_yticklabels(leaf_labels)
    fig.tight_layout()
    plt.show()

In [None]:
# keep only one feature per highly correlated group

# linkage distance at which the dendrogram is cut into flat clusters
group_thresh = 2

cluster_ids = hierarchy.fcluster(corr_linkage, t=group_thresh, criterion='distance')

# cluster id -> member feature indices / member feature names
cluster_id_to_feature_ids = defaultdict(list)
cluster_id_to_feature_names = defaultdict(list)
for idx, (cluster_id, feat_name) in enumerate(zip(cluster_ids, feat_names)):
    cluster_id_to_feature_ids[cluster_id].append(idx)
    cluster_id_to_feature_names[cluster_id].append(feat_name)

# the first member (lowest column index) represents its cluster
selected_features = [members[0] for members in cluster_id_to_feature_ids.values()]
len(selected_features)

In [None]:
# show which feature names were grouped into each correlation cluster
cluster_id_to_feature_names

In [None]:
# reduce feature names and the train/test matrices to the cluster representatives
feat_names_selected = [feat_names[col] for col in selected_features]
tex_values_selected, tex_values_selected_test = (
    values[:, selected_features] for values in (tex_values, tex_values_test)
)


In [None]:
# discard features that are more important for replicate differentiation
# compared to class differentiation

# class label = path name without the replicate suffix; replicate label = full path name
ys_betweenclass = LabelEncoder().fit_transform([p.rsplit('_', 1)[0] for p in path_names])
ys_inclass = LabelEncoder().fit_transform(path_names)
tex_values_filtered = tex_values_selected
feat_names_filtered = feat_names_selected

# how important the feature has to be for between-class
min_importance = 0.002

# how much more important does it have to be between-class vs. in class
min_importance_diff = 0.00

# one classifier separates classes, the other separates replicates
# (RandomForestClassifier / SVC were tried too but give feature importance == 0 mostly)
cls_between = LogisticRegression(max_iter=1000)
cls_between.fit(tex_values_filtered, ys_betweenclass)

cls_inclass = LogisticRegression(max_iter=1000)
cls_inclass.fit(tex_values_filtered, ys_inclass)

# permutation importance shuffles columns at random: fix the seed (same value
# as used for the train/test split) so the selected feature set is reproducible
res_between = permutation_importance(cls_between, tex_values_filtered, ys_betweenclass,
                                     n_repeats=100, random_state=1337)
res_inclass = permutation_importance(cls_inclass, tex_values_filtered, ys_inclass,
                                     n_repeats=100, random_state=1337)

# how much more important is the feature between vs in-class
importance_diff = res_between.importances_mean - res_inclass.importances_mean
sel = (res_between.importances_mean >= min_importance) & (importance_diff >= min_importance_diff)

feat_names_filtered = [name for name, keep in zip(feat_names_filtered, sel) if keep]
tex_values_filtered = tex_values_filtered[:, sel]

feat_names_filtered

In [None]:
#fig, axs = plt.subplots(nrows=2)

# per feature: largest absolute logistic-regression coefficient over all rows of coef_
coef_mag_between = np.abs(cls_between.coef_).max(axis=0)
coef_mag_inclass = np.abs(cls_inclass.coef_).max(axis=0)

plt.plot(coef_mag_between)
plt.plot(coef_mag_inclass)

# feature indices by decreasing between-class coefficient magnitude
imp_ = np.argsort(-coef_mag_between)
# names of the ten strongest features, in alphabetical order
list(np.sort(np.array(feat_names_selected)[imp_][:10]))

In [None]:
# heatmap of features (rows: objects labelled by path name, columns: features)

feat_names_for_plot, tex_values_for_plot = feat_names_filtered, tex_values_filtered

fig, ax = plt.subplots(figsize=(20, 20))
ax.imshow(tex_values_for_plot, cmap='RdBu')

# one tick per feature column / per object row
feature_ticks = np.arange(len(feat_names_for_plot))
object_ticks = np.arange(len(path_names))

ax.set_xticks(feature_ticks)
_ = ax.set_xticklabels(feat_names_for_plot, rotation=90)
ax.set_yticks(object_ticks)
_ = ax.set_yticklabels(path_names)

In [None]:

# train three classifiers on the de-correlated feature set and report their
# accuracy on the held-out test split

tex_values_for_cls = tex_values_selected
tex_values_for_cls_test  = tex_values_selected_test

# class label = path name without the replicate suffix
le = LabelEncoder()
ys = le.fit_transform([p.rsplit('_', 1)[0] for p in path_names])
ys_test = le.transform([p.rsplit('_', 1)[0] for p in path_names_test])

# RandomForest and LinearSVC are randomised: fix their seed (same value as
# used for the train/test split) so accuracies and the feature ranking
# derived from these models are reproducible
cls_logistic = LogisticRegression(max_iter=1000)
cls_rf = RandomForestClassifier(n_estimators=100, random_state=1337)
cls_linsvc = LinearSVC(max_iter=20000, random_state=1337)

for name, clf in (('Logistic', cls_logistic), ('RF', cls_rf), ('LinSVC', cls_linsvc)):
    clf.fit(tex_values_for_cls, ys)
    print('{} acc:'.format(name), np.mean(clf.predict(tex_values_for_cls_test) == ys_test))

In [None]:
from scipy.stats import rankdata

# number of top-ranked features to keep
n_top_features = 25

# per-feature importance of each model: L2 norm of the coefficients over all
# rows of coef_ for the linear models, Gini importance for the random forest
imp_logistic = np.linalg.norm(cls_logistic.coef_, axis=0)
imp_linsvc = np.linalg.norm(cls_linsvc.coef_, axis=0)
imp_rf = cls_rf.feature_importances_

# importances normalised to their maximum so the three curves are comparable
plt.figure(figsize=(10,10))
plt.plot(imp_logistic / np.max(imp_logistic), label='Logistic Regression Coeff')
plt.plot(imp_rf / np.max(imp_rf), label='RF Gini importance')
plt.plot(imp_linsvc / np.max(imp_linsvc),  label='Linear SVC Coeff')
plt.xticks(np.arange(len(feat_names_selected)), feat_names_selected, rotation=90)
plt.legend()

# average rank over the three models (rank 1 = most important)
mean_rank = np.mean([rankdata(-imp_logistic), rankdata(-imp_linsvc), rankdata(-imp_rf)], axis=0)

filtered_index = np.argsort(mean_rank)[:n_top_features]

# (feature name, column index, correlated cluster(s) containing the feature);
# membership is tested against the cluster's name list -- a substring test on
# the first name could also match clusters with a merely similar feature name
top_feature_clusters = [
    (feat_names_selected[r], r,
     [members for members in cluster_id_to_feature_names.values() if feat_names_selected[r] in members])
    for r in filtered_index
]

[feat_names_selected[r] for r in filtered_index]

In [None]:
# one swarm plot per top-ranked feature: raw (unscaled) values from obj_df for
# all objects, grouped by class and coloured by replicate

#     class_order = ['2020705_IMR90_young_untreated','2020625_IMR90_3d_ICM_young',
#                    '2020629_IMR90_6d_ICM_young', '2020702_IMR90_9d_ICM_young', '2020622_IMR90_untreated_old']
# NOTE(review): two of these classes are not part of the selected_classes list
# used for the feature ranking -- confirm this is the intended set to plot
class_order = ['20201208_IMR90_3day', '20201214_IMR90_9day', '2020622_IMR90_untreated_old']

# class / replicate part of every object's label (split at the last underscore)
class_labels_all = [p.rsplit('_', 1)[0] for p in path_names_all]
replicate_labels_all = [p.rsplit('_', 1)[1] for p in path_names_all]

top_feature_names = [feat_names_selected[r] for r in filtered_index][:25]
for selected_feature in top_feature_names:
    plt.figure()
#     print([v for v in cluster_id_to_feature_names.values() if selected_feature in v])
    sns.swarmplot(x=obj_df[selected_feature], y=class_labels_all, order=class_order, hue=replicate_labels_all)

In [None]:
# restrict names and train/test matrices to the top-ranked features
# (this replaces the earlier replicate-vs-class filtered set of the same name)
feat_names_filtered = [feat_names_selected[col] for col in filtered_index]
tex_values_filtered, tex_values_filtered_test = (
    values[:, filtered_index] for values in (tex_values_selected, tex_values_selected_test)
)
tex_values_filtered.shape

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# re-train the three classifiers on the top-ranked features only and report
# accuracy plus per-class precision / recall / F1 on the held-out test split

tex_values_for_cls = tex_values_filtered
tex_values_for_cls_test  = tex_values_filtered_test

# class label = path name without the replicate suffix
le = LabelEncoder()
ys = le.fit_transform([p.rsplit('_', 1)[0] for p in path_names])
ys_test = le.transform([p.rsplit('_', 1)[0] for p in path_names_test])

# RandomForest and LinearSVC are randomised: fix their seed (same value as
# used for the train/test split) so the reported scores are reproducible
cls_logistic = LogisticRegression(max_iter=1000)
cls_rf = RandomForestClassifier(n_estimators=100, random_state=1337)
cls_linsvc = LinearSVC(max_iter=10000, random_state=1337)

classifiers = (cls_logistic, cls_rf, cls_linsvc)
for clf in classifiers:
    clf.fit(tex_values_for_cls, ys)

# predict once per model and reuse the predictions for every score below
predictions_test = [clf.predict(tex_values_for_cls_test) for clf in classifiers]

for name, pred in zip(('Logistic', 'RF', 'LinSVC'), predictions_test):
    print('{} acc:'.format(name), np.mean(pred == ys_test))

# per-class scores (average=None); one line per model in the order
# logistic regression, random forest, linear SVC
for title, metric in (('precision', precision_score), ('recall', recall_score), ('F1', f1_score)):
    print(title)
    for pred in predictions_test:
        print(metric(ys_test, pred, average=None))

# le.inverse_transform(np.arange(5))

In [None]:
# 2D embedding (PCA or t-SNE) of the top-ranked features, one point per object,
# coloured by class and styled by replicate

tex_values_for_tsne = tex_values_filtered
# tex_values_for_tsne = tex_values_selected

# which embedding to compute: 'PCA' or 't-SNE'
embedding_method = 'PCA'
# random seed, only used by t-SNE; shown as cell output so a run can be repeated
seed = np.random.randint(0, 65000)

if embedding_method == 't-SNE':
    # TSNE's iteration-count argument is called `max_iter` in recent
    # scikit-learn releases and `n_iter` in older ones: use whichever exists
    tsne_iter_kw = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
    ts = TSNE(perplexity=30, random_state=seed, **{tsne_iter_kw: 3000}).fit_transform(tex_values_for_tsne)
    axis_label = 't-SNE comp. {}'
else:
    ts = PCA(n_components=2).fit_transform(tex_values_for_tsne)
    axis_label = 'PC {}'

le = LabelEncoder()
ys_num = le.fit_transform(path_names)

# class / replicate part of every object's label (split at the last underscore)
class_labels = [p.rsplit('_', 1)[0] for p in path_names]
replicate_labels = [p.rsplit('_', 1)[-1] for p in path_names]
class_names = list(sorted(np.unique(class_labels)))

# one colour per class
cmap = sns.color_palette("husl", len(class_names))

fig, ax = plt.subplots(figsize=(8,8))
#sns.scatterplot(x=ts.T[0], y=ts.T[1], hue=[p.rsplit('_', 1)[0] for p in path_names], style=[p.rsplit('_', 1)[-1] for p in path_names], )
sns.scatterplot(x=ts.T[0], y=ts.T[1],
                hue=class_labels, hue_order=class_names, style=replicate_labels,
                s=75, palette=cmap, ax=ax)
# label the axes after the embedding that was actually computed
ax.set_xlabel(axis_label.format(1))
ax.set_ylabel(axis_label.format(2))

draw_ellipses = True

if draw_ellipses:
    # draw a 1-s.d. ellipse around each replicate, in the colour of its class
    for yi in sorted(np.unique(path_names), key=lambda x: x.split('_' , 1)[-1]):

        color_idx = class_names.index(yi.rsplit('_', 1)[0])

        t = ts[[p == yi for p in path_names]]

        # do not draw for replicates with single observation
        # would error on ellipse parameter determination
        if t.shape[0] < 2:
            continue

        a, b, alpha = get_ellipse_params(np.cov(t, rowvar=False), 1)
        # angle has to be passed by keyword: it is keyword-only in current matplotlib
        ell = Ellipse(np.mean(t, axis=0), a, b, angle=alpha, color=cmap[color_idx], fill=None, ls='dashed', lw=1.5)
        ax.add_artist(ell)

seed

In [None]:
# t-SNE 2D embedding with buildup: the embedding is computed once for all
# objects, then one figure is drawn per growing subset of classes

tex_values_for_tsne = tex_values_filtered
# seed = np.random.randint(0, 65000)
seed = 1747

# TSNE's iteration-count argument is called `max_iter` in recent scikit-learn
# releases and `n_iter` in older ones: use whichever this installation offers
tsne_iter_kw = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
ts = TSNE(perplexity=30, random_state=seed, **{tsne_iter_kw: 3000}).fit_transform(tex_values_for_tsne)
le = LabelEncoder()
ys_num = le.fit_transform(path_names)

# class part of every object's label (split at the last underscore)
class_labels = [p.rsplit('_', 1)[0] for p in path_names]
paths_ordered = list(sorted(np.unique(class_labels)))

# one colour per class, indexed like paths_ordered
cmap = sns.color_palette("husl", len(paths_ordered))

# store text as Type 42 (TrueType) fonts in saved PDFs
plt.rcParams['pdf.fonttype'] = 42

draw_ellipses = False

# indices into paths_ordered shown in each figure
# NOTE(review): written for five classes; indices beyond the number of classes
# present are silently ignored -- confirm against the current selected_classes
subsets_to_plot = [[0,4], [0,1,4], [0,1,2,4], [0,1,2,3,4]]
for k, subset in enumerate(subsets_to_plot):
    fig, ax = plt.subplots(figsize=(8,8))

    # classes, objects, embedding rows and colours belonging to this subset
    paths_ordered_k = [p for ki, p in enumerate(paths_ordered) if ki in subset]
    in_subset = np.array([c in paths_ordered_k for c in class_labels], dtype=bool)
    path_names_k = [p for p, keep in zip(path_names, in_subset) if keep]
    t = ts[in_subset]
    cmap_k = [c for ki, c in enumerate(cmap) if ki in subset]

    #sns.scatterplot(x=ts.T[0], y=ts.T[1], hue=[p.rsplit('_', 1)[0] for p in path_names], style=[p.rsplit('_', 1)[-1] for p in path_names], )
    sns.scatterplot(x=t.T[0], y=t.T[1],
                    hue=[p.rsplit('_', 1)[0] for p in path_names_k], hue_order=paths_ordered_k, style=[p.rsplit('_', 1)[-1] for p in path_names_k],
                    s=75, palette=cmap_k, ax=ax)

    ax.set_xlabel('t-SNE comp. 1')
    ax.set_ylabel('t-SNE comp. 2')

    if draw_ellipses:
        # draw a 1-s.d. ellipse around each replicate shown in this figure
        # (only replicates of the current subset, in the colour of their class)
        for yi in sorted(np.unique(path_names_k), key=lambda x: x.split('_' , 1)[-1]):

            color_idx = paths_ordered.index(yi.rsplit('_', 1)[0])

            # separate name so the subset's points in `t` are not overwritten
            pts = ts[[p == yi for p in path_names]]

            # do not draw for replicates with single observation
            # would error on ellipse parameter determination
            if pts.shape[0] < 2:
                continue

            a, b, alpha = get_ellipse_params(np.cov(pts, rowvar=False), 1)
            # angle has to be passed by keyword: it is keyword-only in current matplotlib
            ell = Ellipse(np.mean(pts, axis=0), a, b, angle=alpha, color=cmap[color_idx], fill=None, ls='dashed', lw=1.5)
            ax.add_artist(ell)

#     plt.savefig('C:/Users/david/Desktop/tsne{}.pdf'.format(k))

seed

In [None]:
# unique object labels, ordered by everything after their first underscore
# (the same ordering the ellipse-drawing loops iterate in)
sorted(np.unique(path_names), key=lambda x: x.split('_' , 1)[-1])

In [None]:
# heatmap of pca of features
# (rows: objects labelled by path name, columns: leading principal components)

tex_values_for_pca = tex_values_selected

# number of leading components to display
n_comps = 16

# note: `pca` holds the transformed data, not the fitted PCA estimator
pca = PCA().fit_transform(tex_values_for_pca)

fig, ax = plt.subplots(figsize=(20, 20))
ax.imshow(pca[:, :n_comps], cmap='seismic')

component_ticks = np.arange(n_comps)
ax.set_xticks(component_ticks)
_ = ax.set_xticklabels(['PC_{}'.format(i) for i in component_ticks], rotation=90)
ax.set_yticks(np.arange(len(path_names)))
_ = ax.set_yticklabels(path_names)