In [None]:
import glob
import os

import numpy as np
from matplotlib import pyplot as plt
from skimage.io import imread
from skimage.filters import threshold_otsu, gaussian
from skimage.transform import rescale
from skimage.morphology import remove_small_holes, remove_small_objects
try:
    # scikit-image >= 0.19 spells these functions with "gray"; alias them to the
    # names used throughout this notebook
    from skimage.feature import graycomatrix as greycomatrix, graycoprops as greycoprops
except ImportError:
    from skimage.feature import greycomatrix, greycoprops

# 1) Get features from already resaved TIFF files

In [None]:
# Folder with the re-saved TIFF files of one condition.
root = '/Users/david/Downloads/examples_tiff_300new_8bit/20201208_IMR90_3day/'
# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort them so that
# positional picks such as files[25] refer to the same image on every run.
files = sorted(glob.glob(os.path.join(root, '*.tif')))

In [None]:
def load_img_and_segment(image_path, blur_sigma=8, min_size=512):
    """Load an image and segment it by Otsu-thresholding a blurred copy.

    Parameters
    ----------
    image_path : str
        Path of the image file, read with ``imread``.
    blur_sigma : float, optional
        Sigma of the Gaussian blur applied before thresholding (default 8).
    min_size : int, optional
        Size threshold passed to both ``remove_small_objects`` and
        ``remove_small_holes`` when cleaning the mask (default 512).

    Returns
    -------
    img : ndarray
        The loaded image with singleton dimensions squeezed out.
    mask : ndarray
        Boolean mask: blurred image above its Otsu threshold, after cleaning.
    """
    img = imread(image_path).squeeze()

    # TODO: maybe downsample for speed?
    # img = rescale(img, 0.125, clip=False, preserve_range=True)

    # threshold the blurred image rather than the raw one
    blurred = gaussian(img, blur_sigma)
    mask = blurred > threshold_otsu(blurred)

    # a bit of binary cleaning - TODO: check sizes?
    mask = remove_small_objects(mask, min_size)
    mask = remove_small_holes(mask, min_size)

    return img, mask

# Segment one example image and show it side by side with its mask.
example_index = 25
img, mask = load_img_and_segment(files[example_index])
fig, axs = plt.subplots(ncols=2)
for axis, panel in zip(axs, (img, mask)):
    axis.imshow(panel)

In [None]:
def get_glcm_features(img, mask, distances=(2, 3, 5, 7, 9), angles=(0, np.pi/2), levels=256):
    """Compute GLCM texture properties of ``img`` restricted to ``mask``.

    Parameters
    ----------
    img : ndarray
        Integer-valued image; values must be smaller than ``levels``.
    mask : ndarray of bool
        True for pixels that should contribute to the co-occurrence matrix.
    distances : sequence of int, optional
        Pixel pair distances handed to ``greycomatrix``.
    angles : sequence of float, optional
        Pixel pair angles (radians) handed to ``greycomatrix``.
    levels : int, optional
        Number of grey levels in ``img`` (default 256, i.e. 8-bit data).

    Returns
    -------
    ndarray
        The ``greycoprops`` results for 'contrast', 'dissimilarity',
        'homogeneity', 'energy', 'correlation' and 'ASM' (in that order),
        stacked along a new first axis.
    """
    props = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']

    # make input for masked GLCM:
    # 1) set bg to zero
    # 2) set everything else +1
    # (NB: should not be necessary if mask comes from threshold, but let's keep it anyway)
    # astype() already returns a new array, so the caller's image is left untouched
    img_for_masked_glcm = img.astype(np.uint16)
    img_for_masked_glcm[~mask] = 0
    img_for_masked_glcm[mask] += 1

    # one extra level because every foreground value was shifted up by one;
    # then ignore first row & column (co-occurrence with 0 := background)
    glcm = greycomatrix(img_for_masked_glcm, distances, angles, levels + 1)
    glcm = glcm[1:, 1:]

    return np.stack([greycoprops(glcm, prop=p) for p in props])

# Quick check on the example image: shape of the stacked GLCM feature array
get_glcm_features(img, mask).shape
# plt.imshow(img_for_masked_glcm)
# np.sum(glcm), np.prod(img.shape)
# np.sum(glcm), np.sum(mask)

In [None]:
# root = '/Users/david/Downloads/examples_tiff_300new_8bit/20201208_IMR90_3day/'
# One folder per condition; comment entries in or out to change the selection.
condition_dirs = [
    '/Users/david/Downloads/examples_tiff_300new_8bit/20201208_IMR90_3day/',
    '/Users/david/Downloads/examples_tiff_300new_8bit/20201214_IMR90_9day/',
    # '/Users/david/Downloads/examples_tiff_300new_8bit/2020622_IMR90_untreated_old/',
]

# glob.glob() order is arbitrary; sort within each folder for a reproducible file order
files = []
for condition_dir in condition_dirs:
    files += sorted(glob.glob(os.path.join(condition_dir, '*.tif')))

In [None]:
import tqdm
from concurrent.futures import ThreadPoolExecutor

def analysis(img_path):
    """Segment the image at ``img_path`` and return its masked GLCM features."""
    image, foreground = load_img_and_segment(img_path)
    # use all-ones mask to disable masking
    # foreground = np.ones(image.shape, dtype=bool)
    return get_glcm_features(image, foreground)

# Run the analysis for every file on a thread pool and collect
# (path, features) pairs in the order of `files`.
res = []
with ThreadPoolExecutor() as tpe:
    pending = [tpe.submit(analysis, path) for path in files]
    progress = tqdm.tqdm(zip(pending, files), total=len(files))
    for future, path in progress:
        res.append((path, future.result()))

In [None]:
# One flattened feature vector per file.
feats = np.array([f for _, f in res]).reshape((len(files), -1))
# Derive labels with os.path helpers instead of splitting on os.sep: the paths are
# built from forward-slash roots, which os.sep does not match on Windows.
# condition := name of the parent folder
conditions = [os.path.basename(os.path.dirname(f)) for f in files]
# replicate := second '_'-separated token of the file name
replicates = [os.path.basename(f).split('_')[1] for f in files]

# ALTERNATIVE: load saved GLCM values from analysis directly from h5

The tables are generated by `glcm_from_h5.ipynb`.

In [None]:
import pandas as pd
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer as Imputer

df = pd.read_csv('C:/Users/hoerl/Downloads/20210419_glcm_all_extrafeats_selected50intensity.csv')
# condition := third-from-last component of the '/'-separated file path
df['condition'] = [f.split('/')[-3] for f in df.filename]

# only select a subset of conditions
# TODO: do selection after good/bad cls?

# selected_conditions = ['2020705_IMR90_young_untreated', '2020629_IMR90_6d_ICM_young',
#                     '2020625_IMR90_3d_ICM_young', '2020622_IMR90_untreated_old',
#                     '20201214_IMR90_9day', '2020702_IMR90_9d_ICM_young', '20201208_IMR90_3day',
#                       '20210326_IMR90_young_untr', '20210402_IMR90_old']

# selected_conditions = ['2020705_IMR90_young_untreated', '2020629_IMR90_6d_ICM_young',
#                     '2020625_IMR90_3d_ICM_young', '2020622_IMR90_untreated_old',
#                      '2020702_IMR90_9d_ICM_young', '20210326_IMR90_young_untr', '20210402_IMR90_old']

# selected_conditions = ['2020705_IMR90_young_untreated', '2020622_IMR90_untreated_old',]

selected_conditions = ['20210326_IMR90_young_untr', '20210402_IMR90_old','2020705_IMR90_young_untreated', '2020622_IMR90_untreated_old']

# selected_conditions = ['2020622_IMR90_untreated_old', '20201214_IMR90_9day','20201208_IMR90_3day']

# selected_conditions = ['20201214_IMR90_9day','20201208_IMR90_3day']

# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger a SettingWithCopyWarning
df = df[df.condition.isin(selected_conditions)].copy()
# collapse the selected conditions into two classes
df['condition'] = df.condition.apply(lambda c: 'old' if 'old' in c else 'young')

# Everything that is not a numeric feature. The classification columns only exist in
# some tables, hence errors='ignore'. `columns=` is used because the positional axis
# argument of DataFrame.drop was removed in pandas 2.0.
non_feature_columns = ['filename', 'dataset_name', 'condition', 'classification_manual', 'classification_auto']
feats = df.drop(columns=non_feature_columns, errors='ignore').values

# per-row labels taken from the file path (original condition folder names, not old/young)
conditions = [f.split('/')[-3] for f in df.filename]

replicates = [f.split('/')[-2] for f in df.filename]

# we have some NaNs, impute them
feats = Imputer().fit_transform(feats)

In [None]:
# np.unique(replicates)
# df.columns

## Optional: sort images into good/bad by features

In [None]:
import json
# Load the manual sorting from JSON. The lookup code below reads the keys 'good' and 'bad'
# and tests membership of [filename, dataset_name] pairs in them.
with open('C:/Users/hoerl/Downloads/sorting20210316.json', 'r') as fd:
    sorting_dict = json.load(fd)
sorting_dict

def get_classification_from_dict(row, sorting_dict):
    """Return the manual label of ``row``: 'good', 'bad' or 'unclassified'.

    The lookup key is the pair [base name of ``row.filename`` with every '.h5'
    removed, ``row.dataset_name``]; it is searched in ``sorting_dict['good']``
    first and then in ``sorting_dict['bad']``.
    """
    key = [os.path.basename(row.filename).replace('.h5', ''), row.dataset_name]
    for label in ('good', 'bad'):
        if key in sorting_dict[label]:
            return label
    return 'unclassified'

# Row-wise lookup of the manual label, stored as a new column.
df['classification_manual'] = df.apply(get_classification_from_dict, axis=1, args=(sorting_dict,))

In [None]:
# Boolean array: True for rows that carry a manual 'good' or 'bad' label.
has_goodbad = df.classification_manual.isin(['good', 'bad']).values

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Standardise all features and integer-encode the condition labels.
feature_scaler = StandardScaler()
Xs = feature_scaler.fit_transform(feats)
condition_encoder = LabelEncoder()
ys = condition_encoder.fit_transform(conditions)

# Subset with a manual good/bad label; le_goodbad is kept so that predictions
# can be mapped back to the label strings later.
goodbad_labels = df.classification_manual.values[has_goodbad]
Xs_goodbad = Xs[has_goodbad]
le_goodbad = LabelEncoder()
ys_goodbad = le_goodbad.fit_transform(goodbad_labels)
# ys = LabelEncoder().fit_transform([a+b for a,b in zip(conditions,replicates)])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# model = LogisticRegression(max_iter=1000)
# model = SVC()
# model = AdaBoostClassifier()
# model = RandomForestClassifier()
# fixed seed: tree construction is randomised, so without it the score changes between runs
model = DecisionTreeClassifier(random_state=0)

# mean cross-validated score of the good/bad classifier
np.mean(cross_val_score(model, Xs_goodbad, ys_goodbad))
# np.mean(cross_val_score(model, pca.transform(Xs)[:,:20], ys))

In [None]:
# Train on the manually labelled rows, then label every row of the table.
# fixed seed so that the automatic labels are the same on every run
model_goodbad = DecisionTreeClassifier(random_state=0)
model_goodbad.fit(Xs_goodbad, ys_goodbad)
# map the integer predictions back to the 'good'/'bad' strings
df['classification_auto'] = le_goodbad.inverse_transform(model_goodbad.predict(Xs))

In [None]:
# look at the number of good/bad images per condition
df.groupby('condition')['classification_auto'].describe()

In [None]:
# summary statistics of the perc_low column per condition
df.groupby('condition')['perc_low'].describe()

In [None]:
# save table with classification (e.g. for plotting of good/bad images)
# df.to_csv('/scratch/hoerl/auto_sir_dna_comp/20210316_glcm_all_plusolddata_extrafeats_with_classification.csv')

## CV classifier on images

In [None]:
feats_to_drop = ['img_height', 'img_width',# 'intensity_mu', # maybe confounding
                 'intensity_sigma',# 'mask_area', 'perc_high', 'perc_low',
                 'classification_manual', 'classification_auto', # classification-related
                 'filename', 'dataset_name', 'condition' ]

# `columns=` instead of a positional axis: DataFrame.drop(labels, 1) no longer works in pandas 2.0
feats = df.drop(columns=feats_to_drop).values
feats = Imputer().fit_transform(feats)

# plain boolean array, computed once and used for both X and y so the rows stay aligned
is_good = (df.classification_auto == 'good').values

# classify conditions on the automatically accepted ('good') images only
Xs = StandardScaler().fit_transform(feats[is_good])
ys = LabelEncoder().fit_transform(df.condition)[is_good]

# from sklearn.preprocessing import PolynomialFeatures
# Xs_p = PolynomialFeatures(2).fit_transform(Xs) # NOTE: using this is super slow -> PCA before cls?

In [None]:
# names of the columns that are used as features
# (`columns=` because the positional axis argument of DataFrame.drop was removed in pandas 2.0)
df.drop(columns=feats_to_drop).columns

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA on the standardised features and plot the explained variance ratio per component.
pca = PCA()
pca.fit(Xs)
plt.plot(pca.explained_variance_ratio_)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.model_selection import cross_val_score

model = LogisticRegression(max_iter=1000)
# model = SVC()
# model = AdaBoostClassifier()
# model = RandomForestClassifier()

# mean cross-validated score of the condition classifier
cv_scores = cross_val_score(model, Xs, ys)
cv_scores.mean()
# np.mean(cross_val_score(model, pca.transform(Xs)[:,:100], ys))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

min_features_to_select = 1  # Minimum number of features to consider
# recursive feature elimination, scored by 2-fold stratified cross-validation
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2),
              scoring='accuracy',
              min_features_to_select=min_features_to_select)
rfecv.fit(Xs, ys)

print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `cv_results_` replaces it. Fall back to the old attribute on older versions.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
# scoring='accuracy' is a fraction, not a count of correct classifications
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(min_features_to_select,
               len(mean_cv_scores) + min_features_to_select),
         mean_cv_scores)
plt.show()

In [None]:
feats = df.drop(feats_to_drop, 1).values[:,rfecv.support_]
Xs = StandardScaler().fit_transform(feats)

# tSNE visualization

In [None]:
# also select only good from condition labels
# only necessary for plot
conditions = [f.split('/')[-3] for f in df[df.classification_auto == 'good'].filename]
replicates = [f.split('/')[-2] for f in df[df.classification_auto == 'good'].filename]
conditions = df.condition
np.unique(replicates)

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA, FactorAnalysis
import seaborn as sns

# fixed seed: t-SNE is stochastic, without it the embedding differs on every run
tsne = TSNE(perplexity=30, random_state=0)

# p = tsne.fit_transform(pca.transform(Xs)[:,:20])
p = tsne.fit_transform(Xs)

# 2D embedding, coloured by condition
plt.figure(figsize=(12,12))
sns.scatterplot(x=p.T[0], y=p.T[1], hue=conditions, alpha=1, s=60, palette=sns.color_palette('husl', len(np.unique(conditions))))

In [None]:
# Scatter plot of the embedding `p`: colour encodes the condition, marker style the replicate.
condition_palette = sns.color_palette('hls', len(np.unique(conditions)))
plt.figure(figsize=(12,12))
sns.scatterplot(x=p[:, 0], y=p[:, 1], hue=conditions, style=replicates, s=60, palette=condition_palette)

## ROC curve

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Hold out 20% of the samples. stratify keeps both classes present in the test set
# (needed for a meaningful ROC curve); the fixed seed makes the curve reproducible.
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=0.2, stratify=ys, random_state=0)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# ROC from the continuous decision scores on the held-out samples
fpr, tpr, thres = roc_curve(y_test, model.decision_function(X_test))
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
auc(fpr, tpr)

# np.round(model.predict_proba(X_test), 3)