# generate subset of DNA texture files

In [None]:
# copy msrs

# Glob pattern for the input directories; every match is handled as one
# directory by the loops in this notebook.  `r` is a short alias of it.
root_dir = '/data/cooperation_data/Preliminary_projects/AgyrisPapantonis_ChromatinTexture_AgeingCells/auto_sir_ageing-cells/*'
r = root_dir
# Root directory that receives the example files.
out = '/scratch/hoerl/dna_sir_ageing/examples'

import random
import shutil
import os
import glob

# Pick up to `n_examples` random '*detail*' files from every directory matched
# by `r`.  The copy step itself is disabled (commented out) below.
n_examples = 10

for ri in glob.glob(r):

    # output directory named after the last path component of the input
    outi = out + '/' + ri.rsplit('/')[-1]

    rii = glob.glob(ri + '/*/*detail*')
    # random.sample raises ValueError when asked for more items than exist,
    # so clamp the sample size for directories with fewer matches
    rii = random.sample(rii, min(n_examples, len(rii)))

#     print(rii)
#     os.makedirs(outi, exist_ok=True)
#     for riii in rii:
#         shutil.copy2(riii, outi)

In [None]:
def make_scaled_gray_colormap(_min, _max, max_value=2**8-1):
    '''
    Build a gray-level lookup table with 2**16 entries for three channels.

    Entries below ``_min`` are 0, entries from ``_max`` on are ``max_value``
    and the entries in between ramp linearly from 0 to ``max_value``.

    FIXME: colormap as expected by tifffile imsave?
    Does not work in macOS Preview though...
    NOTE(review): TIFF ColorMap entries are specified on a 0..65535 scale, so
    the 8-bit default ramp may be the reason -- try ``max_value=2**16-1``
    and confirm.

    Parameters
    ----------
    _min : int
        First index of the linear ramp.
    _max : int
        Index one past the end of the ramp; must not be smaller than ``_min``.
    max_value : int, optional
        Value of the brightest entries (default 255, the previous behavior).

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of shape ``(3, 2**16)``; all three rows are equal.
    '''
    cm = np.full((3, 2**16), max_value, dtype=np.uint16)
    cm[:, :_min] = 0
    # the float ramp is truncated to integers when written into the uint16 table
    cm[:, _min:_max] = np.linspace(0, max_value, _max - _min)
    return cm

In [None]:
import json
import warnings

# Manual good/bad annotation of example images.  When the annotation file
# does not exist, warn and continue with empty 'good' and 'bad' lists.
try:
    fd = open('/scratch/hoerl/dna_sir_ageing/examples_tiff/20201006sorting.json', 'r')
except FileNotFoundError:
    warnings.warn('No manual sorting found, classification will not work')
    sorting = {'good': [], 'bad': []}
else:
    with fd:
        sorting = json.load(fd)

In [None]:
import pathlib2
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

import numpy as np
import h5py as h5
from skimage.filters import threshold_otsu
import random
import shutil
import os

import matplotlib.pyplot as plt
from skimage.io.tifffile import imsave, TiffWriter, TiffFile
from skimage.transform import resize
from skimage.exposure import rescale_intensity
from itertools import repeat
import pandas as pd

# Manually annotated images are appended to these lists during feature extraction.
good_imgs, bad_imgs = [], []

# Shape the images are resized to before thresholding.
size = (32, 32)

def safe_otsu(img):
    """Return ``threshold_otsu(img)``, or 0 if that call raises ValueError."""
    try:
        threshold = threshold_otsu(img)
    except ValueError:
        threshold = 0
    return threshold

def _mean_or_zero(values):
    """Return the mean of ``values``; 0 if there are none or the mean is NaN."""
    if values.size == 0:
        # avoids the RuntimeWarning numpy emits for the mean of an empty array
        return 0
    mean = np.mean(values)
    return 0 if np.isnan(mean) else mean


def get_features(img, size):
    """
    Compute simple size and intensity features of one image.

    The image is resized to ``size`` (linear interpolation, value range
    preserved) and split into foreground and background with the threshold
    returned by ``safe_otsu``.

    Parameters
    ----------
    img : numpy.ndarray
        Input image.
    size : tuple
        Shape of the resized image used for thresholding.

    Returns
    -------
    tuple
        ``(img_size, black_size, fg_area, fg_mean, bg_mean)``:
        number of pixels of ``img``, number of pixels of ``img`` equal to 0,
        number of resized pixels at or above the threshold, mean of those
        pixels and mean of the resized pixels below the threshold.  A mean
        is 0 when its selection is empty.
    """
    img_rescaled = resize(img, size, order=1, mode='reflect', anti_aliasing=True, clip=False, preserve_range=True)

    img_size = img.size
    black_size = np.sum(img == 0)

    # compute the threshold and the foreground mask once instead of three times
    threshold = safe_otsu(img_rescaled)
    is_fg = img_rescaled >= threshold

    fg_area = np.sum(is_fg)
    fg_mean = _mean_or_zero(img_rescaled[is_fg])
    # explicit '<' (not ~is_fg) so NaN pixels stay excluded from both groups
    bg_mean = _mean_or_zero(img_rescaled[img_rescaled < threshold])

    return img_size, black_size, fg_area, fg_mean, bg_mean

In [None]:
# Collect features of every 'detail' image in all h5 files below `root_dir`
# into `res` (one list per column) and keep the manually annotated images
# in `good_imgs` / `bad_imgs`.
res = defaultdict(list)

# set lookups instead of scanning the annotation lists for every image
good_names = set(sorting['good'])
bad_names = set(sorting['bad'])

for class_dir in glob.glob(root_dir):

    # get h5 files one or two levels down
    h5s = glob.glob(class_dir + '/*/*.h5') + glob.glob(class_dir + '/*/*/*.h5')

    for h5i in h5s:
        # file name without the '.h5' suffix, prefix of the annotation keys
        h5_stem = pathlib2.Path(h5i).name[:-3]

        with h5.File(h5i, 'r') as fd:

            # get all detail images
            details = [key for key in fd['experiment'].keys() if 'detail' in key]

            with ThreadPoolExecutor() as tpe:
                # read each image once; it is handed to the worker and kept
                # for the annotation check below instead of being re-read
                imgs = []
                futures = []
                for d in details:
                    img = fd['experiment'][d]['0']['0'][...].squeeze()
                    imgs.append(img)
                    futures.append(tpe.submit(get_features, img, size))

                for d, img, f in zip(details, imgs, futures):
                    img_size, black_size, fg_area, fg_mean, bg_mean = f.result()

                    annotation_key = '_'.join([h5_stem, d])
                    if annotation_key in good_names:
                        classification = 'good'
                        good_imgs.append(img)
                    elif annotation_key in bad_names:
                        classification = 'bad'
                        bad_imgs.append(img)
                    else:
                        classification = 'unknown'

                    res['h5path'].append(h5i)
                    res['name'].append(d)
                    res['classification'].append(classification)
                    res['img_size'].append(img_size)
                    res['black_size'].append(black_size)
                    res['fg_area'].append(fg_area)
                    res['fg_mean'].append(fg_mean)
                    res['bg_mean'].append(bg_mean)

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

# Train a decision tree on the manually annotated images and predict a
# good/bad label for every image; then derive class/replicate columns.
df = pd.DataFrame.from_dict(res)

# features used by the classifier
# (alternative set: ['bg_mean', 'fg_mean', 'fg_area', 'black_size', 'img_size'])
feature_columns = ['black_size', 'img_size', 'fg_area']

# get manually annotated subset
df_for_classification = df[df.classification.isin(['good', 'bad'])]
xs = df_for_classification[feature_columns].values
ys = (df_for_classification.classification == 'good').values

# fit tree
cls = DecisionTreeClassifier()
cls.fit(xs, ys)

# predict for all
xs_all = df[feature_columns].values
class_predicted = ['good' if yp else 'bad' for yp in cls.predict(xs_all)]
df['classification_predicted'] = class_predicted

# paths containing 'test' are read one directory level further up
cell_class = df.h5path.apply(lambda p: p.split(os.sep)[-4 if 'test' in p else -3])
replicate = df.h5path.apply(lambda p: p.split(os.sep)[-3 if 'test' in p else -2].split('_')[-1])


df['cell_class'] = cell_class
df['replicate'] = replicate
df['fg_mean_sub_bg'] = df.fg_mean - df.bg_mean

df_good = df[df.classification_predicted == 'good']
# to skip classification
df_good = df

In [None]:
# describe() summary of the predicted labels for every cell class
df.groupby('cell_class').classification_predicted.describe()
# df.columns

In [None]:
# Save the feature/prediction table.  The DataFrame index is written as well
# (pandas default), as an unnamed first column of the CSV.
df.to_csv('/scratch/hoerl/dna_sir_ageing/20210430summary_withnew_samples.csv')

In [None]:
import pandas as pd
# Reload the saved summary table.
# - index_col=0: the file was written with its index, which would otherwise
#   come back as an extra 'Unnamed: 0' data column
# - replicate is read as text so it keeps the string type it had before
#   saving (it is concatenated into file names later)
df = pd.read_csv('/scratch/hoerl/dna_sir_ageing/20210430summary_withnew_samples.csv',
                 index_col=0, dtype={'replicate': str})
# independent copy, so columns can be added later without touching `df`
df_good = df[df.classification_predicted == 'good'].copy()
df_good.head()

In [None]:
# summary statistics of the selected images per (cell class, replicate)
df_good.groupby(by=['cell_class', 'replicate']).describe()

In [None]:
# Histograms of the background-subtracted foreground mean (fg_mean_sub_bg).
xlim_for_plot = (0, 750)
bins_for_plot = 50
vline_location = 50

# 1) pooled histogram per cell class, drawn by pandas into one figure
plt.figure(figsize=(10,10))
axs = df_good.hist('fg_mean_sub_bg', by=['cell_class',], ax=plt.gca(), sharex=True, bins=30, density=True)
for ax in axs.flat:
    ax.set_xlim(xlim_for_plot)
    ax.axvline(vline_location, color='red')


# 2) per-replicate histograms, one panel per cell class, two panels per row.
# The grid is sized from the number of classes; a fixed 4x2 grid dropped
# every class after the eighth and always hid the last panel.
class_groups = list(df_good.groupby('cell_class'))
n_cols = 2
n_rows = max(1, -(-len(class_groups) // n_cols))  # ceiling division
bin_edges = np.linspace(*xlim_for_plot, bins_for_plot)

# the figure is drawn twice: without and with the red marker line
for draw_vline in (False, True):
    fig, axs = plt.subplots(n_rows, n_cols, sharex=True, figsize=(10,10), squeeze=False)
    for (i, dfi), ax in zip(class_groups, axs.flat):
        for (rep, dfj) in dfi.groupby('replicate'):
            ax.hist(dfj.fg_mean_sub_bg.values, density=True, alpha=0.5,
                    bins=bin_edges, label=rep)
            # hist outlines - does not look as nice imho
#             h, bins = np.histogram(dfj.fg_mean_sub_bg.values, density=True, bins=np.linspace(0,250))
#             ax.plot((bins[1:] + bins[:-1])/2, h,label=rep)
        ax.set_title(i)
        ax.set_xlim(xlim_for_plot)
        if draw_vline:
            ax.axvline(vline_location, color='red')
    axs[0,0].legend()
    # hide only the panels that did not receive a cell class
    for ax in axs.flat[len(class_groups):]:
        ax.set_visible(False)

In [None]:
# summary statistics of the selected images per cell class
df_good.groupby(by=['cell_class']).describe()

In [None]:
# For each class listed in `classes`, keep the `n_best` images whose
# background-subtracted foreground mean is closest to `target_intensity`.
target_intensity = 50
n_best = 250

#classes = ['2020622_IMR90_untreated_old', '2020705_IMR90_young_untreated']
# classes = ['2020622_IMR90_untreated_old', '2020705_IMR90_young_untreated', '2020625_IMR90_3d_ICM_young', '2020629_IMR90_6d_ICM_young', '2020702_IMR90_9d_ICM_young']
# classes = ['20201208_IMR90_3day', '20201214_IMR90_9day', '2020622_IMR90_untreated_old']
classes = ['20210326_IMR90_young_untr', '20210402_IMR90_old']

# assign() returns a new frame: the distance column is not written into a
# filtered slice of `df` (SettingWithCopyWarning) or into `df` itself
df_good = df_good.assign(diff=np.abs(df_good.fg_mean_sub_bg - target_intensity))

dfs = {}

for ci, dfi in df_good.groupby('cell_class'):
    if ci in classes:
        # rows sorted by distance to the target, first n_best kept
        df_best = dfi.sort_values('diff').reset_index(drop=True).head(n_best)
        dfs[ci] = df_best

#dfs
# number of selected images vs. number of available images per class
[len(v) for v in dfs.values()], [(ci, len(dfi)) for ci, dfi in df_good.groupby('cell_class') if ci in classes]

In [None]:
from operator import add
from functools import reduce
from scipy.optimize import leastsq


# settings for the per-replicate selection: images kept per group, and
# all cell classes present in the table
n_best = 75
classes = df_good['cell_class'].unique()

def get_sum_diff(target_intensity, df_good, n_per_group=None):
    """
    Select the images closest to a target intensity in every group.

    For each (cell_class, replicate) group of ``df_good`` the rows are sorted
    by ``|fg_mean_sub_bg - target_intensity|`` and the first ``n_per_group``
    are kept.  ``df_good`` itself is not modified.

    Parameters
    ----------
    target_intensity : number or array broadcastable against the column
        Intensity the selected images should be close to.
    df_good : pandas.DataFrame
        Needs the columns ``cell_class``, ``replicate`` and ``fg_mean_sub_bg``.
    n_per_group : int, optional
        Number of rows kept per group; defaults to the module-level ``n_best``.

    Returns
    -------
    tuple
        ``(sum_dev, dfs)``: the summed distance of all kept rows (0 when
        there are no groups) and a dict mapping ``(cell_class, replicate)``
        to the kept rows, which carry the distance in a ``diff`` column.
    """
    if n_per_group is None:
        # fall back to the notebook-wide setting
        n_per_group = n_best

    scored = df_good.copy()
    scored['diff'] = np.abs(scored.fg_mean_sub_bg - target_intensity)

    dfs = {}
    for (ci, ri), dfi in scored.groupby(['cell_class', 'replicate']):
        df_best = dfi.sort_values('diff').reset_index(drop=True).head(n_per_group)
        dfs[(ci, ri)] = df_best

    # sum() yields 0 for no groups, where reduce(add, []) raised TypeError
    sum_dev = sum(np.sum(v['diff']) for v in dfs.values())
    return sum_dev, dfs

def f(ti):
    """Summed deviation of the selected images for target intensity ``ti``."""
    return get_sum_diff(ti, df_good)[0]


# search for the target intensity, starting at 50, then redo the selection
# with the value that was found
optimal_ti, _ = leastsq(f, 50)
_, dfs = get_sum_diff(optimal_ti, df_good)
optimal_ti

In [None]:
# number of selected images for every entry of `dfs`
for selected in dfs.values():
    print(len(selected))

In [None]:
import warnings

# Write the selected images as 16-bit TIFFs, one directory per entry of `dfs`.
for cell_type, dfi in dfs.items():

    # `dfs` is keyed either by class name or by (class, replicate) tuples,
    # depending on which selection cell ran last; a tuple key would otherwise
    # end up in the path as "('class', 'replicate')"
    if isinstance(cell_type, tuple):
        dir_name = '_'.join(str(part) for part in cell_type)
    else:
        dir_name = str(cell_type)

    out = '/scratch/hoerl/dna_sir_ageing/examples_tiff_n_200_optimal_intensity_16bit/{}'.format(dir_name)
    os.makedirs(out, exist_ok=True)

    for h5file, name, rep in zip(dfi.h5path, dfi.name, dfi.replicate):
        #print(rep)

        with h5.File(h5file, 'r') as fd:
            dat = fd['experiment'][name]['0']['0']
            img = dat[...].astype(np.uint16)
#             img_uint8 = rescale_intensity(img, out_range='uint8')
#             img_uint8 = img_uint8.astype(np.uint8)

            # <h5 file name without '.h5'>_<replicate>_<image name>.tif;
            # format() also accepts replicate ids that are not strings
            # (e.g. parsed as numbers when the table is reloaded from CSV)
            tif_name = '{}_{}_{}.tif'.format(h5file.rsplit(os.sep)[-1][:-3], rep, name)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                imsave(os.path.join(out, tif_name), img, compress=5)

In [None]:
import warnings
from skimage.exposure import rescale_intensity

# Write the selected images as contrast-stretched 8-bit TIFFs, one directory
# per (cell class, replicate) entry of `dfs`.
for (cell_type, rep), dfi in dfs.items():

    # format() also accepts replicate ids that are not strings
    out = '/scratch/hoerl/dna_sir_ageing/examples_tiff_n_200_optimal_intensity_8bit_repl/{}_{}'.format(cell_type, rep)
    os.makedirs(out, exist_ok=True)

    # img_rep: per-row replicate, named differently so it does not shadow `rep`
    for h5file, name, img_rep in zip(dfi.h5path, dfi.name, dfi.replicate):
        #print(img_rep)

        with h5.File(h5file, 'r') as fd:
            dat = fd['experiment'][name]['0']['0']
            img = dat[...].astype(np.uint16)
            # stretch the 1% .. 99.5% quantile range of the image to 0..255
            img_uint8 = rescale_intensity(img, in_range=tuple(np.quantile(img, [0.01, 0.995])), out_range='uint8')
            img_uint8 = img_uint8.astype(np.uint8)

            # <h5 file name without '.h5'>_<replicate>_<image name>.tif
            tif_name = '{}_{}_{}.tif'.format(h5file.rsplit(os.sep)[-1][:-3], img_rep, name)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                imsave(os.path.join(out, tif_name), img_uint8, compress=5)

## QC: plot a few good/bad images for each cell class

In [None]:
# Show a few random images predicted 'good' and 'bad' for every cell class.
samples_to_plot = 5

for ci, dfi in df.groupby('cell_class'):
    good_i = dfi[dfi.classification_predicted == 'good']
    bad_i = dfi[dfi.classification_predicted == 'bad']
    print(ci, len(good_i), len(bad_i))

    # DataFrame.sample raises ValueError when asked for more rows than
    # exist, so clamp the sample size for small groups
    good_s = good_i.sample(min(samples_to_plot, len(good_i)))
    bad_s = bad_i.sample(min(samples_to_plot, len(bad_i)))

    # predicted-good images are drawn in gray, predicted-bad ones in magma
    for subset, cmap in ((good_s, 'gray'), (bad_s, 'magma')):
        for h5file, name in zip(subset.h5path, subset.name):
            with h5.File(h5file, 'r') as fd:
                dat = fd['experiment'][name]['0']['0']
                img = dat[...]
                plt.figure()
                plt.imshow(img.squeeze(), cmap=cmap)
                plt.show()

In [None]:

# Show the thresholded thumbnails of a few manually annotated images:
# 'good' ones in green, 'bad' ones in red.
sample_size = 8
size = (32,32)

for group_index, (images, cmap) in enumerate(((good_imgs, 'Greens'), (bad_imgs, 'Reds'))):
    if group_index:
        print()
    # random.sample raises ValueError if fewer images than requested exist
    for im in random.sample(images, min(sample_size, len(images))):
        plt.figure()
        thumb = resize(im, size, order=1, mode='reflect', anti_aliasing=True, clip=False, preserve_range=True)
        # safe_otsu (as used for the features) falls back to a threshold of 0
        # when threshold_otsu rejects the image with a ValueError
        plt.imshow(thumb > safe_otsu(thumb), cmap=cmap)

In [None]:
size = (32,32)


def _as_flat_thumbnails(images):
    """Resize every image to ``size`` and flatten it into one row per image."""
    thumbs = [
        resize(im, size, order=1, mode='reflect', anti_aliasing=True, clip=False, preserve_range=True)
        for im in images
    ]
    return np.array(thumbs).reshape((len(images), -1))


# one flattened thumbnail per manually annotated image
good_feats = _as_flat_thumbnails(good_imgs)
bad_feats = _as_flat_thumbnails(bad_imgs)

In [None]:


# Cross-validate a random forest on the manually annotated images
# (label 1 = good, 0 = bad).
# Rows of xs: flattened thumbnails, good images first, then bad images.
xs = np.concatenate([good_feats, bad_feats])
ys = np.concatenate([np.full((len(good_imgs),), 1), np.full((len(bad_imgs),), 0)])

# per-image pixel count and number of zero-valued pixels (original images)
img_sizes = np.concatenate([np.array([[i.size] for i in good_imgs]), np.array([[i.size] for i in bad_imgs])])
black_sizes = np.concatenate([np.array([[np.sum(i==0)] for i in good_imgs]), np.array([[np.sum(i==0)] for i in bad_imgs])])

# threshold-based features, computed on the thumbnail rows of xs
# (must run before xs is reassigned below)
fg_area = np.array([[np.sum(i >= safe_otsu(i))] for i in xs])
fg_mean = np.array([[np.mean(i[i >= safe_otsu(i)])] for i in xs])
bg_mean = np.array([[np.mean(i[i < safe_otsu(i)])] for i in xs])

# mean of an empty selection is NaN -> use 0 instead
fg_mean[np.isnan(fg_mean)] = 0
bg_mean[np.isnan(bg_mean)] = 0





pca = PCA()
pca.fit(xs)

# first 16 PCA components of the thumbnails
# NOTE(review): this value is replaced by the next assignment, so the PCA
# features never reach the classifier -- confirm that this is intended
xs = pca.transform(xs)[:,:16]

# feature matrix actually used: one column per handcrafted feature
xs = np.concatenate([ fg_mean, bg_mean, fg_area, img_sizes, black_sizes], 1)


cls = RandomForestClassifier()
cross_val_score(cls, xs, ys)

#(fg_mean - bg_mean).astype(np.int)

In [None]:
# Glob pattern for the input directories (each match is processed separately
# by the loop below) and root directory for the exported examples.
root_dir = '/data/cooperation_data/Preliminary_projects/AgyrisPapantonis_ChromatinTexture_AgeingCells/auto_sir_ageing-cells/*'
out = '/scratch/hoerl/dna_sir_ageing/examples_tiff'

import random
import shutil
import os

import matplotlib.pyplot as plt
from skimage.external.tifffile import imsave, TiffWriter, TiffFile
from skimage.transform import resize
from skimage.exposure import rescale_intensity
from itertools import repeat


# Pick up to `n_images_per_class` random 'detail' images from every directory
# matched by `root_dir` and show a 32x32 preview.  Writing the TIFF files is
# disabled (commented out) at the end of the loop.
n_images_per_class = 2
#cm = make_scaled_gray_colormap(0, 300)

for ri in glob.glob(root_dir):

    outi = out + '/' + ri.rsplit('/')[-1]
    h5s = glob.glob(ri + '/*/*.h5') + glob.glob(ri + '/*/*/*.h5')

    # get all (h5-file, image-name) pairs
    selection = []
    for h5i in h5s:
        with h5.File(h5i, 'r') as fd:
            details = [key for key in fd['experiment'].keys() if 'detail' in key]
            selection.extend(zip(repeat(h5i), details))

    # random.sample raises ValueError when fewer images than requested
    # exist, so clamp the sample size
    selection = random.sample(selection, min(n_images_per_class, len(selection)))
    os.makedirs(outi, exist_ok=True)

    for h5i, name in selection:
        with h5.File(h5i, 'r') as fd:
            dat = fd['experiment'][name]['0']['0']
            img = dat[...].astype(np.uint16)

            plt.figure()
            plt.imshow(resize(img.squeeze(), (32,32), order=1, clip=False))

            # rescaled from image range to uint8, otherwise e.g. Texture features in CellProfiler often
            # return 1/0?
            img_uint8 = rescale_intensity(img, out_range='uint8')
            img_uint8 = img_uint8.astype(np.uint8)
            #imsave(os.path.join(outi, h5i.rsplit(os.sep)[-1][:-3] + '_' + name + '.tif'), img)
            #imsave(os.path.join(outi, h5i.rsplit(os.sep)[-1][:-3] + '_' + name + '8bit.tif'), img_uint8)