In [None]:
import os, sys, re, io, pathlib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import cudf
# import cuml
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from sklearn.preprocessing import normalize

# idx = pd.IndexSlice
# in-memory text buffer
buffer = io.StringIO()

# make the directory two levels above the working directory importable
# (the notebooks are expected to live inside lab_utils)
labutilspath = str(pathlib.Path.cwd().parents[1])
sys.path.append(labutilspath)

# import the autoscan routines (resolved through the path appended above)
from autoscan import autoscan

# autoscan helper object; its column lists (ftir_cols, ftir_lambdas, meta_cols) are used below
pp = autoscan.basics(material_info = True)

def ix_before_and_after(ds, index = ['tag', 'subtag'], columns = 'experiment', values = 'ix', mask = None, subset = None, aggfunc = lambda x: [*x],
                        dropna_tresh = 2, chain = True):
    """Collect the index labels of samples that are present under several pivot columns.

    The index labels of ``ds`` are stored in an ``'ix'`` column, the frame is
    pivoted (``index`` x ``columns``) with ``aggfunc`` collecting the labels of
    every cell, and pivot rows with fewer than ``dropna_tresh`` non-empty cells
    are discarded.

    Parameters
    ----------
    ds : pandas.DataFrame
        Description table; it is NOT modified (the function works on a copy).
    index, columns, values, aggfunc
        Forwarded to ``DataFrame.pivot_table``.
    mask : boolean indexer, optional
        Row selection applied with ``ds.loc[mask, :]`` before pivoting.
    subset : list, optional
        Pivot columns to keep (in this order) before dropping sparse rows.
    dropna_tresh : int
        Minimum number of non-empty cells a pivot row needs to be kept.
    chain : bool
        If True return a flat list of index labels, otherwise the pivot table.

    Returns
    -------
    list or pandas.DataFrame
        Flat list of labels (row by row, column by column) when ``chain`` is
        True, else the filtered pivot table.
    """
    # assign() returns a new frame, so the caller's frame does not gain an 'ix' column
    ds = ds.assign(ix = ds.index.values)
    if mask is not None:
        ds = ds.loc[mask, :]
    dx = ds.pivot_table(index = index,
                        columns = columns,
                        values = values,
                        aggfunc = aggfunc)
    if subset is not None:
        dx = dx.loc[:, subset]
    # non in-place: dx may be a slice of the pivot table at this point
    dx = dx.dropna(thresh = dropna_tresh)

    if not chain:
        return dx

    # flatten the per-cell label lists; empty cells hold NaN (a float) and are skipped
    return [label
            for row in dx.itertuples(index = False, name = None)
            for cell in row
            if not isinstance(cell, float)
            for label in cell]

def set_spe_style(ax, title = '', xlabel = '', ylabel=''):
    """Make `ax` the current axes and give it a bold title, axis labels and tick labels.

    Returns the same `ax` so the call can be chained.
    """
    bold = {'fontweight': 'bold'}
    plt.sca(ax)
    plt.title(title, **bold)
    plt.xlabel(xlabel, **bold)
    plt.ylabel(ylabel, **bold)
    # called without positions: only restyles the existing tick labels
    plt.xticks(**bold)
    plt.yticks(**bold)
    return ax

# rcParams shared by all figures: 12 pt bold text for titles, axis labels,
# tick labels and legends, with a framed legend
rc_dict = {
    "font.size":12,
    'font.weight':'bold',
    "axes.titlesize":12,
    "axes.titleweight":'bold',
    "axes.labelsize":12,
    'axes.labelweight':'bold',
    'xtick.labelsize':12,
    'ytick.labelsize':12,
    'legend.frameon':True,
    'legend.fontsize':12,
    'legend.title_fontsize':12,
}

# NOTE(review): per the seaborn docs, sns.set() applies its own default context
# ('notebook') before the rc overrides, so the 'paper' context selected here is
# presumably replaced by the sns.set() call below — confirm which one is intended.
sns.set_context("paper", 
                rc = rc_dict)

# global seaborn theme with the rc overrides defined above
sns.set(rc = rc_dict)

# grey background with grid lines
sns.set_style('darkgrid')

In [None]:
# input / output locations
datapath = '/sandbox/data/autoscan/'      # folder holding the HDF5 data file
vispath = '/sandbox/vis/autoscan/'        # root folder for generated figures
datafname = 'autoscan_corrected.h5'
savepath = datapath                       # derived data is stored next to the input

# full path of the data file and folder of the figures written by this notebook
datafile = os.path.join(datapath, datafname)
figspath = os.path.join(vispath, 'rock_multiphysics_display')

In [None]:
# make sure the output folders exist; exist_ok avoids the check-then-create race
for p in [figspath, savepath]:
    os.makedirs(p, exist_ok = True)

In [None]:
# read the measurements ('data') and their descriptions ('desc') and join them on the index
dd = pd.read_hdf(datafile, key = 'data')
ds = pd.read_hdf(datafile, key = 'desc')
df = ds.join(dd)
del dd, ds

# per rock family, replace 'before' permeabilities above the family limit
# by the mean of the 'before' permeabilities that are not above it
perm_limits = {'sandstone':1e3, "shale":1e2, 'carbonate':1e3}
for k, v in perm_limits.items():
    ix = df.loc[(df.family == k) & (df.instance == 'before'), 'perm'] > v
    within, above = ix.index[~ix], ix.index[ix]
    fill_val = df.loc[within, 'perm'].mean()
    df.loc[above, 'perm'] = fill_val

# discard metals and gemstones, then order by instance (descending) with a fresh 0..n-1 index
df = (
    df.query("family != 'metal' & family != 'gemstones'")
      .copy()
      .sort_values(by = 'instance', ascending = False, ignore_index = True)
)

In [None]:
# max-normalise (sklearn normalize, norm = 'max') the FTIR columns of every row
# that has a complete spectrum; rows with missing FTIR values are left untouched
ix = df.dropna(subset = pp.ftir_cols).index
df.loc[ix, pp.ftir_cols] = normalize(df.loc[ix, pp.ftir_cols], norm = 'max')
del ix

In [None]:

# df_ftir_bna = df.loc[ix, pp.meta_cols + pp.grid_cols + pp.ftir_cols]

In [None]:
# FTIR table: four descriptive columns followed by the spectrum; rows with any missing value are dropped
df_ftir = df.loc[:, ['family', 'tag','instance', 'l_max_peak'] + pp.ftir_cols].dropna()
# relabel the spectral columns with the entries of pp.ftir_lambdas
newcolumns = [*df_ftir.columns[:4], *pp.ftir_lambdas]
df_ftir.columns = newcolumns
# keep the df index as a regular 'index' column (column 0) and use a 0..n-1 index
df_ftir = df_ftir.reset_index(drop = False)
df_ftir.head()

In [None]:
# report the data set size; only the spectral columns are features
# (df_ftir also holds 'index', 'family', 'tag', 'instance' and 'l_max_peak')
n_samples = df_ftir.shape[0]
n_features = len(list(pp.ftir_lambdas))
print("number of samples: %d, number of features: %d" %(n_samples, n_features))

In [None]:
# k-means on the FTIR spectra of all samples
nclusters = 10
kmeans = KMeans(n_clusters = nclusters, max_iter = 10000, init = 'scalable-k-means++', n_init = 10)
# columns 5+ hold the spectrum; a 'cluster' column left by a previous run of this
# cell must not be used as a feature, so it is dropped when present
ftir_features = df_ftir.iloc[:, 5:].drop(columns = 'cluster', errors = 'ignore')
kmeans.fit(ftir_features)
# the label column is appended last; later cells slice the features as iloc[:, 5:-1]
df_ftir.loc[:, 'cluster'] = kmeans.labels_

In [None]:
# cluster label of every sample, grouped by rock family and split by instance
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_ftir, x = 'family', y = 'cluster', hue = 'instance', dodge = True, ax = ax)
ax.set_title('Distribution of clusters per rock family before and after')
fig.savefig(os.path.join(figspath, 'all-rocks_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# fig, ax = plt.subplots(figsize = (24, 12))
# sns.kdeplot(x = 'l_max_peak', y = 'cluster', hue = 'family', shade = True, alpha = 0.5, data = df_ftir.query("instance == 'before'"), ax = ax, 
#             palette = 'deep')

In [None]:
# fig, ax = plt.subplots(figsize = (24, 12))
# sns.kdeplot(x = 'l_max_peak', y = 'cluster', hue = 'family', shade = True, alpha = 0.5, data = df_ftir.query("instance == 'after'"), ax = ax, 
#             palette = 'deep')

In [None]:
# split violins of the cluster labels per rock family (one half per instance)
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_ftir, split = True, orient = 'v', inner = 'quartile', scale = 'area', ax = ax)
# one tick per cluster label (was hard-coded to 6 although nclusters clusters are fitted)
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Cluster distribution per rock family before and after')
fig.savefig(os.path.join(figspath, 'all-rocks_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# non-spectral columns: the first five columns of df_ftir plus the k-means label
left_cols = [*df_ftir.columns[:5], 'cluster']

In [None]:
# for p in [5, 10, 15, 30, 50]:
#     tsne = TSNE(n_components = 2, n_iter = 50000, angle = 0.8, n_neighbors = int(4 * p), perplexity = p, random_state = 1)
    
#     x_hat = tsne.fit_transform(df_ftir.iloc[:, 4:-1])
#     x_hat = df_ftir.loc[:, left_cols].join(pd.DataFrame(x_hat, columns = ['u', 'v']))
    
#     fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
#     sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', style = 'family', data = x_hat.query("instance == 'before'"), ax = ax[0])
#     sns.scatterplot(x = 'u', y = 'v', hue = 'cluster', style = 'family', data = x_hat.query("instance == 'after' "), ax = ax[1])
#     ax[0].set_title('before')
#     ax[1].set_title('after')
#     fig.suptitle('tSNE before & after for shale, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold')
# t-SNE embedding of all samples
perplexity = p = 50
neighbors = int(3 * p)
print('neighbors = %d' % (neighbors))
tsne = TSNE(
    n_components = 2,
    n_iter = 10000,
    angle = 0.8,
    n_neighbors = neighbors,
    perplexity = p,
    random_state = 5,
    learning_rate = 10,
)

# features: column 5 up to, but excluding, the last column
# (assumes the last column is the 'cluster' label added by the k-means cell)
X_hat = tsne.fit_transform(df_ftir.iloc[:, 5:-1].values)

In [None]:
# put the 2-D embedding next to the descriptive columns and the cluster label
x_hat = pd.concat([df_ftir.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

# one colour scale for both panels: without it each panel normalises the numeric hue
# on its own subset, so a cluster could get different colours in 'before' and 'after'
hue_norm = (x_hat['cluster'].min(), x_hat['cluster'].max())
scatter_kws = dict(x = 'u', y = 'v', hue = 'cluster', hue_norm = hue_norm, palette = 'viridis', legend = 'full')

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)
sns.scatterplot(data = x_hat.query("instance == 'before'"), ax = ax[0], **scatter_kws)
sns.scatterplot(data = x_hat.query("instance == 'after' "), ax = ax[1], **scatter_kws)
ax[0].set_title('before')
ax[1].set_title('after')
fig.suptitle('tSNE before & after for all samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'all-rocks_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# rows of df whose FTIR spectrum has no missing value
mask = df.loc[:, pp.ftir_cols].notna().all(axis = 1)
# df index labels of the samples kept by ix_before_and_after for the listed experiments
ix = ix_before_and_after(df.loc[:, pp.meta_cols], mask = mask, subset = ['before', 'heat_treatment', 'perf'])
# select those rows of df_ftir through its 'index' column, then restore a 0..n-1 index
df_ftir_bna = df_ftir.set_index('index').loc[ix, :].reset_index(drop = False)

In [None]:
# k-means on the before/after subset
nclusters = 10
kmeans_bna = KMeans(
    n_clusters = nclusters,
    max_iter = 10000,
    init = 'scalable-k-means++',
    n_init = 10,
)
# features: column 5 up to, but excluding, the last column
# (assumes the last column is the 'cluster' label inherited from df_ftir)
kmeans_bna.fit(df_ftir_bna.iloc[:, 5:-1])
df_ftir_bna.loc[:, 'cluster'] = kmeans_bna.labels_

In [None]:
# cluster label of every before/after sample, grouped by rock family and split by instance
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_ftir_bna, x = 'family', y = 'cluster', hue = 'instance', dodge = True, ax = ax)
# one tick per cluster label
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Distribution of clusters per rock family before and after')
fig.savefig(os.path.join(figspath, 'all-rocks-bna_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# split violins of the cluster labels per rock family for the before/after subset
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_ftir_bna, split = True, orient = 'v', inner = 'quartile', scale = 'area', ax = ax)
# one tick per cluster label
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
# title spelling fixed ("Cluter ... pero" -> "Cluster ... per")
ax.set_title('Cluster distribution per rock family before and after')
fig.savefig(os.path.join(figspath, 'all-rocks-bna_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# t-SNE embedding of the before/after subset (exact method)
perplexity = p = 30
neighbors = int(5 * p)
tsne = TSNE(
    n_components = 2,
    n_iter = 10000,
    angle = 0.8,
    n_neighbors = neighbors,
    perplexity = p,
    random_state = 5,
    learning_rate = 10,
    method = 'exact',
)

# features: column 5 up to, but excluding, the trailing 'cluster' column
X_hat = tsne.fit_transform(df_ftir_bna.iloc[:, 5:-1].values)

In [None]:
# put the 2-D embedding next to the descriptive columns and the cluster label
x_hat = pd.concat([df_ftir_bna.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

# one colour scale for both panels: without it each panel normalises the numeric hue
# on its own subset, so a cluster could get different colours in 'before' and 'after'
hue_norm = (x_hat['cluster'].min(), x_hat['cluster'].max())
scatter_kws = dict(x = 'u', y = 'v', hue = 'cluster', hue_norm = hue_norm, palette = 'viridis', legend = 'full')

fig, ax = plt.subplots(ncols = 2, figsize = (24, 12.1), sharey = True)
fig.subplots_adjust(wspace = 0.05)
sns.scatterplot(data = x_hat.query("instance == 'before'"), ax = ax[0], **scatter_kws)
sns.scatterplot(data = x_hat.query("instance == 'after' "), ax = ax[1], **scatter_kws)
ax[0].set_title('before')
ax[1].set_title('after')
fig.suptitle('tSNE before & after for all samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'all-rocks-bna_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# rows of df_ftir tagged 'sh_001', as an independent frame with a 0..n-1 index
df_temp = (
    df_ftir.query("tag == 'sh_001'")
           .copy()
           .reset_index(drop = True)
)

In [None]:
# k-means on the rows currently held in df_temp
nclusters = 4
kmeans_sh = KMeans(
    n_clusters = nclusters,
    max_iter = 10000,
    init = 'scalable-k-means++',
    n_init = 10,
)
# features: column 5 up to, but excluding, the last column
# (assumes the last column is the 'cluster' label inherited from df_ftir)
kmeans_sh.fit(df_temp.iloc[:, 5:-1])
df_temp.loc[:, 'cluster'] = kmeans_sh.labels_

In [None]:
# cluster label of every row of df_temp per instance, coloured by cluster
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(x = 'instance', y = 'cluster', hue = 'cluster', dodge = False, data = df_temp, ax = ax)
# one tick per cluster label
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
# title spelling fixed ("Cluter" -> "Cluster")
ax.set_title('Cluster distribution before and after for shale sample sh_1')
fig.savefig(os.path.join(figspath, 'sh-1_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# split violin of the cluster labels in df_temp (one half per instance)
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(x = 'family', y = 'cluster', hue = 'instance', data = df_temp, split = True, orient = 'v', inner = 'quartile', scale = 'count', ax = ax)
# blank out the x label and the tick label at position 0
ax.set_xlabel('')
ax.set_xticks([0])
ax.set_xticklabels([''])
# one tick per cluster label, derived from nclusters instead of a hard-coded list
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
# title spelling fixed ("Cluter" -> "Cluster")
ax.set_title('Cluster distribution before and after for shale sample sh_1')
fig.savefig(os.path.join(figspath, 'sh-1_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# t-SNE embedding of the rows in df_temp (exact method)
perplexity = p = 50
neighbors = int(3 * p)
print('neighbors = %d' % (neighbors))
tsne = TSNE(
    n_components = 2,
    n_iter = 10000,
    angle = 0.8,
    n_neighbors = neighbors,
    perplexity = p,
    random_state = 5,
    learning_rate = 10,
    method = 'exact',
)

# features: column 5 up to, but excluding, the trailing 'cluster' column
X_hat = tsne.fit_transform(df_temp.iloc[:, 5:-1].values)

In [None]:
# put the 2-D embedding next to the descriptive columns and the cluster label
x_hat = pd.concat([df_temp.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

# one colour scale for both panels: without it each panel normalises the numeric hue
# on its own subset, so a cluster could get different colours in 'before' and 'after'
hue_norm = (x_hat['cluster'].min(), x_hat['cluster'].max())
scatter_kws = dict(x = 'u', y = 'v', hue = 'cluster', hue_norm = hue_norm, palette = 'viridis', legend = 'full')

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)
sns.scatterplot(data = x_hat.query("instance == 'before'"), ax = ax[0], **scatter_kws)
sns.scatterplot(data = x_hat.query("instance == 'after' "), ax = ax[1], **scatter_kws)
ax[0].set_title('before')
ax[1].set_title('after')
fig.suptitle('tSNE before & after for shale sample sh_1, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'sh-1_tsne.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# rows of df_ftir belonging to the shale family, as an independent frame with a 0..n-1 index
df_temp = (
    df_ftir.query("family == 'shale'")
           .copy()
           .reset_index(drop = True)
)

In [None]:
# k-means on the rows currently held in df_temp
nclusters = 10
kmeans_sh = KMeans(
    n_clusters = nclusters,
    max_iter = 10000,
    init = 'scalable-k-means++',
    n_init = 10,
)
# features: column 5 up to, but excluding, the last column
# (assumes the last column is the 'cluster' label inherited from df_ftir)
kmeans_sh.fit(df_temp.iloc[:, 5:-1])
df_temp.loc[:, 'cluster'] = kmeans_sh.labels_

In [None]:
# cluster label of every row of df_temp per instance, coloured by cluster
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_temp, x = 'instance', y = 'cluster', hue = 'cluster', dodge = False, ax = ax)
# one tick per cluster label
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Cluster distribution before and after for shale samples')
fig.savefig(os.path.join(figspath, 'shale_kmeans_stripplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# split violin of the cluster labels in df_temp (one half per instance)
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(data = df_temp, x = 'family', y = 'cluster', hue = 'instance', split = True, orient = 'v', inner = 'quartile', scale = 'count', ax = ax)
# blank out the x label and the tick label at position 0
ax.set_xlabel('')
ax.set_xticks([0])
ax.set_xticklabels([''])
# one tick per cluster label
ax.set_yticks(range(nclusters))
ax.set_yticklabels(range(nclusters))
ax.set_title('Cluster distribution before and after for shale samples')
fig.savefig(os.path.join(figspath, 'shale_kmeans_violinplot.png'), dpi = 300, bbox_inches = 'tight')

In [None]:
# t-SNE embedding of the rows in df_temp (exact method)
perplexity = p = 50
neighbors = int(3 * p)
tsne = TSNE(
    n_components = 2,
    n_iter = 10000,
    angle = 0.8,
    n_neighbors = neighbors,
    perplexity = p,
    random_state = 5,
    learning_rate = 10,
    method = 'exact',
)
# features: column 5 up to, but excluding, the trailing 'cluster' column
X_hat = tsne.fit_transform(df_temp.iloc[:, 5:-1].values)

In [None]:
# put the 2-D embedding next to the descriptive columns and the cluster label
x_hat = pd.concat([df_temp.loc[:, left_cols], pd.DataFrame(X_hat, columns = ['u', 'v'])], axis = 1)

# one colour scale for both panels: without it each panel normalises the numeric hue
# on its own subset, so a cluster could get different colours in 'before' and 'after'
hue_norm = (x_hat['cluster'].min(), x_hat['cluster'].max())
scatter_kws = dict(x = 'u', y = 'v', hue = 'cluster', hue_norm = hue_norm, palette = 'viridis', legend = 'full')

fig, ax = plt.subplots(ncols = 2, figsize = (24, 13), sharey = True)
fig.subplots_adjust(wspace = 0.05)
sns.scatterplot(data = x_hat.query("instance == 'before'"), ax = ax[0], **scatter_kws)
sns.scatterplot(data = x_hat.query("instance == 'after' "), ax = ax[1], **scatter_kws)
ax[0].set_title('before')
ax[1].set_title('after')
fig.suptitle('tSNE before & after for shale samples, perplexity = ' + str(p), fontsize = 12, fontweight = 'bold', y = 0.92)
fig.savefig(os.path.join(figspath, 'shale_tsne.png'), dpi = 300, bbox_inches = 'tight')