In [None]:
# setup: imports, helper functions, and plot styling
import os, sys, re, io, pathlib
import pandas as pd
import hiplot as hip
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools

# shorthand for label-based slicing, plus an in-memory text buffer
idx = pd.IndexSlice
buffer = io.StringIO()

# make the directory two levels above the working directory importable
# (original note: notebooks in lab_utils)
labutilspath = str(pathlib.Path(os.getcwd()).parents[1])
if labutilspath not in sys.path:
    # guard keeps sys.path from growing every time this cell is re-run
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

# helper object; later cells read its *_cols attributes for column names
pp = autoscan.basics(material_info = True)

def hip_visualize(df, pcols = None, index = ['family', 'code']):
    """Show selected columns of ``df`` as a HiPlot experiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; its index is reset first, so index levels become
        ordinary columns.
    pcols : str or list-like of str, optional
        Columns to plot next to ``index``. When omitted, every column left
        after the index reset that is not listed in ``index`` is used
        (previously the ``None`` default ended up as a ``None`` column label
        and the column selection failed).
    index : str or list-like of str
        Identifier columns placed first in the experiment.

    Returns
    -------
    The object returned by ``hip.Experiment.from_dataframe``, after its
    ``colormap`` was set and ``display()`` was called.
    """
    id_cols = [index] if isinstance(index, str) else list(index)
    flat = df.reset_index()
    if pcols is None:
        plot_cols = [c for c in flat.columns if c not in id_cols]
    elif isinstance(pcols, str):
        plot_cols = [pcols]
    else:
        plot_cols = list(pcols)
    # keep the first occurrence only, so a name listed twice yields one column
    wanted = list(dict.fromkeys(id_cols + plot_cols))
    dp = flat.loc[:, wanted]
    s = hip.Experiment.from_dataframe(dp)
    s.colormap = 'interpolateViridis'
    s.display()
    return s

def ix_before_and_after(ds, index = ['tag', 'subtag'], columns = 'experiment', values = 'ix', mask = None, subset = None):
    """Return row labels of groups that have data under at least two experiments.

    The rows of ``ds`` are pivoted so that each ``index`` group becomes one
    row and each distinct value of ``columns`` becomes one column whose cells
    hold the list of ``ds`` row labels in that group/experiment. Groups with
    fewer than two non-empty cells are dropped; the labels of the remaining
    cells are returned as one flat list (pivot row order, then column order).

    Parameters
    ----------
    ds : pandas.DataFrame
        Description table. It is not modified; a copy receives the helper
        column ``'ix'`` holding the row labels.
    index : str or list of str
        Column(s) identifying a group.
    columns : str
        Column whose values become the pivot columns.
    values : str
        Column aggregated into lists (``'ix'`` is the row-label helper column).
    mask : boolean indexer, optional
        Row selection applied with ``.loc`` before pivoting.
    subset : list, optional
        Pivot columns to consider. Entries that do not occur in the
        (masked) data are treated as all-missing instead of raising.

    Returns
    -------
    list
        Row labels of ``ds``; empty when no row is selected or no group
        qualifies.
    """
    # copy via assign so the caller's frame does not gain an 'ix' column
    work = ds.assign(ix = ds.index.values)
    if mask is not None:
        work = work.loc[mask, :]
    if work.empty:
        return []
    dx = work.pivot_table(index = index,
                          columns = columns,
                          values = values,
                          aggfunc = lambda x: [*x])
    if subset is not None:
        # reindex tolerates experiments that have no rows left after masking
        dx = dx.reindex(columns = subset)
    dx = dx.dropna(thresh = 2)

    # cells are either lists of row labels or NaN; keep the lists only
    ix = []
    for row in dx.itertuples(index = False, name = None):
        for cell in row:
            if isinstance(cell, list):
                ix.extend(cell)
    return ix

def set_spe_style(ax, title = '', xlabel = '', ylabel=''):
    """Make ``ax`` the current axes and apply bold title, axis labels and ticks.

    The title and both axis labels are set to the given strings; the existing
    tick labels are only re-styled. Returns the same ``ax``.
    """
    bold = {'fontweight': 'bold'}
    plt.sca(ax)
    plt.title(title, **bold)
    plt.xlabel(xlabel, **bold)
    plt.ylabel(ylabel, **bold)
    # called without tick positions: only the text properties are updated
    for restyle_ticks in (plt.xticks, plt.yticks):
        restyle_ticks(**bold)
    return ax

# rc overrides shared by every figure: 12 pt sizes, bold text, framed legend.
# (tick-label and legend font-weight keys were tried and left disabled)
rc_dict = dict([
    ("font.size", 12),
    ('font.weight', 'bold'),
    ("axes.titlesize", 12),
    ("axes.titleweight", 'bold'),
    ("axes.labelsize", 12),
    ('axes.labelweight', 'bold'),
    ('xtick.labelsize', 12),
    ('ytick.labelsize', 12),
    ('legend.frameon', True),
    ('legend.fontsize', 12),
    ('legend.title_fontsize', 12),
])

sns.set_context("paper", rc = rc_dict)

# NOTE(review): sns.set() applies its own default context, so the "paper"
# context chosen just above is probably overridden here — confirm which one
# is intended.
sns.set(rc = rc_dict)

sns.set_style('darkgrid')

In [None]:
# input side: folder and file name of the corrected autoscan table
datapath = '/sandbox/data/autoscan/'
datafname = 'autoscan_corrected.h5'
datafile = os.path.join(datapath, datafname)
savepath = datapath

# output side: folder that receives the figures written by this notebook
vispath = '/sandbox/vis/autoscan/'
figspath = os.path.join(vispath, 'rock_multiphysics_display')

In [None]:
# read the 'data' and 'desc' tables from the HDF5 file (in that order),
# then join them on their index with 'desc' on the left
dd, ds = (pd.read_hdf(datafile, key = table) for table in ('data', 'desc'))
df = ds.join(dd)

In [None]:
# For each family, 'before' permeabilities above the family threshold are
# replaced by the mean of that family's remaining 'before' values.
for k, v in {'sandstone':1e3, "shale":1e2, 'carbonate':1e3}.items():
    # boolean Series over the family's 'before' rows: True where perm exceeds v
    ix = df.loc[(df.instance == 'before') & (df.family == k), 'perm'].gt(v)
    fill_val = df.loc[ix[~ix].index, 'perm'].mean()
    df.loc[ix[ix].index, 'perm'] = fill_val

In [None]:
# drop the 'metal' and 'gemstones' families, then order rows by 'instance'
# in descending order
df_rocks = (
    df.query("family != 'metal' & family != 'gemstones'")
      .sort_values(by = 'instance', ascending = False)
)

In [None]:
# column groups used below; names come from the autoscan helper object `pp`
col_numerical = pp.grid_cols + pp.probe_cols + ['l_max_peak']
# every second entry of pp.mech_cols, framed by the probe columns and 'e_star'
subset_cols = ['l_max_peak', 'perm'] + pp.mech_cols[::2] + ['e_star']
# second name bound to the very same list object
col_meassmall = subset_cols

In [None]:
# rows whose 'code' group appears under at least two of the listed experiments
ix = ix_before_and_after(
    ds,
    subset = ['before', 'heat_treatment', 'perf'],
    index = 'code',
)
df_bna = df.loc[ix]

In [None]:
# only rows that carry a permeability value take part in the selection
mask = df.perm.notna()
ix = ix_before_and_after(
    ds,
    subset = ['before', 'heat_treatment', 'perf'],
    mask = mask,
)
df_perm_bna = df.loc[ix, pp.meta_cols + pp.grid_cols + ['perm']]

In [None]:
# only rows where every second pp.mech_cols column is present
mask = df.loc[:, pp.mech_cols[::2]].notna().all(axis = 1)
ix = ix_before_and_after(
    ds,
    subset = ['before', 'heat_treatment'],
    mask = mask,
)
df_mech_bna = df.loc[ix, pp.meta_cols + pp.grid_cols + pp.mech_cols + pp.vel_cols]

In [None]:
# only rows where all pp.ftir_cols columns are present
mask = df.loc[:, pp.ftir_cols].notna().all(axis = 1)
ix = ix_before_and_after(
    ds,
    subset = ['before', 'heat_treatment', 'perf'],
    mask = mask,
)
df_ftir_bna = df.loc[ix, pp.meta_cols + pp.grid_cols + pp.ftir_cols + ['l_max_peak']]

# visualization
1. hip-plot (again) but with corrected data
2. distributions

In [None]:
# split the joined table by the value of its 'instance' column
df_before, df_after = (
    df.query("instance == 'before'"),
    df.query("instance == 'after'"),
)

## hiplot: hierarchical data relations

In [None]:
# show which columns the hiplot cells below will use
print('subset cols for hip: %s' % (subset_cols,))

### ftir, perm, mech, e_star

In [None]:
# 'before' rows with every subset column present, identified by sample code
s = hip_visualize(
    df_before.dropna(subset = subset_cols),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_before_ftirmaxloc-perm-and-mechx0-estar.html'));

In [None]:
# 'after' rows with every subset column present, identified by sample code
s = hip_visualize(
    df_after.dropna(subset = subset_cols),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_after_ftirmaxloc-perm-and-mechx0-estar.html'));

### ftir, perm, and mech

In [None]:
# rows only need all but the last subset column; every subset column is plotted
s = hip_visualize(
    df_before.dropna(subset = subset_cols[:-1]),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_before_ftirmaxloc-perm-and-mechx0.html'));

In [None]:
# rows only need all but the last subset column; every subset column is plotted
s = hip_visualize(
    df_after.dropna(subset = subset_cols[:-1]),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_after_ftirmaxloc-perm-and-mechx0.html'));

### perm and mech

In [None]:
# rows need the subset columns except the first and the last one
s = hip_visualize(
    df_before.dropna(subset = subset_cols[1:-1]),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_before_permvel-mechx0.html'));

In [None]:
# rows need the subset columns except the first and the last one
s = hip_visualize(
    df_after.dropna(subset = subset_cols[1:-1]),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_after_permvel-mechx0.html'));

### ftir and mech

In [None]:
# required columns: the first subset column plus everything from the third on
# (`sc` is reused by the next cell)
sc = subset_cols[:1] + subset_cols[2:]
s = hip_visualize(
    df_before.dropna(subset = sc),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_before_ftirmaxloc-mechx0.html'));

In [None]:
# same required columns (`sc`) as the 'before' cell just above
s = hip_visualize(
    df_after.dropna(subset = sc),
    index = ['code'],
    pcols = subset_cols,
)

s.to_html(os.path.join(figspath, 'hip_after_ftirmaxloc-mechx0.html'));

In [None]:
# grouping keys followed by the probe column(s)
# NOTE: this rebinds subset_cols; the hiplot cells above rely on the earlier list
groupby_cols = ['code', 'instance']
probes = ['perm']
subset_cols = [*groupby_cols, *probes]

In [None]:
# # df_bna.loc[:, subset_cols].plot(kind = 'kde')
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.kdeplot(x = 'perm',  hue = 'code', data = df_bna, clip = [0, 500], vertical = True,
#             palette = 'viridis', shade = 'fill', ax = ax)
# sns.set_style('darkgrid')
# plt.title('permeability before');

In [None]:
# ix_perm = df_before.perm.isna() == False
# df_perm_before = df_before.loc[ix_perm, ['family', 'code', 'perm']]

## distributions

### permeability

In [None]:
# temporary permeability table with an extra base-10 logarithm column
df_temp = (
    df_rocks.loc[:, ['family', 'instance', 'perm']]
            .assign(log_perm = lambda frame: frame.perm.apply(np.log10))
)

#### scatter

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# one strip per family, with the 'instance' groups drawn side by side
sns.stripplot(
    data = df_rocks,
    x = 'family',
    y = 'perm',
    hue = 'instance',
    dodge = True,
    palette = 'viridis',
    ax = ax,
)
plt.yscale('log')
ax = set_spe_style(ax, ylabel = 'Permeability (mD)', xlabel = '', title = 'Permeability before & after')

#### boxplot

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# no ax is passed, so seaborn draws on the current axes (the one just created)
sns.boxplot(
    data = df_temp,
    x = 'family',
    y = 'perm',
    hue = 'instance',
    dodge = True,
    width = 0.5,
    palette = 'pastel',
)
plt.yscale('log')
ax = set_spe_style(ax, ylabel = 'Permeability (mD)', xlabel = '', title = 'Permeability before & after')

#### violin

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of log10(perm) per family; drawn explicitly on `ax` instead of
# relying on the implicit current axes
sns.violinplot(x = 'family', y = 'log_perm', hue = 'instance', split = True, data = df_temp, scale = 'width', palette = 'pastel', bw = 'scott',
               inner = 'quartile', ax = ax)
ax = set_spe_style(ax, title = 'Permeability distribution before & after', xlabel = '', ylabel = 'Log$_{10}$ of Permeability (mD)')
# plt.yscale('log')

### vels

In [None]:
# df_temp = df_rocks.loc[:, ['family', 'instance'] +  ]
# df_temp.loc[:, 'log_perm'] = df_temp.perm.apply(np.log10)
# matplotlib marker code per rock family, shared by the pair plots below
markers = {
    "shale": "o",
    "sandstone": "v",
    "carbonate": "s",
}

#### pairplot: vels 

In [None]:
# lower-triangle pair plot of the velocity columns for the 'before' rows,
# with density contours overlaid on the scatter panels
g = sns.pairplot(
    df_rocks.query("instance == 'before'").loc[:, ['family', 'code'] + pp.vel_cols],
    hue = "family",
    diag_kind = 'kde',
    corner = True,
    markers = markers,
    palette = 'viridis',
    dropna = True,
    height = 3,
)
g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# https://catherineh.github.io/programming/2016/05/24/seaborn-pairgrid-tips

# axis-label text -> replacement text
# NOTE(review): these keys look like leftovers from an iris example and
# probably match none of the plotted columns — confirm or replace.
replacements = {'sepal_length': r'$\alpha$', 'sepal_width': 'sepal',
                'petal_length': r'$\beta$', 'petal_width': 'petal',
                'versicolor': 'bloop'}

# The pair grid above is created with corner = True, which leaves None in the
# upper triangle of g.axes; walk every entry and skip the empty ones instead
# of assuming a full 4x4 grid.
for grid_ax in g.axes.flat:
    if grid_ax is None:
        continue
    xlabel = grid_ax.get_xlabel()
    ylabel = grid_ax.get_ylabel()
    if xlabel in replacements:
        grid_ax.set_xlabel(replacements[xlabel])
    if ylabel in replacements:
        grid_ax.set_ylabel(replacements[ylabel])

In [None]:
# lower-triangle pair plot of the velocity columns for the 'after' rows,
# with density contours overlaid on the scatter panels
g = sns.pairplot(
    df_rocks.query("instance == 'after'").loc[:, ['family', 'code'] + pp.vel_cols],
    hue = "family",
    diag_kind = 'kde',
    corner = True,
    markers = markers,
    palette = 'viridis',
    dropna = True,
    height = 3,
)
g.map_lower(sns.kdeplot, levels=4, color=".2")

#### scatter: velocities

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.scatterplot(x = 'vp0', y = 'vs0', hue = 'code', style = 'family', data = df_rocks.query("instance == 'before'"), 
#                 ax = ax, palette = 'deep', markers = markers, legend = 'full')
# plt.xscale('log')
# plt.yscale('log')

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# vp0 against vs0 for the 'before' rows, coloured by family
g = sns.jointplot(
    data = df_rocks.query("instance == 'before'"),
    x = 'vp0',
    y = 'vs0',
    hue = 'family',
    palette = 'deep',
    height = 15,
    space = 0.1,
)
# NOTE(review): 'code' is passed as a colour here but looks like a column
# name — confirm what was intended.
g.plot_joint(sns.kdeplot, color = 'code', zorder=0, levels=6)
g.set_axis_labels('vp$_0$ (m/s)', 'vs$_0$ (m/s)', fontsize = 12, fontweight = 'bold')
g.plot_marginals(sns.rugplot, height = .05, clip_on = False)

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# vp0 against vs0 for the 'after' rows, coloured by family
g = sns.jointplot(
    data = df_rocks.query("instance == 'after'"),
    x = 'vp0',
    y = 'vs0',
    hue = 'family',
    palette = 'deep',
    height = 15,
    space = 0.1,
)
# g.plot_joint(sns.kdeplot, color = 'code', zorder=0, levels=6)
g.set_axis_labels('vp$_0$ (m/s)', 'vs$_0$ (m/s)', fontsize = 12, fontweight = 'bold')
g.plot_marginals(sns.rugplot, height = .05, clip_on = False)

#### violin: velocities

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of vp0 per family; drawn on the current axes (the one just created)
sns.violinplot(
    data = df_rocks,
    x = 'family',
    y = 'vp0',
    hue = 'instance',
    split = True,
    scale = 'width',
    palette = 'pastel',
    bw = 'scott',
    inner = 'quartile',
)
ax = set_spe_style(ax, ylabel = 'v$_p$(0) (m/s)', xlabel = '', title = 'P-wave velocity, $\\theta = 0$, before & after')

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of vs0 per family; drawn on the current axes (the one just created)
sns.violinplot(
    data = df_rocks,
    x = 'family',
    y = 'vs0',
    hue = 'instance',
    split = True,
    scale = 'width',
    palette = 'pastel',
    bw = 'scott',
    inner = 'quartile',
)
ax = set_spe_style(ax, ylabel = 'v$_s$(0) (m/s)', xlabel = '', title = 'S-wave velocity, $\\theta = 0$, before & after')

### mech

In [None]:
# every second entry of pp.mech_cols, starting with the first; used by the
# pair plots of this section
mech_cols = pp.mech_cols[::2]

#### pairplot

In [None]:
# lower-triangle pair plot of the selected mech columns for the 'before' rows
g = sns.pairplot(
    df_rocks.query("instance == 'before'").loc[:, ['family', 'code'] + mech_cols],
    hue = "family",
    diag_kind = 'kde',
    corner = True,
    markers = markers,
    palette = 'viridis',
    dropna = True,
    height = 3,
)
# g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# lower-triangle pair plot of the selected mech columns for the 'after' rows
g = sns.pairplot(
    df_rocks.query("instance == 'after'").loc[:, ['family', 'code'] + mech_cols],
    hue = "family",
    diag_kind = 'kde',
    corner = True,
    markers = markers,
    palette = 'viridis',
    dropna = True,
    height = 3,
)
# g.map_lower(sns.kdeplot, levels=4, color=".2")

#### scatter

In [None]:
# mech_e0 against mech_k0 for the 'before' rows, coloured by family
g = sns.jointplot(
    data = df_rocks.query("instance == 'before'"),
    x = 'mech_e0',
    y = 'mech_k0',
    hue = 'family',
    palette = 'deep',
    height = 15,
    space = 0.1,
)
# g.plot_joint(sns.kdeplot, color = 'code', zorder=0, levels=6)
g.set_axis_labels('E (GPa)', 'K (GPa)', fontsize = 12, fontweight = 'bold')
g.plot_marginals(sns.rugplot, height = .05, clip_on = False)

In [None]:
# mech_e0 against mech_k0 for the 'after' rows, coloured by family
g = sns.jointplot(
    data = df_rocks.query("instance == 'after'"),
    x = 'mech_e0',
    y = 'mech_k0',
    hue = 'family',
    palette = 'deep',
    height = 15,
    space = 0.1,
)
# g.plot_joint(sns.kdeplot, color = 'code', zorder=0, levels=6)
g.set_axis_labels('E (GPa)', 'K (GPa)', fontsize = 12, fontweight = 'bold')
g.plot_marginals(sns.rugplot, height = .05, clip_on = False)

#### violin

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of mech_e0 per family; drawn on the current axes
sns.violinplot(
    data = df_rocks,
    x = 'family',
    y = 'mech_e0',
    hue = 'instance',
    split = True,
    scale = 'width',
    palette = 'pastel',
    bw = 'scott',
    inner = 'quartile',
)
ax = set_spe_style(ax, ylabel = 'E(0) (GPa)', xlabel = '', title = 'Young\'s modulus, $\\theta = 0$, before & after')

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of mech_k0 per family; drawn on the current axes
sns.violinplot(
    data = df_rocks,
    x = 'family',
    y = 'mech_k0',
    hue = 'instance',
    split = True,
    scale = 'width',
    palette = 'pastel',
    bw = 'scott',
    inner = 'quartile',
)
ax = set_spe_style(ax, ylabel = '$\\kappa$(0) (GPa)', xlabel = '', title = 'Bulk modulus, $\\theta = 0$, before & after')

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# split violins of mech_n0 per family; drawn on the current axes
sns.violinplot(
    data = df_rocks,
    x = 'family',
    y = 'mech_n0',
    hue = 'instance',
    split = True,
    scale = 'width',
    palette = 'pastel',
    bw = 'scott',
    inner = 'quartile',
)
ax = set_spe_style(ax, ylabel = '$\\nu$(0)', xlabel = '', title = 'Poisson ratio, $\\theta = 0$, before & after')

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.stripplot(y = 'perm', x = 'family', hue = 'instance', dodge = True, data = df_perm_bna, palette = 'viridis', ax = ax)
# plt.yscale('log')
# sns.set_style('darkgrid')
# plt.title('permeability before & after');
# plt.xlabel('')
# plt.ylabel('Permeability (mD)')

In [None]:
# fig, ax = plt.subplots(figsize = (12, 12))
# sns.boxplot(x = 'family', y = 'perm', hue = 'instance', data = df_perm_bna, dodge = True, width = 0.5)
# plt.yscale('log')

In [None]:
# No active cell defines df_perm_before (its only definition sits in a
# commented-out cell), so this cell failed on a fresh kernel. Build it here:
# 'before' rows that have a permeability value, reduced to the plotted columns.
df_perm_before = df_before.loc[df_before.perm.notna(), ['family', 'code', 'perm']]

# clipped copy: permeability limited to the range [0, 500]
df_perm_before_clipped = df_perm_before.copy()
df_perm_before_clipped.loc[:, 'perm'] = df_perm_before_clipped.perm.clip(lower = 0, upper = 500)

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# clipped 'before' permeability per sample code, coloured by family
sns.violinplot(
    data = df_perm_before_clipped,
    x = 'code',
    y = 'perm',
    hue = 'family',
    palette = 'viridis',
    ax = ax,
)
sns.set_style('darkgrid')
plt.title('permeability before');

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# clipped 'before' permeability per sample code, coloured by family
sns.boxplot(
    data = df_perm_before_clipped,
    x = 'code',
    y = 'perm',
    hue = 'family',
    palette = 'viridis',
    ax = ax,
)
sns.set_style('darkgrid')
plt.title('permeability before');

In [None]:
fig, ax = plt.subplots(figsize = (12, 12))
# filled density curves of the clipped 'before' permeability, one per sample code;
# `fill = True` replaces the deprecated `shade` argument (which was given the
# truthy string 'fill')
sns.kdeplot(x = 'perm',  hue = 'code', data = df_perm_before_clipped,
            palette = 'viridis', fill = True, ax = ax)
sns.set_style('darkgrid')
plt.title('permeability before');

In [None]:
# split each tag at '_' into separate columns (0, 1, ...)
tags = ds['tag'].str.split('_', expand = True)
# column 1 is overwritten with a constant 0.0
tags[1] = 0.0
# distinct values of the first tag part
unique_tags = pd.unique(tags[0])
# tags.set_index([0, tags.index], inplace = True)