In [None]:
# preprocessing with dask
import os, sys, re, io, pathlib
import pandas as pd
import hiplot as hip
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools

# shorthand for label-based MultiIndex slicing, and an in-memory text buffer
idx = pd.IndexSlice
buffer = io.StringIO()

# make the directory two levels above the working directory importable
# (the notebooks live inside lab_utils)
labutilspath = str(pathlib.Path(os.getcwd()).parents[1])
if labutilspath not in sys.path:
    # guard so that re-running this cell does not stack duplicate entries
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

# `pp` provides the column-name lists (grid_cols, probe_cols, mech_cols) used in later cells
pp = autoscan.basics(material_info = True)

def hip_visualize(df, pcols = None, index = ['family', 'code']):
    """
    Show `df` as an interactive HiPlot experiment and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Its index is moved into regular columns with
        `reset_index()` before the columns are selected.
    pcols : str or list-like of column labels, optional
        Columns to plot next to `index`. When None, every column left after
        `reset_index()` that is not listed in `index` is used.
    index : str or list-like of column labels
        Identifier columns, placed first. The default list is only read,
        never modified.

    Returns
    -------
    The HiPlot experiment, after `display()` has been called on it.

    Raises
    ------
    KeyError
        If a requested label is missing after `reset_index()`.
    """
    def _as_labels(cols):
        # accept a single label as well as any list-like of labels
        return [cols] if isinstance(cols, str) else list(cols)

    flat = df.reset_index()
    id_cols = _as_labels(index)
    if pcols is None:
        # no explicit selection: plot everything that is not an identifier
        value_cols = [c for c in flat.columns if c not in id_cols]
    else:
        value_cols = _as_labels(pcols)

    # plain list concatenation keeps the labels' own types (no numpy coercion)
    dp = flat.loc[:, id_cols + value_cols]
    s = hip.Experiment.from_dataframe(dp)
    s.colormap = 'interpolateViridis'
    s.display()
    return s


In [None]:
# input data location, figure output root and data file name
datapath, vispath = '/sandbox/data/autoscan/', '/sandbox/vis/autoscan/'
datafname = 'autoscan_corrected.h5'
savepath = datapath

# full paths derived from the settings above
figspath = os.path.join(vispath, 'rock_multiphysics_display')
datafile = os.path.join(datapath, datafname)

In [None]:
# load the 'desc' and 'data' tables from the same HDF5 file
ds, dd = (pd.read_hdf(datafile, key = name) for name in ('desc', 'data'))
# attach the 'data' columns to the 'desc' rows (index-aligned join)
df = ds.join(dd)

In [None]:
# numerical columns of interest: the grid and probe column lists provided by `pp`,
# followed by the 'l_max_peak' and 'l_min_peak' columns
# NOTE(review): the original comment also mentioned categorical columns, but none are defined in this cell
col_numerical = pp.grid_cols + pp.probe_cols + ['l_max_peak', 'l_min_peak']

In [None]:
# keep the row labels in a regular column so that pivot_table can collect them
ds.loc[:, 'ix'] = ds.index.values

# one row per (tag, subtag), one column per experiment; each cell holds the list
# of row labels measured in that experiment (NaN when there are none)
dx = ds.pivot_table(index = ['tag', 'subtag'],
                    columns = 'experiment',
                    values = 'ix',
                    aggfunc = lambda x: [*x]).loc[:, ['before', 'heat_treatment', 'perf']]
# keep the samples present in at least two of the three experiments
# (re-assigned instead of `inplace = True`, which acts on a `.loc` selection)
dx = dx.dropna(thresh = 2)

# get the index of samples with before and after characterization:
# flatten the per-experiment lists row by row, skipping the NaN placeholders
ix = [label
      for _, row in dx.iterrows()
      for cell in row
      if isinstance(cell, list)
      for label in cell]

# separate the values in another dataframe
df_bna = df.loc[ix, :]

# visualization
1. hip-plot (again) but with corrected data
2. distributions

In [None]:
# columns used in the first plots: 'l_max_peak', 'perm' and every second entry of pp.mech_cols
# commented-out alternatives for selecting the mechanical columns:
# mech_cols = df.columns[df.columns.str.contains(r'mech[_][a-z]0')].values
# mech_cols = ['mech_' + s + '0' for s in ['e', 'k','n']]
subset_cols = ['l_max_peak', 'perm'] + pp.mech_cols[::2]
print(subset_cols)

## hip
### without `e_star`

In [None]:
# split the joined table by the value of its `instance` field
df_before = df.query("instance == 'before'")
df_after  = df.query("instance == 'after'")

In [None]:
# rows of df_before with no missing value in any of subset_cols
complete_rows = df_before.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_before_ftirmaxloc-perm-and-mechx0.html')
s.to_html(html_path);

In [None]:
# rows of df_after with no missing value in any of subset_cols
complete_rows = df_after.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_after_ftirmaxloc-perm-and-mechx0.html')
s.to_html(html_path);

### with `e_star`
The number of samples with impulse hammer measurements is one quarter of the previous sample count.

In [None]:
# rebuild the list from scratch instead of appending in place, so that
# re-running this cell cannot add 'e_star' a second time
subset_cols = ['l_max_peak', 'perm'] + pp.mech_cols[::2] + ['e_star']

In [None]:
# rows of df_before with no missing value in any of subset_cols
complete_rows = df_before.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_before_ftirmaxloc-perm-and-mechx0-estar.html')
s.to_html(html_path);

In [None]:
# rows of df_after with no missing value in any of subset_cols
complete_rows = df_after.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_after_ftirmaxloc-perm-and-mechx0-estar.html')
s.to_html(html_path);

In [None]:
# 'perm' plus every second mechanical column: the same list that slicing off
# the first and last entries gave on a top-to-bottom run, but stable when
# this cell is re-run
subset_cols = ['perm'] + pp.mech_cols[::2]

In [None]:
# rows of df_before with no missing value in any of subset_cols
complete_rows = df_before.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_before_permvel-mechx0.html')
s.to_html(html_path);

In [None]:
# rows of df_after with no missing value in any of subset_cols
complete_rows = df_after.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_after_permvel-mechx0.html')
s.to_html(html_path);

In [None]:
# 'l_max_peak' plus every second mechanical column: the same list that swapping
# out the first entry gave on a top-to-bottom run, but stable when this cell
# is re-run
subset_cols = ['l_max_peak'] + pp.mech_cols[::2]

In [None]:
# rows of df_before with no missing value in any of subset_cols
complete_rows = df_before.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_before_ftirmaxloc-mechx0.html')
s.to_html(html_path);

In [None]:
# rows of df_after with no missing value in any of subset_cols
complete_rows = df_after.dropna(subset = subset_cols)
s = hip_visualize(complete_rows, pcols = subset_cols, index = ['code'])

# export the plot as HTML under figspath
html_path = os.path.join(figspath, 'hip_after_ftirmaxloc-mechx0.html')
s.to_html(html_path);

In [None]:
# grouping keys first, then the probe column(s) to compare
probes = ['perm']
groupby_cols = ['code', 'instance']
subset_cols = [*groupby_cols, *probes]

In [None]:
# display the rows selected into df_bna
# (in-place clipping of `perm` to [0, 500], currently disabled:)
# df_bna.perm.clip(lower = 0, upper = 500, inplace = True)
df_bna

In [None]:
# two side-by-side axes, handed to the grouped boxplot
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (24, 12))
# drop incomplete rows, group by 'instance', then draw 'perm' boxes split by 'code'
perm_by_instance = df.loc[:, subset_cols].dropna().groupby('instance')
perm_by_instance.boxplot(column = 'perm', by = 'code', ax = ax);

In [None]:
# set the style first: seaborn styles only apply to axes created afterwards
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# `y =` replaces the deprecated `vertical = True`, and `fill = True` replaces
# the deprecated `shade` argument; the density is still drawn along the y axis
sns.kdeplot(y = 'perm', hue = 'code', data = df_bna, clip = [0, 500],
            palette = 'viridis', fill = True, ax = ax)
# NOTE(review): df_bna is described as holding before-and-after samples, while
# the title says 'before' — confirm which is intended
ax.set_title('permeability before');

In [None]:
# boolean mask of the 'before' rows that have a permeability value
ix_perm = df_before.perm.notna()
# keep only the identifying columns and the permeability itself
df_perm_before = df_before.loc[ix_perm, ['family', 'code', 'perm']]

In [None]:
# set the style first: seaborn styles only apply to axes created afterwards
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(y = 'perm', x = 'family', hue = 'code', data = df_perm_before, palette = 'viridis', ax = ax)
# logarithmic permeability axis, set on the Axes explicitly
ax.set_yscale('log')
ax.set_title('permeability before');

In [None]:
# new frame with `perm` limited to the range [0, 500]; df_perm_before is left untouched
df_perm_before_clipped = df_perm_before.assign(
    perm = df_perm_before.perm.clip(lower = 0, upper = 500)
)

In [None]:
# set the style first: seaborn styles only apply to axes created afterwards
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# clipped permeability per 'code', coloured by 'family'
sns.violinplot(y = 'perm', x = 'code', hue = 'family', data = df_perm_before_clipped, palette = 'viridis', ax = ax)
ax.set_title('permeability before');

In [None]:
# set the style first: seaborn styles only apply to axes created afterwards
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# clipped permeability per 'code', coloured by 'family'
sns.boxplot(y = 'perm', x = 'code', hue = 'family', data = df_perm_before_clipped, palette = 'viridis', ax = ax)
ax.set_title('permeability before');

In [None]:
# set the style first: seaborn styles only apply to axes created afterwards
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# `fill = True` replaces the deprecated `shade` argument (any truthy value enabled filling)
sns.kdeplot(x = 'perm', hue = 'code', data = df_perm_before_clipped,
            palette = 'viridis', fill = True, ax = ax)
ax.set_title('permeability before');

In [None]:
# split each `tag` on '_' into separate columns, one per piece
tags = ds.tag.str.split('_', expand = True)#.apply(lambda x: pp.get_material_density(x))
# overwrite column 1 with a constant 0.0
# NOTE(review): presumably a placeholder for a value filled in later — confirm
tags[1] = 0.0
# distinct values of the first piece of the tag
unique_tags = tags[0].unique()
# tags.set_index([0, tags.index], inplace = True)