In [None]:
# preprocessing with dask
import os, sys, re, io, pathlib
import pandas as pd
import hiplot as hip
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools

# module-level text buffer and a shorthand for pandas index slicing
buffer = io.StringIO()
idx = pd.IndexSlice

# make the grandparent of the working directory importable
# (the notebooks are run from inside lab_utils)
labutilspath = str(pathlib.Path.cwd().parents[1])
sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

# helper object; the cells below read its `probe_settings` and call `get_material_density`
pp = autoscan.basics(material_info = True)

def pprint(msg, msg_title = '', msg_decorator = '#', len_decorator = 40):
    """Print `msg` framed by a decorated title line and a footer line.

    Parameters
    ----------
    msg : str
        Text printed between the header and the footer.
    msg_title : str
        Title placed in the middle of the header line.
    msg_decorator : str
        Character repeated to draw both the header and the footer.
    len_decorator : int
        Target width of the header and footer. When the title does not fit,
        the frame grows to the title plus one space and one decorator on
        each side.

    Returns
    -------
    None
    """
    # columns left for decorators once the title and its two spaces are placed
    nhead = len_decorator - len(msg_title) - 2
    if nhead < 2:
        # title too long for the requested width: keep one decorator per side
        nleft = nright = 1
    else:
        # an odd remainder goes to the right so the header width is exact
        nleft = nhead // 2
        nright = nhead - nleft

    header = msg_decorator * nleft + ' ' + msg_title + ' ' + msg_decorator * nright
    # same decorator and same number of columns as the header
    footer = msg_decorator * (nleft + nright + len(msg_title) + 2)
    print(header, msg, footer + '\n', sep = '\n')
    return

def dfinfo(df, header = 'info'):
    """Print the `df.info()` summary of a dataframe inside a titled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    header : str
        Title handed to `pprint` for the frame.
    """
    # df.info() writes to a stream, so collect its text first
    sink = io.StringIO()
    try:
        df.info(buf = sink)
        pprint(sink.getvalue(), msg_title = header)
    finally:
        sink.close()

In [None]:
# define important columns (categorical and numerical)
# the first two entries of every probe's 'col' list are skipped
ftir_cols = pp.probe_settings['ftir']['col'][2:]
ftir_lambdas = pp.probe_settings['ftir']['lambdas']
tips_cols = list(itertools.chain.from_iterable(
    settings['col'][2:] for settings in pp.probe_settings.values()
))

col_categorical = ['family', 'code', 'tag', 'subtag', 'instance', 'experiment', 'side', 'm']
col_numerical = ['x', 'y'] + tips_cols + ['l_max_peak', 'l_min_peak']

In [None]:
# load the data; results are written back next to the input file
datapath = '/sandbox/data/autoscan/'
savepath = datapath
datafile = os.path.join(datapath, 'autoscan.h5')

# measurements live under 'data', their description under 'description'
df, df_description = (
    pd.read_hdf(datafile, key = key) for key in ('data', 'description')
)

In [None]:
# fix some tags: rows whose tag contains 'eur' get family 'shale' and code 'sh'
eur_rows = df.tag.str.contains('eur')
df.loc[eur_rows, 'family'] = 'shale'
df.loc[eur_rows, 'code'] = 'sh'

In [None]:
# density lookup table: the base tag is the part of `tag` before the first '_'
ds = (
    df.loc[:, ['tag']]
    .assign(rho = 0.0)
    .assign(basetag = lambda frame: frame.tag.str.split('_', expand = True, n = 1)[0].values)
)

# query the density once per base tag and write it to every matching row
for t in ds.basetag.unique():
    ds['rho'] = ds['rho'].mask(ds['basetag'] == t, pp.get_material_density(t))

ds.index.name = 'ix'
# ds = ds.set_index(['code', ds.index])

In [None]:
def idx_peak_to_lambda(x):
    """Translate an ftir column label into the matching entry of `ftir_lambdas`.

    Parameters
    ----------
    x : str or any
        Column label of the form '<prefix>_<n>', where <n> is the 1-based
        position in `ftir_lambdas`. Any non-string value (nan, None, ...)
        is treated as a missing peak.

    Returns
    -------
    The entry of `ftir_lambdas` at position n - 1 (presumably a wavelength),
    or np.nan when `x` is not a string.
    """
    # comparing against np.nan is always True, so the type test is the only
    # effective guard against missing labels
    if not isinstance(x, str):
        return np.nan
    return ftir_lambdas[int(x.split('_')[1]) - 1]

def ftir_row_stats(df):
    """Return a copy of `df` with the per-row mean and standard deviation of the ftir columns.

    The statistics are taken over `ftir_cols` and stored as `l_mean` and `l_std`.
    """
    spectra = df.loc[:, ftir_cols]
    # l_median = lambda df: np.median(df.iloc[:, 2:1754], axis = 1)
    return df.assign(
        l_mean = spectra.mean(axis = 1),
        l_std = spectra.std(axis = 1),
    )

def ftir_extreme_locations(df):
    """Return a copy of `df` with the location of the largest and smallest ftir value per row.

    For every row the label of the ftir column holding the maximum (minimum)
    is converted with `idx_peak_to_lambda` and stored as `l_max_peak`
    (`l_min_peak`).
    """
    spectra = df.loc[:, ftir_cols]
    return df.assign(
        l_max_peak = spectra.idxmax(axis = 1).apply(idx_peak_to_lambda),
        l_min_peak = spectra.idxmin(axis = 1).apply(idx_peak_to_lambda),
    )
    
def clean_dataframe(df):
    """Replace negative entries with nan and cast the frame to float32.

    Entries that are not `>= 0` (negative values and nan) become nan.
    """
    non_negative = df >= 0
    # `where` fills with nan wherever the condition is False
    kept = df.where(non_negative)
    return kept.astype(np.float32)

def enforce_limits(df):
    """Set measurements outside each probe's limits to nan.

    For every probe in `pp.probe_settings`, the measurement columns
    (`'col'` without its first two entries) are compared against the
    inclusive `'limits'` pair. `df` is modified in place and also returned.
    """
    for settings in pp.probe_settings.values():
        cols = settings['col'][2:]
        lower, upper = settings['limits']
        values = df[cols]
        within = values.ge(lower) & values.le(upper)
        # nan also fails both comparisons and therefore stays nan
        df[cols] = values.where(within)
    return df

def hip_visualize(df, pcols = None, index = ['family', 'code']):
    """Display a hiplot experiment built from selected columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; its index is reset before the columns are selected.
    pcols : str, sequence of str or None
        Columns to plot. None plots the `index` columns only.
    index : str or sequence of str
        Identifying columns placed before `pcols`.

    Returns
    -------
    The hiplot experiment, after `display()` has been called on it.
    """
    def _as_list(cols):
        # accept None, a single column name, or any sequence of names
        if cols is None:
            return []
        if isinstance(cols, str):
            return [cols]
        return list(cols)

    # pcols=None used to put a None label into the selection and raise KeyError
    selected = _as_list(index) + _as_list(pcols)
    dp = df.reset_index().loc[:, selected]
    s = hip.Experiment.from_dataframe(dp)
    s.colormap = 'interpolateViridis'
    s.display()
    return s

In [None]:
# summary of the first eight measurement columns before the limits are enforced
df[tips_cols[:8]].describe()

In [None]:
# enforce limits, set non-physical values to nan
df = df.pipe(enforce_limits)

In [None]:
# same summary after the limits were enforced
df[tips_cols[:8]].describe()

In [None]:
# calculate the mechanical properties from the velocities and the density
vcols = ['vp0', 'vs0', 'vp90', 'vs90']
vels  = df.loc[:, vcols].values
vels2 = np.power(vels, 2)
# column vector so it broadcasts over the (0, 90) column pairs
rho = ds.loc[:, 'rho'].values.reshape(ds.shape[0], 1)

# squared velocities, one column per direction (0 and 90)
vp2 = vels2[:, 0::2]
vs2 = vels2[:, 1::2]

# NOTE(review): only mech_e* and mech_k* are divided by 1e6; the other moduli
# (mech_l*, mech_m*, mech_g*) are not — confirm the intended units.
E = rho * (vs2 * (3.0 * vp2 - 4.0 * vs2))
df.loc[:, ['mech_e0', 'mech_e90']] = E / (vp2 - vs2) / 1e6                   # rho vs^2 (3 vp^2 - 4 vs^2) / (vp^2 - vs^2)
df.loc[:, ['mech_l0', 'mech_l90']] = rho * (vp2 - 2.0 * vs2)                 # rho (vp^2 - 2 vs^2)
df.loc[:, ['mech_k0', 'mech_k90']] = rho * (vp2 - (4 / 3) * vs2) / 1e6       # rho (vp^2 - 4/3 vs^2)
df.loc[:, ['mech_n0', 'mech_n90']] = (vp2 - 2.0 * vs2) / (2. * (vp2 - vs2))  # (vp^2 - 2 vs^2) / (2 (vp^2 - vs^2))
df.loc[:, ['mech_i0', 'mech_i90']] = rho * vels[:, 0::2]                     # rho vp
df.loc[:, ['mech_m0', 'mech_m90']] = rho * vp2                               # rho vp^2
df.loc[:, ['mech_g0', 'mech_g90']] = rho * vs2                               # rho vs^2
df.loc[:, 'rho'] = rho

# remove the data we don't need
del E, rho, vels2, vp2, vs2

In [None]:
# summary of every column whose name contains 'mech'
mechs = df.columns[df.columns.str.contains('mech')]
df[mechs].describe()

In [None]:
# check which values do not make sense: rows whose poisson ratio (mech_n*) is
# outside the accepted window lose all mechanical properties of that direction,
# and their sample keys are exported together with the 'vels' description
# (presumably so the velocity measurements can be repeated).
# NOTE(review): the physical range quoted here was -1 <= pr <= 0.5, but the
# test below accepts values up to 1.0 — confirm which bound is intended.
for i in ['0', '90']:
    ix = (df.loc[:, 'mech_n' + i] >= -1) & (df.loc[:, 'mech_n' + i] <= 1.0)
    temp_cols = df.columns[df.columns.str.contains(r'mech[_][a-z]' + i)]
    # nan ratios fail both comparisons and are flagged as well
    df.loc[~ix, temp_cols] = np.nan
    (
        df.loc[~ix, col_categorical[:-1]]
        .drop_duplicates()
        .merge(df_description.loc[df_description.probe == 'vels'], on = col_categorical[:-1])
        .to_csv(os.path.join(savepath, 'repeat_vels_' + i + '.csv'))
    )

In [None]:
# find the peaks of the ftir (adds l_max_peak and l_min_peak)
df = df.pipe(ftir_extreme_locations)

# data cleaning
1. pre-clean the dataset
 - remove duplicated rows
 - enforce correct dtypes 
 - reduce memory overhead
 - do not remove missing values

In [None]:
# for the record print the information of the original dataframe
dfinfo(df, header = 'raw data')

In [None]:
# import klib
# # pre-clean, do not remove missing values
# df = klib.data_cleaning(df, drop_threshold_rows = 1.0, clean_col_names = False)
# df.loc[:, col_categorical] = df.loc[:, col_categorical].astype('object')

In [None]:
# downcast to float32 to save some memory. this is likely useless in the current context. 
# df = df.apply(pd.to_numeric, downcast = 'float', errors = 'ignore')
# df.loc[:, col_categorical] = df.loc[:, col_categorical].astype('category')
# print the information of the cleaned dataframe
# dfinfo(df, 'raw data cleaned')

## fix values and correct information
1. set nan to measurements where all values are the same (ftir)
2. set the correct family and code for eur samples (already applied in the "fix some tags" cell near the top of the notebook)

In [None]:
# rows whose ftir columns hold a single distinct value are set to nan
ix = (
    df.loc[:, ftir_cols]
    .apply(lambda row: len(np.unique(row)), axis = 1)
    .eq(1)
)
df.loc[ix, ftir_cols] = np.nan

In [None]:
# numerical columns start a fresh file (mode 'w'); the categorical ones are appended to it
corrected_file = os.path.join(savepath, 'autoscan_corrected.h5')
df.loc[:, col_numerical].to_hdf(corrected_file, key = 'data', format = 'table', mode = 'w')
df.loc[:, col_categorical].to_hdf(corrected_file, key = 'desc', mode = 'a')

In [None]:
# unique sample keys of the rows whose ftir measurement was blanked above
repeat_ftir = (
    df.loc[ix, :]
    .set_index(col_categorical[:-1])
    .index
    .unique()
)
(
    pd.DataFrame
    .from_records(repeat_ftir.to_numpy(), columns = col_categorical[:-1])
    .to_csv(os.path.join(savepath, 'ftir_repeat.csv'))
)

# visualization
1. hip-plot (again) but with corrected data
2. distributions

## hip
### without `e_star`

In [None]:
# mech_cols = df.columns[df.columns.str.contains(r'mech[_][a-z]0')].values
# only the e, k and n properties of the 0 direction are plotted
mech_cols = ['mech_{}0'.format(letter) for letter in ('e', 'k', 'n')]
subset_cols = ['l_max_peak', 'perm'] + list(mech_cols)
print(subset_cols)

# 'before' samples that have every plotted column
s = hip_visualize(
    df.query("instance == 'before'").dropna(subset = subset_cols),
    pcols = subset_cols,
    index = ['code'],
)

s.to_html(os.path.join(savepath, 'hip_before_woestar.html'));

In [None]:
# 'after' samples, still without e_star
s = hip_visualize(df.query("instance == 'after'").dropna(subset = subset_cols), 
                  pcols = subset_cols, 
                  index = ['code'])

# file name matches the content: 'after' instance, without e_star
s.to_html(os.path.join(savepath, 'hip_after_woestar.html'));

### with `e_star`
the number of samples with impulse hammer measurements is one quarter of that in the previous section

In [None]:
# add e_star exactly once, so re-running this cell does not append it again
subset_cols = [c for c in subset_cols if c != 'e_star'] + ['e_star']
s = hip_visualize(df.query("instance == 'before'").dropna(subset = subset_cols), 
                  pcols = subset_cols, 
                  index = ['code'])

# file name matches the content: 'before' instance, with e_star
s.to_html(os.path.join(savepath, 'hip_before_westar.html'));

In [None]:
# 'after' samples that have every plotted column, e_star included
s = hip_visualize(
    df.query("instance == 'after'").dropna(subset = subset_cols),
    pcols = subset_cols,
    index = ['code'],
)

s.to_html(os.path.join(savepath, 'hip_after_westar.html'));

In [None]:
# keep perm and the mechanical columns only; dropping the two columns by name
# (instead of slicing [1:-1]) gives the same list however often the cell is run
subset_cols = [c for c in subset_cols if c not in ('l_max_peak', 'e_star')]
s = hip_visualize(df.query("instance == 'before'").dropna(subset = subset_cols), 
                  pcols = subset_cols, 
                  index = ['code'])

s.to_html(os.path.join(savepath, 'hip_before_permvel.html'));

In [None]:
# same columns for the 'after' samples
s = hip_visualize(
    df.query("instance == 'after'").dropna(subset = subset_cols),
    pcols = subset_cols,
    index = ['code'],
)

s.to_html(os.path.join(savepath, 'hip_after_permvel.html'));

In [None]:
# rows measured at the 'before' instance
df_before = df.loc[df['instance'] == 'before']

In [None]:
# 'before' rows that have a permeability value
ix_perm = df_before.perm.notna()
df_perm_before = df_before.loc[ix_perm, ['family', 'code', 'perm']]

In [None]:
# permeability per family on a log axis, coloured by code
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(data = df_perm_before, x = 'family', y = 'perm', hue = 'code',
              palette = 'viridis', ax = ax)
ax.set_yscale('log')
sns.set_style('darkgrid')
ax.set_title('permeability before');

In [None]:
# permeability distribution per code, coloured by family
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(data = df_perm_before, x = 'code', y = 'perm', hue = 'family',
               palette = 'viridis', ax = ax)
sns.set_style('darkgrid')
ax.set_title('permeability before');

In [None]:
# copy of the permeability table with values limited to the range [0, 500]
df_perm_before_clipped = df_perm_before.assign(
    perm = lambda frame: frame.perm.clip(lower = 0, upper = 500)
)

In [None]:
# violin plot of the clipped permeability per code, coloured by family
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(data = df_perm_before_clipped, x = 'code', y = 'perm', hue = 'family',
               palette = 'viridis', ax = ax)
sns.set_style('darkgrid')
ax.set_title('permeability before');

In [None]:
# box plot of the clipped permeability per code, coloured by family
fig, ax = plt.subplots(figsize = (12, 12))
sns.boxplot(data = df_perm_before_clipped, x = 'code', y = 'perm', hue = 'family',
            palette = 'viridis', ax = ax)
sns.set_style('darkgrid')
ax.set_title('permeability before');

In [None]:
# density estimate of the clipped permeability, one filled curve per code
fig, ax = plt.subplots(figsize = (12, 12))
# `fill = True` replaces the deprecated `shade` argument, which was given the
# string 'fill' and only worked because any non-empty string is truthy
sns.kdeplot(x = 'perm',  hue = 'code', data = df_perm_before_clipped, 
            palette = 'viridis', fill = True, ax = ax)
sns.set_style('darkgrid')
plt.title('permeability before');

In [None]:
# split every tag on '_' into one column per piece
tags = ds['tag'].str.split('_', expand = True)#.apply(lambda x: pp.get_material_density(x))
# distinct first pieces, taken before column 1 is overwritten with 0.0
unique_tags = tags[0].unique()
tags[1] = 0.0
# tags.set_index([0, tags.index], inplace = True)