In [None]:
# preprocessing with dask
import os, sys, re, io, pathlib
import pandas as pd
import hiplot as hip
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
# packages needed to use dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import multiprocessing.popen_spawn_posix
from distributed import Client, LocalCluster

# optional dask client with an explicit memory_limit of 4e9 bytes (~4 GB);
# currently disabled
# client = Client(n_workers=4, threads_per_worker=1, memory_limit=4e9)

buffer = io.StringIO()
mix = pd.IndexSlice

# define the current path (notebooks in lab_utils):
# the directory two levels above the working directory is put on sys.path
labutilspath = str(pathlib.Path(os.getcwd()).parents[1])
# guard the append so that re-running this cell does not pile up duplicates
if labutilspath not in sys.path:
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

pp = autoscan.basics()

# column names per probe; the first two entries of every 'col' list are skipped
# (presumably the x/y coordinates -- TODO confirm against autoscan.basics)
ftir_cols = pp.probe_settings['ftir']['col'][2:]
tips_cols = list(itertools.chain(*[p['col'][2:] for _, p in pp.probe_settings.items()]))
# lookup table indexed by (1-based FTIR column number - 1), see idx_peak_to_lambda
ftir_lambdas = pp.probe_settings['ftir']['lambdas']

def pprint(msg, msg_title = '', msg_decorator = '#', len_decorator = 40):
    """Print `msg` framed by a decorated title line and a matching footer.

    Parameters
    ----------
    msg : str
        Body text printed between header and footer.
    msg_title : str
        Title placed in the middle of the header line.
    msg_decorator : str
        Character (or string) used to draw the header AND the footer.
    len_decorator : int
        Target width of the header line. When the title does not fit, one
        decorator is still placed on each side of it.

    Returns
    -------
    None
    """
    # decorators on each side of the title; never fewer than one, so that an
    # over-long title is still framed
    nside = max((len_decorator - len(msg_title) - 2) // 2, 1)
    side = msg_decorator * nside
    header = side + ' ' + msg_title + ' ' + side
    # the footer uses the caller's decorator and is cut to the header width,
    # so both lines always line up
    footer = (msg_decorator * len(header))[:len(header)]
    print(header, msg, footer + '\n', sep = '\n')
    return

def dfinfo(df, header = 'info'):
    """Print the `df.info()` report through `pprint`, titled with `header`."""
    sink = io.StringIO()
    try:
        # df.info writes into the in-memory buffer instead of stdout
        df.info(buf = sink)
        report = sink.getvalue()
        pprint(report, msg_title = header)
    finally:
        sink.close()

In [None]:
# start a local dask.distributed cluster: 5 workers with 4 threads each
cluster = LocalCluster(name = 'dask', n_workers = 5, threads_per_worker = 4)
# connect a client to that cluster
client = Client(cluster)
# last expression of the cell: renders the client summary in the notebook
client

In [None]:
# register a global dask progress bar for the computations that follow
# NOTE(review): dask.diagnostics.ProgressBar is documented for the local
# schedulers; with the distributed Client created above it may not report
# anything -- confirm, or switch to distributed.progress
pbar = ProgressBar()
pbar.register()

In [None]:
# datapath = '/home/urlab/sandbox/data/characterization/autoscan/autoscan.h5'
# input location; the same directory is reused for the exported files (savepath)
datapath = '/sandbox/data/autoscan/'
datafile = os.path.join(datapath, 'autoscan.h5')
savepath = datapath

# load the data
# lazy dask frame over the '/data' table, read in chunks of 10000 rows
da = dd.read_hdf(datafile, '/data', chunksize = 10000)
# last 1760 columns -> measurements (presumably all probe columns -- TODO confirm)
dn = da.iloc[:, -1760:].copy()
# first 8 columns -> sample descriptors (presumably the columns listed later
# in col_categorical -- TODO confirm)
ds = da.iloc[:, :8].copy()
# the '/description' table is materialised in memory immediately
desc = dd.read_hdf(datafile, '/description').compute()

In [None]:
def idx_peak_to_lambda(x):
    """Map an FTIR column label to the matching entry of `ftir_lambdas`.

    Parameters
    ----------
    x : str or missing value
        Column label whose second underscore-separated token is a 1-based
        position (e.g. ``'<prefix>_12'``), or a missing value.

    Returns
    -------
    The element of `ftir_lambdas` at that position (converted to 0-based),
    or ``np.nan`` when `x` is missing.
    """
    # pd.isna recognises every missing marker (None, any float nan, pd.NA);
    # an identity test against np.nan only matches that one singleton object
    if pd.isna(x):
        return np.nan
    return ftir_lambdas[int(x.split('_')[1]) - 1]

def ftir_row_stats(df: dd.DataFrame) -> dd.DataFrame:
    """Return `df` with two extra columns holding per-row FTIR statistics.

    `l_mean` and `l_std` are the row-wise mean and standard deviation taken
    over the columns named in `ftir_cols`.
    """
    def _row_mean(frame):
        return frame.loc[:, ftir_cols].mean(axis = 1)

    def _row_std(frame):
        return frame.loc[:, ftir_cols].std(axis = 1)

    # a row-wise median (l_median) was considered but is not computed
    return df.assign(l_mean = _row_mean, l_std = _row_std)

def rock_mechanics(df: dd.DataFrame) -> dd.DataFrame:
    """Placeholder for a rock-mechanics processing step (not written yet).

    The original stub had an invalid return annotation and no body, which
    made the whole cell a syntax error. Until the step is implemented the
    function fails loudly instead of silently passing data through.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('rock_mechanics is not implemented yet')

def clean_dataframe(df: dd.DataFrame) -> dd.DataFrame:
    """Replace every entry that is not >= 0 with NaN and cast to float32."""
    non_negative = df >= 0
    # entries failing the test (negatives, and NaN itself) become NaN
    masked = df.where(non_negative, np.nan)
    return masked.astype(np.float32)

def enforce_limits(df: dd.DataFrame) -> dd.DataFrame:
    """Blank out (NaN) readings outside each probe's configured limits.

    For every probe in `pp.probe_settings`, the probe's columns (its 'col'
    list without the first two entries) keep only values inside the closed
    interval given by the probe's 'limits' pair. The columns are written back
    onto `df`, and that same object is returned.
    """
    for probe in pp.probe_settings.values():
        probe_cols = probe['col'][2:]
        lower, upper = probe['limits']
        readings = df[probe_cols]
        in_range = (readings >= lower) & (readings <= upper)
        df[probe_cols] = readings.where(in_range, np.nan)
    return df

def compute_final_dataframe(df: dd.DataFrame, workers = 20) -> pd.DataFrame:
    """Execute dask task graph and compute final results

    Parameters
    ----------
    df : dd.DataFrame
        Lazy dask frame to evaluate.
    workers : int
        Forwarded to ``df.compute`` as ``num_workers``. It used to be ignored
        in favour of a hard-coded 6; pass ``workers = 6`` for the old setting.

    Returns
    -------
    pd.DataFrame
        The materialised result of ``df.compute``.
    """
    return df.compute(num_workers = workers)

def hip_visualize(df, pcols = None, index = ['family', 'code']):
    """Show selected columns of `df` as a HiPlot experiment and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; its index is reset so index levels become columns.
    pcols : sequence of str, str or None
        Columns to plot. ``None`` (the default) means "no extra columns";
        previously the default put ``None`` in the column selection and failed.
    index : sequence of str or str
        Identifier columns placed before `pcols`. The default list is only
        read, never modified.

    Returns
    -------
    The ``hip.Experiment`` built from the selected columns (after calling
    its ``display()``).
    """
    def _as_list(names):
        # normalise None / a single label / any sequence to a plain list
        if names is None:
            return []
        if isinstance(names, str):
            return [names]
        return list(names)

    columns = _as_list(index) + _as_list(pcols)
    dp = df.reset_index().loc[:, columns]
    s = hip.Experiment.from_dataframe(dp)
    s.colormap = 'interpolateViridis'
    s.display()
    return s

In [None]:
# lazy pipeline: clean -> apply probe limits -> per-row FTIR statistics
dn = (
    dn
    .pipe(clean_dataframe)
    .pipe(enforce_limits)
    .pipe(ftir_row_stats)
)

In [None]:
# run the lazy dask graphs and keep the results in memory
# NOTE: `ds` is rebound from the lazy frame to its computed result, so this
# cell cannot be re-run without re-running the loading cell first
df = dn.compute()
ds = ds.compute()

In [None]:
# label of the FTIR column holding the largest / smallest reading of each row
idx_max_peaks = df[ftir_cols].idxmax(axis = 1)
idx_min_peaks = df[ftir_cols].idxmin(axis = 1)
# translate the labels into the matching ftir_lambdas entries
df.loc[:, 'l_max_peak'] = idx_max_peaks.apply(idx_peak_to_lambda)
df.loc[:, 'l_min_peak'] = idx_min_peaks.apply(idx_peak_to_lambda)

In [None]:
# put the columns kept in `ds` in front of the processed measurements,
# aligned on the index
# NOTE: not idempotent -- a second run would join `ds` onto a frame that
# already contains its columns
df = ds.join(df)

In [None]:
# s = hip_visualize(df.dropna(subset = ['perm', 'vp0', 'vs0', 'e_star', 'l_max_peak']), 
#                   pcols = ['l_max_peak', 'l_min_peak', 'perm', 'vp0', 'vs0', 'e_star'], 
#                   index = ['code'])

# data cleaning with klib
1. pre-clean the dataset
 - remove duplicated rows
 - enforce correct dtypes 
 - reduce memory overhead
 - do not remove missing values

In [None]:
import klib

In [None]:
# keep a record of the frame's structure before any cleaning
dfinfo(df, header = 'raw data')

In [None]:
# pre-clean, do not remove missing values:
# both drop thresholds are raised to 1.0; leaving drop_threshold_cols at its
# default would still drop sparsely populated columns
df = klib.data_cleaning(
    df,
    drop_threshold_cols = 1.0,
    drop_threshold_rows = 1.0,
    clean_col_names = False,
)

In [None]:
col_numerical = ['x', 'y'] + tips_cols + ['l_mean', 'l_std', 'l_max_peak', 'l_min_peak']
col_categorical = ['family', 'code', 'tag', 'subtag', 'instance', 'experiment', 'side', 'm']
# df[cols] = ... replaces the columns, so the float32 dtype is kept;
# df.loc[:, cols] = ... writes into the existing columns and, on recent
# pandas, can leave the previous dtype in place
df[col_numerical] = df[col_numerical].astype(np.float32)
# df[col_categorical] = df[col_categorical].astype('category')

In [None]:
# same report again, now for the pre-cleaned frame
dfinfo(df, header = 'raw data cleaned')

## fix values and correct information
1. set ftir measurements to nan when all values in a row are identical
2. set the correct family and code for eur samples

In [None]:
# number of distinct FTIR values per row; exactly one means a constant spectrum
n_distinct_per_row = df.loc[:, ftir_cols].apply(lambda row: np.unique(row).size, axis = 1)
ix = n_distinct_per_row.eq(1)
# constant spectra are treated as invalid and blanked out
df.loc[ix, ftir_cols] = np.nan

In [None]:
# convert the descriptive columns to plain object dtype FIRST: direct column
# assignment makes the dtype change stick, and object columns accept labels
# that a categorical column might not have among its categories
df[col_categorical] = df[col_categorical].astype('object')
# rows whose tag contains 'eur'; na = False keeps missing tags out of the mask
is_eur = df['tag'].str.contains('eur', na = False)
df.loc[is_eur, 'family'] = 'shale'
df.loc[is_eur, 'code'] = 'sh'

In [None]:
# both tables go into one HDF5 file: mode 'w' recreates it for the numeric
# columns, mode 'a' then appends the descriptive columns under a second key
corrected_file = os.path.join('/sandbox/data/', 'autoscan_corrected.h5')
df.loc[:, col_numerical].to_hdf(corrected_file, key = 'data', format = 'table', mode = 'w')
df.loc[:, col_categorical].to_hdf(corrected_file, key = 'desc', mode = 'a')

In [None]:
# unique descriptor combinations (all of col_categorical but the last entry)
# of the rows flagged in `ix`, written out as a csv list
id_cols = col_categorical[:-1]
repeat_ftir = df.loc[ix, :].set_index(id_cols).index.unique()
repeat_table = pd.DataFrame.from_records(repeat_ftir.to_numpy(), columns = id_cols)
repeat_table.to_csv(os.path.join(datapath, 'ftir_repeat.csv'))

# visualization
1. hip-plot (again) but with corrected data
2. distributions

## hip
### without `e_star`

In [None]:
# 'before' rows that have every plotted quantity except e_star
before_woestar = (
    df
    .query("instance == 'before'")
    .dropna(subset = ['perm', 'vp0', 'vs0', 'l_max_peak'])
)
s = hip_visualize(before_woestar,
                  pcols = ['l_max_peak', 'l_min_peak', 'perm', 'vp0', 'vs0'],
                  index = ['code'])

s.to_html(os.path.join(savepath, 'hip_before_woestar.html'));

In [None]:
# 'after' rows, plotted without e_star
s = hip_visualize(df.query("instance == 'after'").dropna(subset = ['perm', 'vp0', 'vs0', 'l_max_peak']), 
                  pcols = ['l_max_peak', 'l_min_peak', 'perm', 'vp0', 'vs0'], 
                  index = ['code'])

# file name now matches the content (instance 'after', without e_star);
# it used to be saved as 'hip_before_westar.html'
s.to_html(os.path.join(savepath, 'hip_after_woestar.html'));

### with `e_star`
the number of samples with impulse hammer measurements is one quarter of the number in the previous section

In [None]:
# 'before' rows, plotted with e_star
s = hip_visualize(df.query("instance == 'before'").dropna(subset = ['perm', 'vp0', 'vs0', 'e_star', 'l_max_peak']), 
                  pcols = ['l_max_peak', 'l_min_peak', 'perm', 'vp0', 'vs0', 'e_star'], 
                  index = ['code'])

# file name now matches the content (instance 'before', with e_star);
# it used to be saved as 'hip_after_woestar.html'
s.to_html(os.path.join(savepath, 'hip_before_westar.html'));

In [None]:
# 'after' rows that have every plotted quantity, e_star included
after_westar = (
    df
    .query("instance == 'after'")
    .dropna(subset = ['perm', 'vp0', 'vs0', 'e_star', 'l_max_peak'])
)
s = hip_visualize(after_westar,
                  pcols = ['l_max_peak', 'l_min_peak', 'perm', 'vp0', 'vs0', 'e_star'],
                  index = ['code'])

s.to_html(os.path.join(savepath, 'hip_after_westar.html'));

In [None]:
# 'before' rows with permeability and both velocities available
before_permvel = (
    df
    .query("instance == 'before'")
    .dropna(subset = ['perm', 'vp0', 'vs0'])
)
s = hip_visualize(before_permvel,
                  pcols = ['perm', 'vp0', 'vs0'],
                  index = ['code'])

s.to_html(os.path.join(savepath, 'hip_before_permvel.html'));

In [None]:
# 'after' rows with permeability and both velocities available
after_permvel = (
    df
    .query("instance == 'after'")
    .dropna(subset = ['perm', 'vp0', 'vs0'])
)
s = hip_visualize(after_permvel,
                  pcols = ['perm', 'vp0', 'vs0'],
                  index = ['code'])

s.to_html(os.path.join(savepath, 'hip_after_permvel.html'));

In [None]:
# keep only the rows whose `instance` column equals 'before'
df_before = df.query("instance == 'before'")

In [None]:
# rows of the 'before' subset that have a permeability value
ix_perm = df_before['perm'].notna()
df_perm_before = df_before.loc[ix_perm, ['family', 'code', 'perm']]

In [None]:
# set the style BEFORE creating the figure: seaborn styles are applied when
# axes are created, so setting it afterwards does not style this plot
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
sns.stripplot(y = 'perm', x = 'family', hue = 'code', data = df_perm_before, palette = 'viridis', ax = ax)
# the log axis is set on the explicit axes object rather than via pyplot state
ax.set_yscale('log')
ax.set_title('permeability before');

In [None]:
# set the style BEFORE creating the figure: seaborn styles are applied when
# axes are created, so setting it afterwards does not style this plot
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
sns.violinplot(y = 'perm', x = 'code', hue = 'family', data = df_perm_before, palette = 'viridis', ax = ax)
ax.set_title('permeability before');

In [None]:
# copy of the permeability table with `perm` limited to the range [0, 500]
df_perm_before_clipped = df_perm_before.assign(
    perm = df_perm_before['perm'].clip(lower = 0, upper = 500)
)

In [None]:
# set the style BEFORE creating the figure: seaborn styles are applied when
# axes are created, so setting it afterwards does not style this plot
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# same violin plot as before, drawn from the clipped permeability values
sns.violinplot(y = 'perm', x = 'code', hue = 'family', data = df_perm_before_clipped, palette = 'viridis', ax = ax)
ax.set_title('permeability before');

In [None]:
# set the style BEFORE creating the figure: seaborn styles are applied when
# axes are created, so setting it afterwards does not style this plot
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# box plot of the clipped permeability values per code, coloured by family
sns.boxplot(y = 'perm', x = 'code', hue = 'family', data = df_perm_before_clipped, palette = 'viridis', ax = ax)
ax.set_title('permeability before');

In [None]:
# set the style BEFORE creating the figure: seaborn styles are applied when
# axes are created, so setting it afterwards does not style this plot
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize = (12, 12))
# `fill = True` is the supported way to shade the area under the curves;
# the old `shade` argument is deprecated and expected a boolean, not 'fill'
sns.kdeplot(x = 'perm',  hue = 'code', data = df_perm_before_clipped, 
            palette = 'viridis', fill = True, ax = ax)
ax.set_title('permeability before');