In [97]:
import os, sys, re, io
import pandas as pd
import numpy as np
import scipy as sp

buffer = io.StringIO()

def probeix(df, vmin, vmax):
    return np.logical_and((df >= vmin).all(axis = 1), (df <= vmax).all(axis = 1))

def test(x, th = 0.5, vmin = 0, vmax = 1e6):
    s = probeix(x, vmin = vmin, vmax = vmax)
    test = s.sum() / len(s) >= th
#     ntrue  = s.sum()
#     nfalse = (s == False).sum()
    return test

def get_wrong_measurements(df, probe = None, desc = None, th = 0.5, vmin = 0, vmax = 1e6):
#     desc2 = desc.sort_index()
    ix = vels.groupby(level = desc.index.names).apply(test, vmin = vmin, vmax = vmax)
    if not np.logical_or(probe is None, desc is None):
        out = desc.loc[ix == False].query("probe == '%s'" %(probe))
        out = (out, ix)
    else:
        out = ix
    return out

def pprint(msg, msg_title = '', msg_decorator = '#', len_decorator = 40):
    nhead = len_decorator - len(msg_title) - 2
    if nhead <= 0:
        nhead = 1
        nfoot = len(msg_title) + 4
    else:
        nfoot = len_decorator
    
    top_decorator = msg_decorator * (nhead // 2) 
    print(top_decorator + ' ' + msg_title  +  ' ' + top_decorator, 
          msg, nfoot * '#' + '\n',
          sep = '\n')
    return

def dfinfo(df, header = 'info'):
    with io.StringIO() as buffer:
        df.info(buf = buffer)
        pprint(buffer.getvalue(), msg_title = header)

mix = pd.IndexSlice

In [2]:
datapath = '/home/urlab/sandbox/data/characterization/autoscan/autoscan.h5'
savepath = '/home/urlab/Documents/'

# load the data
df = pd.read_hdf(datapath, key = 'data')
desc = pd.read_hdf(datapath, key = 'description')

In [3]:
velcols = df.columns[-4:-2]
permcol = 'perm'
hammcol = 'e_star'

describe_cols = ['mean', 'std', 'min', 'max',]

In [231]:
# set min and max expected for measurement
velmin = 0.5e3
velmax = 8.0e3

# check velocities
vels = df.loc[:, [velcols[1]]]

## identify offending values and those to keep
velidx = probeix(vels, vmin = velmin, vmax = velmax)

dfinfo(vels, 'info of raw')
## firtst check how they are distributed
pprint(vels.describe().apply(np.round, decimals = 2), 'raw data')


## check the data makes sense
vels.loc[velidx == False, :] = np.nan
# dfinfo(vels, 'info of corrected')
pprint(vels.describe().apply(np.round, decimals = 2), 'correct measurements')

# get the labels that have problem
out, ix = get_wrong_measurements(vels, probe = 'vel', desc = desc, vmin = velmin, vmax = velmax)
ixd = vels.loc[ix, :].index
# the ix returned from `get_wrong_measurements` keeps only samples where  more than a threshold percent (`th`) of values are correct.
# samples that don't meet this criteria are lost. This is different than probeix, which only asserts if the values are within a range independently of their sample. 

# ix can be used to do basic data inputation on the sample, for example by filling it with the mean
# vels.loc[ixd] = vels.loc[ixd].groupby(level = desc.index.names).apply(lambda x: x.fillna(x.mean()))
# ixs = vels.dropna().index

# set all the samples that did not meet the criteria to nan
# vels.loc[mix[ix == False, :]] = np.nan

# dfinfo(vels.loc[mix[ix, :]], 'info of mix')
pprint(vels.loc[ixd, :].describe().apply(np.round, decimals = 2), 'correct + data inputation')

############# info of raw #############
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 79774 entries, ('carbonate', 'ah', 'ah_001', 's0', 'after', 4, 0) to ('sandstone', 'sg', 'wsg_006', 'plugs', 'before', 4, 3)
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   vs_0    51666 non-null  float64
dtypes: float64(1)
memory usage: 3.9+ MB

########################################

############### raw data ###############
           vs_0
count  51666.00
mean    2530.42
std     3641.12
min   -89552.24
25%     1886.74
50%     2180.80
75%     2699.68
max     7526.48
########################################

######### correct measurements #########
           vs_0
count  51566.00
mean    2677.26
std     1374.35
min      905.69
25%     1890.67
50%     2181.18
75%     2700.51
max     7526.48
########################################

###### correct + data inputation ######
           vs_0
count  48447.00
mean    2715.13
std     1395.56


In [329]:
# df.loc[ixs, df.columns[:-2]]
c = ['x', 'y'] + [vels.columns.values[0]]
pprint(df.loc[ixd, c[-1]].describe())

# this for loop should be a function to usee apply
for g, d in df.loc[ixd, c].groupby(level = desc.index.names):    
#     s = d.loc[d.loc[:, c[-1]].isna()].index.get_level_values(6)
    m = d.iloc[:, -1].isna().values
    s = d.loc[m].index.get_level_values(6)
    if len(s) > 0:
        x, y, v = d.values.T
        if np.diff(s, n = len(s) - 1) != 0:
            vu = np.repeat(v, 2).reshape((len(v), 2))
            for k, u in enumerate([x, y]):
                vu[m, k] = np.interp(u[m], u[m == False], v[m == False])
            v = np.mean(vu, axis = 1)
        else:
            v[m] = v.mean()
            
        df.loc[g, c[-1]]  = v
pprint(df.loc[ixd, c[-1]].describe())      
# np.interp

###################  ###################
count    48447.000000
mean      2715.128479
std       1395.557082
min        905.689672
25%       1942.147137
50%       2190.729043
75%       2709.686160
max       7526.477071
Name: vs_0, dtype: float64
########################################



  result = self._run_cell(
  coro.send(None)


###################  ###################
count    52218.000000
mean      2692.374593
std       1357.369170
min        905.689672
25%       1931.693923
50%       2231.704026
75%       2704.264529
max       7526.477071
Name: vs_0, dtype: float64
########################################



In [328]:
# np.mean(vu, axis = 1)

# np.interp()
# t = df.loc[d.index, c]

# s = [1, 2, 3, 4, 6, 9, 10, 13, 15, 17, 18, 21, 22, 23, 24]

# np.diff(t.loc[t.iloc[:, -1].isna().any()].index.get_level_values(6), n = 2)
# t = df.loc[d.index, ['x', 'y', 'vp_0', 'vs_0']]
# print(t.mean())
# t.fillna(t.mean())
# s = vels.index.droplevel(6)
# d = [False] * len(s)
# for j in ix[ix].index.values:
#     for k, v in enumerate(s.values):
#         if v == j:
#             d[k] = True

m
0     2770.868931
1     2722.217181
2     2732.435967
3     2848.210713
4     2979.949199
5     2971.705122
6     3077.599472
7     3147.269744
8     3196.760616
9     3247.807730
10    3310.762342
11    3329.026735
12    3347.493763
13            NaN
14            NaN
15            NaN
16            NaN
17            NaN
18            NaN
19            NaN
20            NaN
Name: vs_0, dtype: float64

In [291]:
# (np.diff(t.loc[t.iloc[:, -1].isna().any(axis = 1)].index.get_level_values(6), n = 2) == 0).all()
# vu[:, 0]
# v = velidx.index.droplevel(6).values
# t = ix[ix.index.values
# x = [vels.index[k] for j in range(len(t)) for k in range(len(s.values)) if v[k] == t[j]]

In [94]:
perm = df.loc[:, [permcol]]
pprint(perm.describe())
permidx = probeix(perm, vmin = 0, vmax = np.inf)
## check the data makes sense
pprint(perm.loc[permidx, :].describe())

## print the labels that have problem
out, _  = get_wrong_measurements(perm, 'perm', desc)
out.sample(10)

###################  ###################
               perm
count  4.986200e+04
mean   1.061643e+03
std    4.451099e+04
min    4.698540e-01
25%    1.819275e+00
50%    2.548140e+00
75%    1.141058e+02
max    5.307500e+06
########################################

###################  ###################
               perm
count  4.986200e+04
mean   1.061643e+03
std    4.451099e+04
min    4.698540e-01
25%    1.819275e+00
50%    2.548140e+00
75%    1.141058e+02
max    5.307500e+06
########################################



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,probe,relroot,loaded
family,code,tag,subtag,instance,side,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
shale,sh,sh_008,s0,before,3,perm,sh_008/processed/perm_before_top.csv,True
sandstone,bg,bg_009,s0,before,3,perm,bg_009/subsamples/s0/processed/perm_before_top...,True
carbonate,ls,lssy_011,s3,before,4,perm,lssy_011/subsamples/s3/processed/perm_before_a...,True
sandstone,bg,bg_012,s0,before,7,perm,bg_012/processed/perm_before_d.csv,True
carbonate,ls,lssy_011,s1,before,5,perm,lssy_011/subsamples/s1/processed/perm_before_b...,True
sandstone,bg,bg_004,s2,after,4,perm,bg_004/subsamples/s2/processed/perm_after.csv,True
sandstone,bg,bg_011,s0,before,6,perm,bg_011/processed/perm_before_c.csv,True
shale,sh,sh_006,s0,before,6,perm,sh_006/processed/perm_before_c.csv,True
shale,sh,sh_006,s0,before,5,perm,sh_006/processed/perm_before_b.csv,True
sandstone,bg,bg_004,s1,after,4,perm,bg_004/subsamples/s1/processed/perm_after.csv,True


In [None]:
# remove offending values and keep just good measurements
idx = np.logical_and(velidx.values, permidx.values)
dc = df.loc[idx, :]

In [None]:
# alternative, fill the values with means or nans
vels.loc[velidx == False, :] = np.nan
desc.loc[vels.index[velidx == False].droplevel(6).drop_duplicates()].query("probe == 'vel'")

In [None]:
# x = velidx[velidx == False].index.droplevel(6).drop_duplicates()
# for t in x:
#     vels.loc[mix[[*t], :], :]

In [None]:
# means = vels.groupby(level = velidx.index.names[:-1], sort = False).apply(np.mean)

# for dd in means.index:
#     pass
# dd

In [None]:
# mix = pd.IndexSlice
# mix[[*ix], :]
# vels.loc(axis = 0)[mix[[dd], :], :]

In [None]:
# def myquery(x, vels, velmin = 0.5e3, velmax = v):
#     v = vels.copy()
#     s = v.loc[x, :].shape[0]
#     idx = np.logical_and((v >= velmin).all(axis = 1), (v <= velmax).all(axis = 1))
#     skeep = np.sum(idx)
#     sdrop = np.sum(idx == False)
#     return s, skeep, sdrop

# for a, b in idx.loc[revise_idx, :].groupby(level = idx.index.names[2:-1]):
#     pass#.describe()

# x = revise_idx.droplevel(6).drop_duplicates()
# idx.loc[slice(x, :), :]

# idx.loc[revise_idx, :]
# idx.groupby(level = idx.index.names[2:-1]).describe()
# .loc[:, 'r'] = 'review'

# mix = pd.IndexSlice
# pd.concat((idx.loc[mix[[*t], :]] for t in x)) 

# mix = pd.IndexSlice
# pd.concat((idx.loc[mix[[*t], :]] for t in x)) 