In [1]:
%matplotlib inline
import os, shutil, warnings, string
import pandas as pd
import deepdish as dp
import numpy as np
import re
import fnmatch

# libs for plotting
import bokeh
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.models.tools import HoverTool
from bokeh.models import LinearColorMapper, ColorBar, BasicTicker, Select, PrintfTickFormatter
from bokeh.transform import jitter
from bokeh.sampledata.commits import data
from bokeh.palettes import brewer
import holoviews

# libs for stats
import time
from sklearn.manifold import TSNE

# print in notebok
output_notebook()

In [2]:
import sys
labutilspath = '/sandbox/dev/lab_utils/'
sys.path.append(labutilspath)
from _helpers.basics import basic_info
info = basic_info()

In [3]:
datapath = '/media/sanromd/data/lab/data/characterization/autoscan/'
datapath_processed = 'processed'
datapath_raw       = 'raw'
datapath_generic   = '_generic*'
datapath_fluids    = '_fluids'
datapath_analysis  = '_analysis'
datapath_exclude   = ['_special-studies', 'special_studies', '_special_studies', '_unsorted',
                      datapath_raw,datapath_generic,datapath_analysis,'*layout*']
files_exclude      = ['.*','_*','*.asd','*.tcl']
files_include      = ['*.csv']

# List and correct filenames (enforce patterns)

1. Read the _csv_ files in the `datapath`
1. Find whether each file has the word `before` or `after`
1. If not, then add `before` as default
1. Fix all file names such that they follow `probe-before|after-side.csv` 
1. Create a dataframe with a list of all measurements

## file handling functions

In [298]:
def _rename_file(root,fname_old, fname_new):
    oldname = os.path.join(root,fname_old)
    newname = os.path.join(root,fname_new)
    shutil.move(oldname,newname)
    return

def _get_sides(x):
    s = re.findall(r'[a-z]+[.]',x)
    side = None
    if len(s)>0:
        if not re.findall('before|after|map',s[-1]):
            side = s[-1].split('.')[0]
    return side

def add_before_fname(fname,root,dryrun=False, debug=False):
    name2 = re.sub(r'(perm|vel|impulse|ftir).*([a-z]+)[.]([a-z]+)',r'\1_before_\2.\3',fname)
    if debug: print(root.split('/')[len(datapath.split('/'))-1], fname, name2, sep='\t')
    if not dryrun:
        _rename_file(root, fname, name2)
    return name2

def swap_instance_fname(fname,root,dryrun=False, debug=False):
    name2 = re.sub(r'(perm|vel|impulse|ftir)(_|-)([a-z]+|[0-9]+)(_|-).*(before|after).*[.]([a-z]+)',
                   r'\1_\5_\3.\6',
                   fname)
    if debug: print('swaping', root.split('/')[len(datapath.split('/'))-1], fname,name2,sep='\t')
    if not dryrun:
        _rename_file(root, fname, name2)
    return name2

def check_autoscan_fname(fname, root, dryrun=False, debug=False):
    instance = None
    if (not 'before' in fname) and (not 'after' in fname):
        warnings.warn('before or after not found in ' + 
                      root.split('/')[len(datapath.split('/'))-1] + fname)
        fname = add_before_fname(fname, root, dryrun=dryrun, debug=debug)
    instance = re.findall('before|after',fname)
    if len(instance)==1:
        instance = instance[0]
        if debug: print(root.split('/')[len(datapath.split('/'))-1], instance, fname, sep='\t')
        if len(fname.split('_'))>2:
            tst = re.match(r'(perm|vel|impulse|ftir)(-|_)(before|after).*([a-z]+)[.]([a-z]+)', fname)
            if tst is None:
                fname = swap_instance_fname(fname, root, dryrun=dryrun, debug=debug)
    else:
        if len(instance)>1:
            warning = fname + ' has more than one instance: ' + ', '.join(instance)
            warning = warning + ' and cannot choose! \n check ' + os.path.join(root,fname) 
        if len(instance)==0:
            warning = fname + ' not in either category (!) \n please review!'
        warnings.warn(warning)
    return instance, fname

## file wrangling & save

In [5]:
dryrun = True
debug  = False

excludes = r'|'.join([fnmatch.translate(x) for x in files_exclude]) or r'$.'
includes = r'|'.join([fnmatch.translate(x) for x in files_include]) or r'$.'

df = pd.DataFrame(columns=['probe','sample_tag','subsample_tag','side','instance','fname','relroot'])

for root, dirs, files in os.walk(datapath):
    [dirs.remove(d) for d in list(dirs) if d in datapath_exclude]
    files = [f for f in files if not re.match(excludes, f)]
    files = [f for f in files if re.match(includes, f)]
    for fname in files:
        #if debug: print(root.split('/')[len(datapath.split('/'))-1], fname)
        instance,fname = check_autoscan_fname(fname, root, dryrun=dryrun, debug=debug)
        # determine sample name
        sample_tag = root.split('/')[len(datapath.split('/'))-1]
        # determine if the file is subsample or sample
        subornot = re.findall('subsample',root)
        if len(subornot)==1:
            subornot = subornot[0]
            subsample_tag = root.split('/')[-2]
        else:
            subsample_tag = ''
        side = ''
        if len(fname.split('-'))>2:
            side = fname.split('-')[-1].lower().replace('.csv','')

        df = df.append({
            'probe':re.findall(r'perm|vel|impulse|ftir',fname)[0],
            'sample_tag':sample_tag,
            'subsample_tag':subsample_tag,
            'side':side,
            'instance':instance,
            'fname':fname,
            'relroot':os.path.relpath(os.path.join(root,fname),start=datapath)
        }, ignore_index=True)
df.sort_values(by=['sample_tag','subsample_tag','probe','side'], inplace=True)
df.reset_index(inplace=True, drop=True)

### assign information
get the information from basic info to know the `sample code` and `family`. This information will be used at a later stage for analysis (classification).

In [16]:
df['sample_code'] = df['sample_tag'].apply(lambda x: info.rock_dict[x.split('_')[0]]['code'])
df['sample_family'] = df['sample_tag'].apply(lambda x: info.rock_dict[x.split('_')[0]]['family'])

### save
1. add a `link` column to enable direct access
1. save the csv file without the `link` column

In [21]:
# create link column and save
df['link'] = df['relroot'].apply(lambda x: '<a href="./autoscan/{0}">link</a>'.format(x))
df.to_html(os.path.join('/media/sanromd/data/lab/data/characterization','autoscan.html'), 
           na_rep='-', escape=False)

# save all columns except link
df.loc[:, ['sample_tag', 'subsample_tag','sample_code', 'sample_family',
                'probe', 'side', 'instance', 'fname', 'relroot']].to_csv(os.path.join(datapath,'summary.csv'),
                                                                         index = False)

# Combine & recover information per probe
1. Create 

In [361]:
# define probe of interest
probe = 'perm'
instance = 'before'

In [362]:
# load the summary dataframe with all measurements
df_summary = pd.read_csv(os.path.join(datapath,'summary.csv'))
df_summary['side'] = df_summary['fname'].apply(lambda x: _get_sides(x))
df_probe = df_summary.loc[(df_summary['probe']==probe) & (df_summary['instance']==instance),:]
df_probe = df_probe.drop(columns=['probe','instance']).copy()
df_probe.reset_index(inplace=True, drop=True)

In [540]:
probe_settings = {
    'perm':{
        'usecols':[0,1,2,6,12],
        'skiprows':7,
        'names':['x','y','perm','meas_code','tile'],
        'tip':['perm'],
        'h':3
    },
    'impulse':{
        'usecols':[0,1,2,3],
        'skiprows':7,
        'names':['x','y','e_star','tile'],
        'tip':['e_start'],
        'h':3
    },
    'vel':{
        'usecols':[0,1,3,6,9,10],
        'skiprows':7,
        'names':['x','y','vp','vs','tile','direction'],
        'tip':['vp','vs'],
        'h':4
    },
    'fitr':{
        'usecols':None,
        'skiprows':2,
        'names':None,
        'tip':['ftir'],
        'h':0
    }
    
}
def read_data(probe, fpath):
    df = pd.read_csv(fpath, 
               usecols=probe_settings[probe]['usecols'],
               skiprows=probe_settings[probe]['skiprows'],
               names=probe_settings[probe]['names'])
    return df

In [557]:
df_temp_lines

Unnamed: 0,v,perm_c,meas_code,tile,side,code,family,tag,sub_tag
0,0,1.02348,6,1,,sg,sandstone,wsg_004,plugs
1,50,1.8053,6,2,,sg,sandstone,wsg_004,plugs
2,100,1.80362,6,3,,sg,sandstone,wsg_004,plugs
3,150,1.0779,6,4,,sg,sandstone,wsg_004,plugs


In [554]:
col_name_ordered + probe_settings[probe]['names'][h:]

['v', 'perm_c', 'meas_code', 'tile']

In [549]:
set(col_names).difference(probe_settings[probe]['names'][h:])

{'perm_c', 'v'}

In [558]:
pool  = []
lines = []

for s in df_probe.iterrows():
    root = s[-1]['relroot']
    if not re.findall('analysis|map|heat|postprocess',root): 
        print(root)
        # make the paths necessary to save the data
        samplepath = os.path.sep.join(root.split('/')[:-2])
        # create a _postprocessed directory in each sample to store the corresponding data
        postprocesspath = os.path.join(datapath, s[-1]['sample_tag'],'_postprocessed')
        if not os.path.exists(postprocesspath):
            os.mkdir(postprocesspath)

        # generate fname for files per sample
        fname = '_'.join([probe, instance, str(s[-1]['subsample_tag']), str(s[-1]['side'])])
        fname = fname.replace('_nan','')
        fname = fname.replace('_None','')

        # load the probe data
        fpath = os.path.join(datapath,root)
        dperm = read_data(probe, fpath)
        # get rid of bad measurements (infs, nans, and text)
        dperm = dperm.apply(pd.to_numeric,errors='coerce').dropna()
        dperm.replace(np.inf, np.nan, inplace = True)
        dperm.dropna(inplace = True)
        # reset index 
        dperm.reset_index(inplace = True, drop = True)

        # reset x,y offset to zero
        dperm.iloc[:,:2] = dperm.iloc[:,:2] - dperm.iloc[:,:2].min() 

        # save probe data to postprocess
        dperm.to_csv(os.path.join(postprocesspath,fname + '.csv'), index = False)

        # pool data (we will assume that every point is idependent)
        temp_dict_pool = {
             'side':s[-1]['side'],
             'code':s[-1]['sample_code'],
             'family':s[-1]['sample_family'],
             'tag':s[-1]['sample_tag'], 
             'sub_tag':s[-1]['subsample_tag']
        }

        for col in probe_settings[probe]['names'][2:]:
            temp_dict_pool[col] = dperm[col].values
        
        df_temp_pool = pd.DataFrame(temp_dict_pool)
        df_temp_pool = df_temp_pool.loc[:, probe_settings[probe]['names'][2:] + 
                                        ['side', 'code', 'family', 'tag', 'subtag']]
        
        pool.append(df_temp_pool)
        
        if not re.findall('ztop|zbottom',root):
            h = probe_settings[probe]['h']
            tip = probe_settings[probe]['tip']
            
            # get the slices and check which is the middle one
            if (len(dperm.loc[:,'y'].unique())>1 and len(dperm.loc[:,'x'].unique())>1):
                slice_along = np.int(dperm['x'].max()/dperm['y'].max() >= 1)
                u = ['x','y'][slice_along]
                v = ['x','y'][np.int(not np.bool(slice_along))]
                slices_ini  = dperm.loc[:, u].unique()
                slices_ini.sort()
                if slices_ini.size > 1:
                    median = slices_ini[slices_ini >= slices_ini.max()/2][0]
                else:
                    median = slices_ini[0]
            else:
                if len(dperm.loc[:,'y'].unique())==1: 
                    u = 'y'
                    v = 'x'
                else:
                    u = 'x'
                    v = 'y'
                slices_ini = dperm.loc[:,u].unique()
                median = dperm.loc[:,u].unique()[0]

            # get center data
            temp_list = []
            col_names = []
            
            tip_names = [s+'_c' for s in tip]
            col_names = ['v'] + tip_names + probe_settings[probe]['names'][h:]
            col_name_ordered = ['v'] + tip_names
            
            for col in [v] + probe_settings[probe]['names'][2:]:
                temp_list.append(dperm.loc[dperm.loc[:,u] == median, col].reset_index(drop = True))

            if slices_ini.size>=3:
                for p in tip:
                    for k in slices_ini[[0,-1]]:    
                        temp_list.append(dperm.loc[dperm.loc[:,u] == k,p].reset_index(drop = True))
                col_names = col_names + [p + '_l', p + '_r']
                col_name_ordered = col_name_ordered + [p + '_l', p + '_r']
            
            df_temp_lines = pd.concat(temp_list, axis = 1, ignore_index=True)
            
            df_temp_lines.columns =  col_names

            # re order slices
            col_name_ordered = col_name_ordered + probe_settings[probe]['names'][h:]
            df_temp_lines = df_temp_lines.loc[:, col_name_ordered]

            # add extra information (in case needed for statistical analysis)
            df_temp_lines['side'] = s[-1]['side']
            df_temp_lines['code'] = s[-1]['sample_code']
            df_temp_lines['family'] = s[-1]['sample_family']
            df_temp_lines['tag'] = s[-1]['sample_tag']
            df_temp_lines['sub_tag'] = s[-1]['subsample_tag']

            df_temp_lines.to_csv(os.path.join(postprocesspath, fname + '_lines.csv'), index = False)
            lines.append(df_temp_lines)

# concat and save
df_pool = pd.concat(pool, ignore_index=True)
df_lines = pd.concat(lines, ignore_index=True, sort=False)

df_pool.to_csv(os.path.join(datapath, '_postprocessed', probe + '_pool.csv'), index = False)
df_lines.to_csv(os.path.join(datapath, '_postprocessed', probe + '_lines.csv'), index = False)

ah_001/subsamples/s1/processed/perm_before.csv
bg_003/processed/perm_before_a.csv
bg_003/processed/perm_before_b.csv
bg_003/processed/perm_before_c.csv
bg_003/processed/perm_before_d.csv
bg_003/subsamples/s1/processed/perm_before.csv
bg_004/subsamples/line_01/processed/perm_before.csv
bg_004/subsamples/s1/processed/perm_before.csv
bg_004/subsamples/s2/processed/perm_before.csv
bg_006/processed/perm_before_a.csv
bg_006/processed/perm_before_b.csv


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


bg_006/processed/perm_before_c.csv
bg_006/processed/perm_before_d.csv
bg_007/processed/perm_before_a.csv
bg_007/processed/perm_before_b.csv
bg_007/processed/perm_before_c.csv
bg_007/processed/perm_before_d.csv
bg_008/processed/perm_before_a.csv
bg_008/processed/perm_before_b.csv
bg_008/processed/perm_before_c.csv
bg_008/processed/perm_before_d.csv
bg_009/processed/perm_before_a.csv
bg_009/processed/perm_before_b.csv
bg_009/processed/perm_before_c.csv
bg_009/processed/perm_before_d.csv
bg_009/processed/perm_before_zbottom.csv
bg_009/processed/perm_before_ztop.csv
bg_011/processed/perm_before_a.csv
bg_011/processed/perm_before_b.csv
bg_011/processed/perm_before_c.csv
bg_011/processed/perm_before_d.csv
bg_012/processed/perm_before_a.csv
bg_012/processed/perm_before_b.csv
bg_012/processed/perm_before_c.csv
bg_012/processed/perm_before_d.csv
bg_019/subsamples/s10/processed/perm_before.csv
bg_019/subsamples/s9/processed/perm_before.csv
lssw_005/processed/perm_before_a.csv
lssw_005/processed/

In [486]:
# pool  = []
# lines = []

# for s in df_probe.iterrows():
#     root = s[-1]['relroot']
#     if not re.findall('analysis|map|heat|postprocess',root): 
#         print(root)
#         # make the paths necessary to save the data
#         samplepath = os.path.sep.join(root.split('/')[:-2])
#         # create a _postprocessed directory in each sample to store the corresponding data
#         postprocesspath = os.path.join(datapath, s[-1]['sample_tag'],'_postprocessed')
#         if not os.path.exists(postprocesspath):
#             os.mkdir(postprocesspath)

#         # generate fname for files per sample
#         fname = '_'.join([probe, instance, str(s[-1]['subsample_tag']), str(s[-1]['side'])])
#         fname = fname.replace('_nan','')
#         fname = fname.replace('_None','')

#         # load the probe data
#         fpath = os.path.join(datapath,root)
#         dperm = pd.read_csv(fpath, skiprows=7, usecols=[0,1,2,12], names = ['x','y',probe,'tile'])
#         # get rid of bad measurements (infs, nans, and text)
#         dperm = dperm.apply(pd.to_numeric,errors='coerce').dropna()
#         dperm.replace(np.inf, np.nan, inplace = True)
#         dperm.dropna(inplace = True)
#         # reset index 
#         dperm.reset_index(inplace = True, drop = True)

#         # reset x,y offset to zero
#         dperm.iloc[:,:2] = dperm.iloc[:,:2] - dperm.iloc[:,:2].min() 

#         # save probe data to postprocess
#         dperm.to_csv(os.path.join(postprocesspath,fname + '.csv'), index = False)

#         # pool data (we will assume that every point is idependent)
#         df_temp_pool = pd.DataFrame({probe:dperm[probe].values,
#                                      'meas_code':dperm['meas_code']
#                                      'tile':dperm['tile'].values,
#                                      'side':s[-1]['side'],
#                                      'code':s[-1]['sample_code'],
#                                      'family':s[-1]['sample_family'],
#                                      'tag':s[-1]['sample_tag'], 
#                                      'sub_tag':s[-1]['subsample_tag']})
#         if probe == 'perm':
        
#         temp_dict_pool = {}
#         for col in probe_settings[probe]['names'][2:]:
#             temp_dict_pool[col] = dperm[col].values
#         pool.append(df_temp_pool)
        
#         if not re.findall('ztop|zbottom',root):
#             # get the slices and check which is the middle one
#             if (len(dperm.loc[:,'y'].unique())>1 and len(dperm.loc[:,'x'].unique())>1):
#                 slice_along = np.int(dperm['x'].max()/dperm['y'].max() >= 1)
#                 u = ['x','y'][slice_along]
#                 v = ['x','y'][np.int(not np.bool(slice_along))]
#                 slices_ini  = dperm.loc[:, u].unique()
#                 slices_ini.sort()
#                 if slices_ini.size > 1:
#                     median = slices_ini[slices_ini >= slices_ini.max()/2][0]
#                 else:
#                     median = slices_ini[0]
#             else:
#                 if len(dperm.loc[:,'y'].unique())==1: 
#                     u = 'y'
#                     v = 'x'
#                 else:
#                     u = 'x'
#                     v = 'y'
#                 slices_ini = dperm.loc[:,u].unique()
#                 median = dperm.loc[:,u].unique()[0]

#             # get center data
#             df_temp_lines = pd.concat([
#                 dperm.loc[dperm.loc[:,u] == median, v].reset_index(drop = True),
#                 dperm.loc[dperm.loc[:,u] == median,probe].reset_index(drop = True),
#                 dperm.loc[dperm.loc[:,u] == median,'tile'].reset_index(drop = True)],
#                 axis = 1, ignore_index=True)
#             df_temp_lines.columns = ['v',probe + '_c', 'tile']
            
#             # add edges
#             if slices_ini.size>=3:
#                 df_temp_lines = pd.concat([df_temp_lines,
#                         dperm.loc[dperm.loc[:,u] == slices_ini[0],probe].reset_index(drop = True),
#                         dperm.loc[dperm.loc[:,u] == slices_ini[-1],probe].reset_index(drop = True)],
#                         axis = 1, ignore_index = True)

#                 df_temp_lines.columns = ['v',probe + '_c', 'tile', probe+ '_l', probe + '_r']

#             # re order slices
#             df_temp_lines = df_temp_lines.loc[:,['v', probe+'_c', probe+'_l',probe+'_r','tile']]

#             # add extra information (in case needed for statistical analysis)
#             df_temp_lines['side'] = s[-1]['side']
#             df_temp_lines['code'] = s[-1]['sample_code']
#             df_temp_lines['family'] = s[-1]['sample_family']
#             df_temp_lines['tag'] = s[-1]['sample_tag']
#             df_temp_lines['sub_tag'] = s[-1]['subsample_tag']

#             df_temp_lines.to_csv(os.path.join(postprocesspath, fname + '_lines.csv'), index = False)
#             lines.append(df_temp_lines)

# # concat and save
# df_pool = pd.concat(pool, ignore_index=True)
# df_lines = pd.concat(lines, ignore_index=True)

# df_pool.to_csv(os.path.join(datapath, '_postprocessed', probe + '_pool.csv'), index = False)
# df_lines.to_csv(os.path.join(datapath, '_postprocessed', probe + '_lines.csv'), index = False)

ah_001/subsamples/s1/processed/perm_before.csv
bg_003/processed/perm_before_a.csv
bg_003/processed/perm_before_b.csv
bg_003/processed/perm_before_c.csv
bg_003/processed/perm_before_d.csv
bg_003/subsamples/s1/processed/perm_before.csv
bg_004/subsamples/line_01/processed/perm_before.csv
bg_004/subsamples/s1/processed/perm_before.csv
bg_004/subsamples/s2/processed/perm_before.csv
bg_006/processed/perm_before_a.csv
bg_006/processed/perm_before_b.csv
bg_006/processed/perm_before_c.csv
bg_006/processed/perm_before_d.csv
bg_007/processed/perm_before_a.csv
bg_007/processed/perm_before_b.csv
bg_007/processed/perm_before_c.csv
bg_007/processed/perm_before_d.csv
bg_008/processed/perm_before_a.csv
bg_008/processed/perm_before_b.csv
bg_008/processed/perm_before_c.csv
bg_008/processed/perm_before_d.csv
bg_009/processed/perm_before_a.csv
bg_009/processed/perm_before_b.csv
bg_009/processed/perm_before_c.csv
bg_009/processed/perm_before_d.csv
bg_009/processed/perm_before_zbottom.csv
bg_009/processed/pe

In [482]:
df_probe.loc[df_probe['sample_tag']=='bg_003',:]

Unnamed: 0,sample_tag,subsample_tag,sample_code,sample_family,side,fname,relroot
1,bg_003,,bg,sandstone,a,perm_before_a.csv,bg_003/processed/perm_before_a.csv
2,bg_003,,bg,sandstone,b,perm_before_b.csv,bg_003/processed/perm_before_b.csv
3,bg_003,,bg,sandstone,c,perm_before_c.csv,bg_003/processed/perm_before_c.csv
4,bg_003,,bg,sandstone,d,perm_before_d.csv,bg_003/processed/perm_before_d.csv
5,bg_003,s1,bg,sandstone,,perm_before.csv,bg_003/subsamples/s1/processed/perm_before.csv


In [487]:
pd.read_csv?

In [311]:
# df_temp_lines = dperm.loc[
#     dperm.iloc[:,slice_along] == median,:].iloc[:,[np.int(not np.bool(slice_along)),2,3]].copy()
# df_temp_lines.reset_index(inplace = True, drop=True)
# # rename slice column to avoid conflicts in the future
# df_temp_lines.columns = ['u', probe, 'tile']

In [374]:
dperm['tile'].unique(

SyntaxError: unexpected EOF while parsing (<ipython-input-374-ab35ff04ca3f>, line 1)

In [312]:
# check edges
vals_index = dperm.iloc[:,slice_along] == median
df_temp_lines = pd.DataFrame({
    probe+'_center':dperm.loc[vals_index,probe].values,
    'u':dperm.loc[vals_index,dperm.columns[np.int(not np.bool(slice_along))]].values,
    'tile':dperm.loc[vals_index,'tile'].values})

# add edges
if slices_ini.size>=3:
    df_temp_lines.loc[:,probe+'_left'] = dperm.loc[dperm.iloc[:,slice_along] == slices_ini[0],probe].values
    df_temp_lines.loc[:,probe+'_right'] = dperm.loc[dperm.iloc[:,slice_along] == slices_ini[-1],probe].values

# re order slices
df_temp_lines = df_temp_lines.loc[:,['u', probe+'_center', probe+'_left',probe+'_right','tile']]