In [None]:
%matplotlib inline
import os, shutil, warnings, string
import pandas as pd
import numpy as np
import re
import fnmatch

In [None]:
import sys

# Location of the in-house `lab_utils` package; it is not installed, so it is
# made importable by extending the module search path.
labutilspath = '/sandbox/dev/lab_utils/'
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries on sys.path.
if labutilspath not in sys.path:
    sys.path.append(labutilspath)
from _helpers.basics import basic_info

# `info.rock_dict` is used further down to look up sample metadata.
info = basic_info()

In [None]:
# Root directory that holds every autoscan measurement.
datapath = '/media/sanromd/data/lab/data/characterization/autoscan/'

# Names / patterns of the special sub-directories found below `datapath`.
datapath_processed = 'processed'
datapath_raw       = 'raw'
datapath_generic   = '_generic*'
datapath_fluids    = '_fluids'
datapath_analysis  = '_analysis'

# Directories that are skipped while walking `datapath`.
datapath_exclude = [
    '_special-studies',
    'special_studies',
    '_special_studies',
    '_unsorted',
    datapath_raw,
    datapath_generic,
    datapath_analysis,
    '*layout*',
    '_postprocessed',
]

# fnmatch-style patterns deciding which files are listed.
files_exclude = ['.*', '_*', '*.asd', '*.tcl', 'summary*', '*map*']
files_include = ['*.csv']

# List and correct filenames (enforce patterns)

1. Read the _csv_ files in the `datapath`
1. Check whether each file name contains the word `before` or `after`
1. If it contains neither, insert `before` as the default
1. Rename files so that every name follows the pattern `probe_(before|after)_side.csv`
1. Create a dataframe with a list of all measurements

## file handling functions

In [None]:
# Generated column labels l_1 ... l_1752 used for the 'ftir' probe.
_ftir_channels = ['l_' + str(i) for i in range(1, 1753)]

# Per-probe reader configuration:
#   usecols / skiprows / names : forwarded to pandas.read_csv
#   tip                        : columns holding the probe reading(s)
#   h                          : index into `names` where the extra
#                                (non-reading) columns start
probe_settings = dict(
    perm=dict(
        usecols=[0, 1, 2, 6, 12],
        skiprows=7,
        names=['x', 'y', 'perm', 'meas_code', 'tile'],
        tip=['perm'],
        h=3,
    ),
    impulse=dict(
        usecols=[0, 1, 2, 3],
        skiprows=7,
        names=['x', 'y', 'e_star', 'tile'],
        tip=['e_star'],
        h=3,
    ),
    vel=dict(
        usecols=[0, 1, 3, 6, 9, 10],
        skiprows=7,
        names=['x', 'y', 'vp', 'vs', 'tile', 'direction'],
        tip=['vp', 'vs'],
        h=4,
    ),
    ftir=dict(
        usecols=None,
        skiprows=0,
        names=['x', 'y'] + _ftir_channels,
        tip=list(_ftir_channels),
        h=0,
    ),
)

def _rename_file(root,fname_old, fname_new):
    oldname = os.path.join(root,fname_old)
    newname = os.path.join(root,fname_new)
    shutil.move(oldname,newname)
    return

def _get_refindall(pattern, string):
    v = None
    s = re.findall(pattern, string)
    if len(s)>0:
        v = s[0]
    return v

def _get_sides(x):
    """Return the side token that follows `before_`/`after_` in `x`, or None.

    The side is a run of lowercase letters or a run of digits that sits
    directly before a dot.
    """
    hit = _get_refindall(r'(before|after)_([a-z]+|[0-9]+)[.]', x)
    # the hit is an (instance, side) tuple; only the side is wanted
    return None if hit is None else hit[-1]

def _get_subsample(x):
    """Return the sub-sample folder name found in path `x`, or None.

    The folder is the letters-then-digits path component that directly
    follows a `sub<letters>/` component.
    """
    return _get_refindall(r'.*sub[a-z]+[/]([a-z]+[0-9]+)[/]',x)

def _get_probename(x):
    """Return the first probe keyword (perm, vel, impulse, ftir) in `x`, or None."""
    return _get_refindall(r'(perm|vel|impulse|ftir)',x)

def _get_instance(x):
    """Return the first occurrence of `before` or `after` in `x`, or None."""
    return _get_refindall(r'(before|after)', x)

def _get_rockinfo(x, key=None):
    """Look up entry `key` of `info.rock_dict` for the sample tag `x`.

    Only the part of `x` before the first underscore is used as the
    dictionary key.
    """
    rock_id = x.partition('_')[0]
    return info.rock_dict[rock_id][key]

def add_before_fname(fname,root,dryrun=False, debug=False):
    """Insert the default `before` instance into a file name that lacks one.

    A name such as `perm_top.csv` becomes `perm_before_top.csv`: the probe
    keyword is kept, everything between the probe and the last lowercase word
    in front of the extension is dropped, and `_before_` is placed in between.
    Names that do not match the pattern are returned unchanged.

    Parameters
    ----------
    fname : str
        File name (no directory part).
    root : str
        Directory containing the file.
    dryrun : bool
        When True the new name is only computed; nothing is renamed on disk.
    debug : bool
        When True print the folder, the old name and the new name.

    Returns
    -------
    str
        The (possibly unchanged) file name.
    """
    # The filler between the probe and the side must be matched lazily: with a
    # greedy `.*` the side group is left with only its last letter
    # ('perm_top.csv' -> 'perm_before_p.csv').
    name2 = re.sub(r'(perm|vel|impulse|ftir).*?([a-z]+)[.]([a-z]+)',r'\1_before_\2.\3',fname)
    if debug: print(root.split('/')[len(datapath.split('/'))-1], fname, name2, sep='\t')
    # nothing to do on disk when the pattern did not match
    if not dryrun and name2 != fname:
        _rename_file(root, fname, name2)
    return name2

def swap_instance_fname(fname,root,dryrun=False, debug=False):
    """Reorder `probe_side_..._instance.ext` into `probe_instance_side.ext`.

    Names that do not match the pattern come back unchanged. Unless `dryrun`
    is set, the file inside `root` is renamed to the new name.
    """
    misplaced = re.compile(
        r'(perm|vel|impulse|ftir)(_|-)([a-z]+|[0-9]+)(_|-).*(before|after).*[.]([a-z]+)')
    name2 = misplaced.sub(r'\1_\5_\3.\6', fname)
    if debug:
        folder = root.split('/')[len(datapath.split('/'))-1]
        print('swaping', folder, fname, name2, sep='\t')
    if dryrun:
        return name2
    _rename_file(root, fname, name2)
    return name2

def check_autoscan_fname(fname, root, dryrun=False, debug=False):
    """Make `fname` follow the `probe_(before|after)_side.ext` convention.

    Steps:
      * when neither `before` nor `after` occurs in the name, warn and let
        `add_before_fname` insert the default `before`;
      * when exactly one of them occurs and the name has more than two
        underscore-separated parts but the instance does not directly follow
        the probe keyword, let `swap_instance_fname` reorder it;
      * otherwise (none or several occurrences) only issue a warning.

    `dryrun` and `debug` are forwarded to the renaming helpers.
    Returns the resulting file name.
    """
    def folder():
        # path component of `root` right below `datapath`
        return root.split('/')[len(datapath.split('/'))-1]

    if 'before' not in fname and 'after' not in fname:
        warnings.warn('before or after not found in ' + 
                      folder() + fname)
        fname = add_before_fname(fname, root, dryrun=dryrun, debug=debug)

    found = re.findall('before|after',fname)

    if len(found) != 1:
        # ambiguous or still missing: leave the name alone and tell the user
        if len(found) > 1:
            message = fname + ' has more than one instance: ' + ', '.join(found)
            message = message + ' and cannot choose! \n check ' + os.path.join(root,fname) 
        else:
            message = fname + ' not in either category (!) \n please review!'
        warnings.warn(message)
        return fname

    if debug: print(folder(), found[0], fname, sep='\t')

    well_ordered = r'(perm|vel|impulse|ftir)(-|_)(before|after).*([a-z]+)[.]([a-z]+)'
    if fname.count('_') > 1 and re.match(well_ordered, fname) is None:
        fname = swap_instance_fname(fname, root, dryrun=dryrun, debug=debug)
    return fname

def _vel_direction(x):
    d = -1
    if x == 'velax':
        d = 1
    return d

def read_data(probe, fpath, reset_offset = True, save = False, savepath = './', savename = None):
    """Read one probe file and return only its clean, numeric rows.

    Parameters
    ----------
    probe : str
        Key of `probe_settings`; selects the columns, the number of header
        rows to skip and the column names.
    fpath : str
        Path of the csv file to read.
    reset_offset : bool
        When True, shift the first two columns so that each starts at zero.
    save : bool
        When True, write the cleaned table to `savepath`/`savename`.
    savepath : str
        Directory used when `save` is True.
    savename : str or None
        File name used when `save` is True; defaults to `<probe>.csv`.

    Returns
    -------
    pandas.DataFrame
        Rows in which every column is a finite number, with a fresh 0..n-1 index.
    """
    settings = probe_settings[probe]
    df = pd.read_csv(fpath,
                     usecols=settings['usecols'],
                     skiprows=settings['skiprows'],
                     names=settings['names'])

    if probe == 'vel':
        # encode the direction label numerically before the numeric coercion,
        # otherwise the text label would turn the whole row into NaN
        df['direction'] = df['direction'].apply(_vel_direction)

    # get rid of bad measurements: text becomes NaN, and BOTH +inf and -inf
    # are treated as missing (replacing only np.inf lets -inf rows through)
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

    # reset x,y offset to zero
    if reset_offset:
        df.iloc[:,:2] = df.iloc[:,:2] - df.iloc[:,:2].min()

    # save data if needed
    if save:
        if savename is None: savename = probe + '.csv'
        df.to_csv(os.path.join(savepath,savename), index = False)

    return df

## file wrangling & save

In [None]:
# dryrun: compute the corrected file names without renaming anything on disk
# debug : print every rename decision
dryrun = True
debug  = False
# NOTE(review): with dryrun=True the listing below stores the *corrected*
# names although the files keep their old names on disk, so `relroot` can
# point to a file that does not exist yet -- confirm this is intended.

# One regex per pattern list; r'$.' can never match and stands in for an
# empty list.
excludes = r'|'.join([fnmatch.translate(x) for x in files_exclude]) or r'$.'
includes = r'|'.join([fnmatch.translate(x) for x in files_include]) or r'$.'

fs = []
rs = []
for root, dirs, files in os.walk(datapath):
    # Prune in place so os.walk does not descend into excluded folders.
    # `datapath_exclude` holds glob patterns ('_generic*', '*layout*'), so the
    # names must be glob-matched; a plain `in` test never matches those.
    dirs[:] = [d for d in dirs
               if not any(fnmatch.fnmatchcase(d, pattern) for pattern in datapath_exclude)]
    files = [f for f in files if re.match(includes, f) and not re.match(excludes, f)]
    for fname in files:
        fname = check_autoscan_fname(fname, root, dryrun=dryrun, debug=debug)
        fs.append(fname)
        rs.append(os.path.relpath(os.path.join(root,fname),start=datapath))

# one row per measurement file
df = pd.DataFrame({'fname':fs, 'relroot':rs})
df['sample_tag']    = df['relroot'].apply(lambda x: x.split('/')[0])
df['subsample_tag'] = df['relroot'].apply(_get_subsample)

# information encoded in the file name (assigned column by column so that an
# empty listing still yields all the expected columns)
df['probe']    = df['fname'].apply(_get_probename)
df['side']     = df['fname'].apply(_get_sides)
df['instance'] = df['fname'].apply(_get_instance)

# sample metadata looked up from the sample tag
for s in ['code', 'family']:
    df['sample_' + s] = df['sample_tag'].apply(_get_rockinfo, key=s)

df = df.loc[:, ['sample_tag', 'subsample_tag', 'side', 'sample_code', 'sample_family',
                'probe', 'instance', 'fname', 'relroot']]

### save
1. Add a `link` column holding an HTML anchor to each file and export the table as `autoscan.html`
1. Save the table as `summary.csv`, leaving out the `link` column

In [None]:
# build one HTML anchor per measurement file and export the table as HTML
link_template = '<a href="./autoscan/{0}">link</a>'
df['link'] = df['relroot'].map(lambda relpath: link_template.format(relpath))

html_path = os.path.join('/media/sanromd/data/lab/data/characterization','autoscan.html')
# escape=False keeps the anchors clickable; missing values are shown as '-'
df.to_html(html_path, na_rep='-', escape=False)

# the csv summary holds every column except `link`
summary_columns = ['sample_tag', 'subsample_tag','sample_code', 'sample_family',
                   'probe', 'side', 'instance', 'fname', 'relroot']
df[summary_columns].to_csv(os.path.join(datapath,'summary.csv'), index = False)

# Combine & recover information per probe

In [None]:
# read back the table of all measurements written by the previous section
summary_csv = os.path.join(datapath,'summary.csv')
df_summary = pd.read_csv(summary_csv)

In [None]:
# keep only the rows whose path mentions neither `ztop` nor `zbottom`
t = df_summary['relroot'].apply(lambda path: re.search(r'ztop|zbottom', path) is None)
df_summary = df_summary.loc[t].copy()

In [None]:
# define probe of interest
probe = 'ftir'
instance = 'before'

df_probe = df_summary.loc[(df_summary['probe']==probe) & (df_summary['instance']==instance),:].copy()
df_probe = df_probe.drop(columns=['probe','instance']).reset_index(drop=True)

# probe-specific settings (identical for every file, so read them once)
h   = probe_settings[probe]['h']
tip = probe_settings[probe]['tip']
value_cols = probe_settings[probe]['names'][2:]   # everything except x, y
# extra (non-reading) columns; ftir has none because all its columns are readings
extra_cols = [] if probe == 'ftir' else probe_settings[probe]['names'][h:]

# output column -> column of the summary table it is copied from
meta_cols = {'side': 'side',
             'code': 'sample_code',
             'family': 'sample_family',
             'tag': 'sample_tag',
             'sub_tag': 'subsample_tag'}

pool  = []
lines = []

for _, row in df_probe.iterrows():
    root = row['relroot']
    print(root)
    # create a _postprocessed directory in each sample to store the corresponding data
    postprocesspath = os.path.join(datapath, row['sample_tag'], '_postprocessed')
    os.makedirs(postprocesspath, exist_ok=True)

    # generate fname for files per sample (missing sub-sample / side are dropped)
    fname = '_'.join([probe, instance, str(row['subsample_tag']), str(row['side'])])
    fname = fname.replace('_nan','').replace('_None','')

    # load the probe data (the cleaned table is also saved per sample)
    fpath = os.path.join(datapath, root)
    dperm = read_data(probe, fpath, reset_offset=True,
                      save=True, savepath=postprocesspath, savename=fname+'.csv')

    if len(dperm) == 0:
        # nothing usable in this file
        continue

    # ---- pool data (we will assume that every point is independent) ----
    df_temp_pool = dperm.loc[:, value_cols].copy()
    for out_col, summary_col in meta_cols.items():
        df_temp_pool[out_col] = row[summary_col]
    pool.append(df_temp_pool)

    # ---- get the slices and check which is the middle one ----
    n_x = len(dperm['x'].unique())
    n_y = len(dperm['y'].unique())
    if n_x > 1 and n_y > 1:
        # 2D scan: u is the axis with the smaller extent (its distinct values
        # define the slices), v is the axis running along each slice
        slice_along = int(dperm['x'].max()/dperm['y'].max() >= 1)
        u = ['x','y'][slice_along]
        v = ['x','y'][1 - slice_along]
        slices_ini = np.sort(dperm[u].unique())
        # first slice located at or beyond half of the extent
        median = slices_ini[slices_ini >= slices_ini.max()/2][0]
    else:
        # single line: the constant coordinate is u, the varying one is v
        if n_y == 1:
            u, v = 'y', 'x'
        else:
            u, v = 'x', 'y'
        slices_ini = dperm[u].unique()
        median = slices_ini[0]

    # ---- get center data ----
    center = dperm.loc[dperm[u] == median].reset_index(drop=True)
    temp_list = [center[col] for col in [v] + value_cols]

    tip_names = [p + '_c' for p in tip]
    col_names = ['v'] + tip_names + extra_cols

    # ---- first and last slice (only when there are at least 3 slices) ----
    side_names = []
    if slices_ini.size >= 3:
        first = dperm.loc[dperm[u] == slices_ini[0]].reset_index(drop=True)
        last  = dperm.loc[dperm[u] == slices_ini[-1]].reset_index(drop=True)
        for p in tip:
            temp_list += [first[p], last[p]]
            side_names += [p + '_l', p + '_r']

    # slices may differ in length; concat pads the shorter ones with NaN
    df_temp_lines = pd.concat(temp_list, axis = 1, ignore_index=True)
    df_temp_lines.columns = col_names + side_names

    # re order slices: position, center readings, side readings, extra columns
    df_temp_lines = df_temp_lines.loc[:, ['v'] + tip_names + side_names + extra_cols]

    # add extra information (in case needed for statistical analysis)
    for out_col, summary_col in meta_cols.items():
        df_temp_lines[out_col] = row[summary_col]

    df_temp_lines.to_csv(os.path.join(postprocesspath, fname + '_lines.csv'), index = False)
    lines.append(df_temp_lines)

# concat and save
if pool:
    df_pool = pd.concat(pool, ignore_index=True)
    df_lines = pd.concat(lines, ignore_index=True, sort=False)

    # the global output folder is not created anywhere else
    global_postprocesspath = os.path.join(datapath, '_postprocessed')
    os.makedirs(global_postprocesspath, exist_ok=True)
    df_pool.to_csv(os.path.join(global_postprocesspath, probe + '_pool.csv'), index = False)
    df_lines.to_csv(os.path.join(global_postprocesspath, probe + '_lines.csv'), index = False)
else:
    # pd.concat raises on an empty list; report the situation instead
    warnings.warn('no usable ' + probe + ' / ' + instance + ' measurements found; nothing saved')
    df_pool = pd.DataFrame()
    df_lines = pd.DataFrame()