In [10]:
import os, sys, re
import numpy as np
import pandas as pd
from pathlib import Path
from functools import reduce
from pandas import HDFStore,DataFrame
# define the current path (notebooks in lab_utils)
currpath = os.getcwd()
labutilspath = str(Path(currpath).parents[1])
sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

def duplicated_varnames(df):
    """Return a dict of all variable names that 
    are duplicated in a given dataframe.
    https://stackoverflow.com/questions/26226343/pandas-concat-yields-valueerror-plan-shapes-are-not-aligned
    """
    repeat_dict = {}
    var_list = list(df) # list of varnames as strings
    for varname in var_list:
        # make a list of all instances of that varname
        test_list = [v for v in var_list if v == varname] 
        # if more than one instance, report duplications in repeat_dict
        if len(test_list) > 1: 
            repeat_dict[varname] = len(test_list)
    return repeat_dict

In [11]:
# define paths
basepath = '/home/urlab/sandbox/data/characterization/'

# set the asdatapath accordingly (where is the atuoscan data inside basedatapath?)
asdatapath   = 'autoscan'

# set datapath
datapath = os.path.join(basepath, asdatapath)

pp = autoscan.postprocess(labutilspath = labutilspath)
pp.debug = False

In [20]:
# load the summary dataframe with all measurement
summary = pd.read_csv(os.path.join(datapath,'summary.csv'))
# remove ztop and zbottom
t = summary['relroot'].apply(lambda x: len(re.findall(r'ztop|zbottom', x))==0)
summary = summary.loc[t,:].copy()

# sort
sortcols = ['sample_tag','subsample_tag', 'side', 'instance', 'probe']
summary.sort_values(by = sortcols, ignore_index = True, inplace = True)

# print summary head 
summary.head(-5)

Unnamed: 0,sample_tag,subsample_tag,sample_code,sample_family,probe,side,instance,fname,relroot
0,ah_001,s1,ah,carbonate,ftir,,after,ftir_after.csv,ah_001/subsamples/s1/processed/ftir_after.csv
1,ah_001,s1,ah,carbonate,perm,,after,perm_after.csv,ah_001/subsamples/s1/processed/perm_after.csv
2,ah_001,s1,ah,carbonate,vel,,after,vel_after.csv,ah_001/subsamples/s1/processed/vel_after.csv
3,ah_001,s1,ah,carbonate,ftir,,before,ftir_before.csv,ah_001/subsamples/s1/processed/ftir_before.csv
4,ah_001,s1,ah,carbonate,perm,,before,perm_before.csv,ah_001/subsamples/s1/processed/perm_before.csv
...,...,...,...,...,...,...,...,...,...
637,wsg_003,,sg,sandstone,vel,,before,vel_before.csv,wsg_003/subsamples/S2/processed/vel_before.csv
638,wsg_003,,sg,sandstone,vel,,before,vel_before.csv,wsg_003/subsamples/S3/processed/vel_before.csv
639,wsg_003,,sg,sandstone,vel,,before,vel_before.csv,wsg_003/subsamples/S4/processed/vel_before.csv
640,wsg_003,,sg,sandstone,vel,,before,vel_before.csv,wsg_003/subsamples/S5/processed/vel_before.csv


In [6]:
# get some info for a sample
instance = 'before'
tag = 'wsg_004'
sample_info = pp._subset_info(summary, instance = instance, sample_tag = tag)
sample_info.head()

Unnamed: 0,subsample_tag,sample_code,sample_family,probe,side,fname,relroot
0,plugs,sg,sandstone,ftir,1.0,ftir_before_1.csv,wsg_004/subsamples/plugs/processed/ftir_before...
1,plugs,sg,sandstone,ftir,2.0,ftir_before_2.csv,wsg_004/subsamples/plugs/processed/ftir_before...
2,plugs,sg,sandstone,ftir,3.0,ftir_before_3.csv,wsg_004/subsamples/plugs/processed/ftir_before...
3,plugs,sg,sandstone,ftir,4.0,ftir_before_4.csv,wsg_004/subsamples/plugs/processed/ftir_before...
4,plugs,sg,sandstone,perm,,perm_before.csv,wsg_004/subsamples/plugs/processed/perm_before...


In [None]:
# dfs = []
# for s in sample_info.itertuples(index=False):
#     relpath = getattr(s, 'relroot')
#     probe   = getattr(s, 'probe')
#     data = pp.read_data(os.path.join(datapath, relpath), probe = probe)
#     h = pp.probe_settings[probe]['h']
#     data = data.iloc[:,:h]
#     if probe == 'vel':
#         for x in data.angle.unique():
#             dfs.append(data.loc[data['angle'] == x,:].copy().drop(columns = 'angle'))
#     else:
#         dfs.append(data)

# df_merged = reduce(lambda left,right: pd.merge(left,right,on=['x','y'], how ='inner'), dfs)
# df_merged = pp._enforce_float(df_merged)
# df_merged['tag'] = tag
# df_merged['instance'] = instance

In [None]:
save = False
fullset = []
problem = {}
problem_merged = []
ns = 0
for instace in summary.instance.unique():
    for tag in summary['sample_tag'].unique():
        s1 = pp._subset_info(summary, instance = instance, sample_tag = tag)
        if len(s1)>0:
            family = s1['sample_family'].unique()[0]
            code   = s1['sample_code'].unique()[0]
            for subtag in s1['subsample_tag'].unique():
                s2 = pp._subset_info(s1, subsample_tag = subtag, sample_code = code, sample_family = family)
                for kside, side in enumerate(s2['side'].unique()):
                    s3 = pp._subset_info(s2, side = side)
                    probes  = s3['probe'].unique()
                    nprobes = len(probes)
                    if nprobes>=1 and ('ftir' in probes):
                        dfs = []
                        for meas in s3.itertuples(index=False):
                            relpath  = getattr(meas, 'relroot')
                            filepath = os.path.join(datapath, relpath)
                            probe    = getattr(meas, 'probe')
                            
                            data     = pp.read_data(filepath, probe = probe)
                            
                            h    = pp.probe_settings[probe]['h']
                            data = data.iloc[:,:h]

                            if probe == 'vel':
                                for k,x in enumerate(data.angle.unique()):
                                    colnames = ['vp_'+str(k), 'vs_'+str(k)]
                                    data_temp = data.loc[data['angle'] == x,:].copy().drop(columns = 'angle')
                                    data_temp.columns = ['x','y'] + colnames
                                    dfs.append(data_temp)
                                    del data_temp
                            else:
                                dfs.append(data)

                        df_merged = reduce(lambda left,right: pd.merge(left,right,on=['x','y'], how ='inner'), dfs)
                        df_merged = pp._enforce_float(df_merged)
                        if df_merged.shape[0]>0 and len(df_merged.iloc[:,1754:].columns.values)>=0:
                            ns += df_merged.shape[0]
                            df_merged['family']   = family
                            df_merged['code']     = code
                            df_merged['tag']      = tag
                            df_merged['subtag']   = subtag
                            df_merged['instance'] = instance
                            df_merged['side']     = kside
                            
                            if save:
                                outpath = pp._set_outpath(datapath,tag)
                                outname = pp._set_outfilename(tag, subtag, side, instance, 'fullset')
                                pp.save_data(df_merged, savepath = outpath,  savename = outname)
                        
                            fullset.append(df_merged.iloc[:,2:])

In [None]:
# check if there are any repeated variables
test = [print(k, dt.loc[0,['tag', 'subtag','instance', 'side']], sep = '\n') for k, dt in enumerate(fullset) if len(duplicated_varnames(dt).items())>0]
if len(test)==0:
    print('all feature names are unique')

In [None]:
compiledinfocols = ['family', 'code', 'tag', 'subtag', 'side', 'instance']
compileddatacols = pp.probe_settings['ftir']['names'][2:] + ['perm', 'vp_0', 'vs_0', 'e_star']  
compiledfullcols = compiledinfocols + compileddatacols

dfc = pd.concat(fullset, sort = False, axis = 0, ignore_index=True, join = 'outer')
dfc = dfc.loc[:,compiledfullcols].copy()
dfc.loc[:,compileddatacols] = dfc.loc[:,compileddatacols].apply(pd.to_numeric,errors='coerce').copy()

In [None]:
dfc.instance.unique()
# savefile = os.path.join(datapath, 'as_dataset.h5')
# dfc.to_hdf(savefile, key = 'data', mode = 'w')

In [None]:
# import deepdish as dp
# dp.io.save(os.path.join(datapath, 'compiled.h5'), dfc)
# dp.io.save(os.path.join(datapath, 'data.h5'), dfc.loc[:, compileddatacols])
# dp.io.save(os.path.join(datapath, 'descriptions.h5'), dfc.loc[:, compiledinfocols])

dfc.to_hdf(os.path.join(datapath, 'autoscan.h5'), key = 'df', mode = 'w', data_columns = True)

In [None]:
# dp.io.save(os.path.join(datapath, 'descriptions.h5'), dfc.loc[:, compiledinfocols])

In [None]:
# dp.io.save(os.path.join(datapath, 'compiled.h5'), dfc)

In [None]:
pp._subset_info(summary, instance = 'after', sample_tag = 'ah_001')

In [13]:
for k, s in summary.iterrows():
    df = []
    h  = pp.probe_settings[s.probe]['h']
    dp = pp.read_data(os.path.join(datapath, s.relroot), probe = s.probe).iloc[:, :h]
    
    if s.probe == 'vel':
        for k,x in enumerate(dp.angle.unique()):
            colnames = ['vp_'+str(k), 'vs_'+str(k)]
            data_temp = dp.loc[dp['angle'] == x,:].copy().drop(columns = 'angle')
            data_temp.columns = ['x','y'] + colnames
            df.append(data_temp)
            del data_temp
    else:
        df.append(dp)

In [None]:
compiledinfocols = ['family', 'code', 'tag', 'subtag', 'side', 'instance']
compileddatacols = ['x', 'y'] + pp.probe_settings['ftir']['names'][2:] + ['perm', 'vp_0', 'vs_0', 'e_star']  
compiledfullcols = compiledinfocols + compileddatacols

In [None]:
db = pd.DataFrame(columns = compiledfullcols)

In [96]:
summary.replace(np.nan, '', inplace = True)
sold = summary.loc[0, sortcols[:-1]]
df = []
xy_old = [0,0,0,0]
for k, s in summary.iterrows():
    snew = s[sortcols[:-1]]
    h  = pp.probe_settings[s.probe]['h']
    dp = pp.read_data(os.path.join(datapath, s.relroot), probe = s.probe).iloc[:, :h]
    xy = [dp.x.min(), dp.x.max(), dp.y.min(), dp.y.max()]
    if s.probe == 'vel':
        for k,x in enumerate(dp.angle.unique()):
            colnames = ['vp_'+str(k), 'vs_'+str(k)]
            data_temp = dp.loc[dp['angle'] == x,:].copy().drop(columns = 'angle')
            data_temp.columns = ['x','y'] + colnames
            df.append(data_temp)
            del data_temp
    else:
        df.append(dp)
    
    if not (snew == sold).all():
#         print(s.values[:-2])
#         print(xy)
#         print((xy == xy_old))
#         xy_old = xy
        print('\n' + s.sample_tag )
        sold = snew
        xy_merged = reduce(lambda left,right: pd.merge(left,right,on=['x','y'], how ='inner'), df)
        nn_merged = reduce(lambda left,right: pd.merge(left,right, how ='inner'), df)
        xy_merged = pp._enforce_float(xy_merged)
        nn_merged = pp._enforce_float(nn_merged)
        xy_merged = pd.concat([db, xy_merged], how = 'outer')
        nn_merged = pd.concat([db, nn_merged], how = 'outer')
        print(xy_merged.shape, nn_merged.shape, sep = '\t')
        df = []
        


ah_001


TypeError: concat() got an unexpected keyword argument 'how'

In [93]:
xy_merged.shape

(42, 1759)

In [95]:
xy_merged

Unnamed: 0,x,y,perm,vp_0,vs_0,vp_1,vs_1,l_1,l_2,l_3,...,l_1743,l_1744,l_1745,l_1746,l_1747,l_1748,l_1749,l_1750,l_1751,l_1752
0,0.0,0.0,1.83072,5213.829061,4984.098353,5392.289027,4456.800866,1.85128,1.95237,2.08853,...,1.62411,1.62049,1.61629,1.61235,1.61101,1.61206,1.61238,1.60983,1.60636,1.60491
1,0.0,25.0,1.78031,5216.743262,4978.308797,5418.935308,4436.979051,2.0684,2.19317,2.30594,...,1.59696,1.59488,1.59261,1.58812,1.584,1.58293,1.58359,1.58392,1.58219,1.57918
2,0.0,50.0,1.82352,5219.660722,4972.532677,5445.846246,4417.332772,2.0424,2.17066,2.29367,...,1.58864,1.58685,1.58395,1.58058,1.57828,1.57737,1.5766,1.5757,1.57476,1.57253
3,0.0,75.0,1.79223,5222.581447,4966.769944,5472.954483,4397.859708,2.1144,2.22543,2.31573,...,1.59402,1.59013,1.58808,1.58685,1.5848,1.5829,1.5819,1.58055,1.57824,1.57643
4,0.0,100.0,1.8512,5225.44043,4961.020553,5342.288026,4378.557578,2.28521,2.36577,2.40584,...,1.61062,1.60678,1.60429,1.60322,1.60061,1.59835,1.59945,1.60078,1.59969,1.59796
5,0.0,125.0,1.79875,5228.367629,4955.284457,5342.288026,4359.424141,1.82551,1.91748,1.99916,...,1.62571,1.62287,1.62055,1.61826,1.61516,1.61241,1.61109,1.61135,1.61093,1.60866
6,25.0,0.0,1.82669,5470.032039,4351.700271,5410.418921,4339.515421,2.02195,2.23011,2.34953,...,1.59312,1.58962,1.5877,1.58546,1.58159,1.57968,1.58107,1.58239,1.58089,1.57755
7,25.0,25.0,1.78959,5429.653666,4315.304948,5510.870849,4494.766807,1.86303,1.92157,2.01884,...,1.58697,1.58209,1.57891,1.57792,1.57666,1.57491,1.57398,1.57297,1.57075,1.56881
8,25.0,50.0,1.76616,5389.936219,4750.593824,5429.443094,4408.985933,2.12032,2.38714,2.16762,...,1.60327,1.59895,1.59703,1.59495,1.59074,1.58951,1.59043,1.58827,1.58529,1.58427
9,25.0,75.0,1.82039,5404.709819,4242.638517,5350.386629,4303.014159,2.16046,2.20069,2.32882,...,1.57641,1.57136,1.56767,1.56476,1.56276,1.56235,1.56167,1.56022,1.55913,1.55792


In [61]:
# h  = pp.probe_settings[s.probe]['h']
# pp.read_data(os.path.join(datapath, s.relroot), probe = s.probe).iloc[:, :h]

[]