In [None]:
%matplotlib inline
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path

# Make the lab_utils package importable: this notebook is expected to be run from a
# directory two levels below the lab_utils root, so the grandparent of the working
# directory is what gets added to sys.path.
currpath = os.getcwd()
labutilspath = str(Path(currpath).parents[1])
# Only add the path once, so re-running this cell does not pile up duplicate entries.
if labutilspath not in sys.path:
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

In [None]:
# --- data location -----------------------------------------------------------
# root folder holding all the data
basepath = '/sandbox/data/'

# sub-folder of basepath that contains the autoscan data
asdatapath = 'autoscan'

# full path to the autoscan data
datapath = os.path.join(basepath, asdatapath)

# --- post-processing helper --------------------------------------------------
# build the autoscan post-processor and keep its debug flag switched off
pp = autoscan.postprocess(labutilspath=labutilspath)
pp.debug = False

In [None]:
# Load the summary table (one row per measurement file) and build `fdesc`, the
# per-sample description frame used by the loading cell below.
summary = pd.read_csv(os.path.join(datapath, 'summary.csv'))

# remove ztop and zbottom
# t = summary['relroot'].apply(lambda x: len(re.findall(r'ztop|zbottom', x))==0)
# summary = summary.loc[t,:].copy()

# shorten the column names
summary = summary.rename(columns = pp.info_columns_short)

# NOTE: the defaults below are written back with explicit column assignment.
# Calling `summary.col.fillna(..., inplace=True)` acts on a temporary Series and
# does not update `summary` once pandas Copy-on-Write is active.

# set side to a numeric value (this makes it easy to save data and avoids
# performance hits due to dtype mismatches); missing sides default to 'a'
summary['side'] = summary['side'].fillna('a')
summary['side'] = summary['side'].apply(pp._val_replace)

# set experiment to its default value instead of nan
summary['experiment'] = summary['experiment'].fillna('before')
# set s0 as the default subsample when none is defined
summary['subtag'] = summary['subtag'].fillna('s0')

# sort the rows in a specific order
sortcols = ['tag', 'subtag', 'instance', 'experiment', 'side', 'probe']
summary = summary.sort_values(by = sortcols, ignore_index = True)

# index by everything that identifies a sample (all sort keys except 'probe'),
# keeping only the probe name and the relative path of each file
fdesc = summary.set_index(['family', 'code'] + sortcols[:-1]).loc[:, ['probe', 'relroot']].copy()
fdesc = pp._fix_repeated_probes(fdesc, debug = False)

# show rows 0, 1 and 3 of describe() as a quick sanity check
print('\n####### summary #######\n', fdesc.describe().iloc[[0, 1, 3], :], '\n', sep = '\n')
# fdesc.info(verbose = False)

# number of distinct sample keys (NaN index values are counted as well)
N = fdesc.index.nunique(dropna = False)

print('number of unique samples: %d' % (N))

In [None]:
# Read the data of every unique sample listed in `fdesc` and stack the results
# into a single frame `df`. Samples that fail to load are reported and skipped;
# the 'loaded' column of `fdesc` records which ones succeeded.
dfl = []
# db = pp.empty_dataframe
key_names = fdesc.index.names + ['m']
fdesc['loaded'] = False
pp.sample_data_index_names = list(fdesc.index.names) + ['m']

# Take the unique sample keys once so the progress percentage is computed from
# the same collection that is iterated (no dependency on a value from another cell).
samples = fdesc.index.unique()
n_samples = len(samples)

for k, r in enumerate(samples):
    ftest = fdesc.loc[[r], :]
    print("loaded: %2.2f" % (100 * k / n_samples), r, sep = '\t', end = '\r')
    try:
        # the unit strings are passed as na_values so they are not read as data
        df = pp.read_sample_data(ftest, datapath = datapath,
                                 na_values = ['None', 'mm', 'cm^-1', 'cm', 'mD', 'Pa', 'm/s', 'm', 's', 'degrees'],
                                 infer_row_settings = True)
        dfl.append(df)
        del df
        fdesc.loc[[r], 'loaded'] = True
    except Exception as e:
        # best effort: report the failing sample and continue with the next one
        print(e)
        print('\n\ncheck ', fdesc.loc[[r], 'relroot'])

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not dfl:
    raise ValueError('no sample data could be loaded from %s (see the errors printed above)' % datapath)

df = pd.concat(dfl)
df.describe()

In [None]:
# Save everything to one HDF5 file. The first write uses mode='w', which overwrites
# any file already there; the second write uses mode='a' to add to that same file.
savefile = os.path.join(datapath, 'autoscan.h5')

# options shared by both writes
save_opts = dict(complevel = 9, format = 'table')

# (key, frame, mode) for each table, written in this order; the index is turned
# back into regular columns before saving
for key, frame, mode in (('data', df, 'w'), ('description', fdesc, 'a')):
    frame.reset_index(drop = False).to_hdf(savefile, key = key, mode = mode, **save_opts)

In [None]:
# sel_data = fdesc.query("tag == 'ah_001' & subtag == 's0' & instance == 'after' & probe == 'perm'")
# pp.read_sample_data(sel_data, datapath = datapath, infer_row_settings = True, 
#                     na_values = ['None', 'mm', 'cm^-1', 'cm', 'mD', 'Pa', 'm/s', 'm', 's', 'degrees'])
# pp.read_data(os.path.join(datapath, sel_data.loc[:, 'relroot'].values[0]), probe = 'perm', infer_row_settings = True)