In [None]:
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path

# Locate the lab_utils root as the grandparent of the current working
# directory (assumes the notebook is run from a folder two levels below
# lab_utils -- TODO confirm if the notebook is ever moved).
currpath = os.getcwd()
labutilspath = str(Path(currpath).parents[1])

# Make the lab_utils packages importable. The membership check keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
if labutilspath not in sys.path:
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

In [None]:
# Root folder that holds the characterization data.
basepath = '/home/nuburu/sandbox/data/characterization/'

# Name of the sub-folder of basepath where the autoscan data is stored.
asdatapath = 'autoscan'

# Full path to the autoscan data folder.
datapath = os.path.join(basepath, asdatapath)

# Post-processing helper from the autoscan routines, with its debug flag disabled.
pp = autoscan.postprocess(labutilspath=labutilspath)
pp.debug = False

In [None]:
# Load the summary table listing every measurement found under datapath.
summary = pd.read_csv(os.path.join(datapath, 'summary.csv'))

# Optional filter (disabled): drop the ztop and zbottom measurements.
# NOTE: enabling it also requires `import re` in the imports cell.
# t = summary['relroot'].apply(lambda x: len(re.findall(r'ztop|zbottom', x))==0)
# summary = summary.loc[t,:].copy()

# Shorten the column names using the mapping held by the postprocess object.
summary = summary.rename(columns = pp.info_columns_short)

# Convert side to a numeric value (easier to save, avoids mixed-dtype
# performance hits); missing sides default to 'a' before the conversion.
# The result is assigned back to the column instead of calling
# `summary.side.fillna(..., inplace=True)`: in-place methods on a Series
# pulled from the frame are chained assignment, which warns in pandas 2.x and
# silently leaves the frame untouched under Copy-on-Write.
summary['side'] = summary['side'].fillna('a').apply(pp._val_replace)

# Use 's0' as the default subsample when none is defined (same
# assign-back pattern as above, for the same reason).
summary['subtag'] = summary['subtag'].replace(np.nan, 's0')

# Sort the rows by the identifying columns, in this specific order.
sortcols = ['tag','subtag', 'instance', 'side', 'probe']
summary = summary.sort_values(by = sortcols, ignore_index = True)

# Build the file description: one index entry per sample (everything in
# sortcols except 'probe'), keeping only the probe and relroot columns.
fdesc = summary.set_index(['family', 'code'] + sortcols[:-1]).loc[:, ['probe', 'relroot']].copy()
fdesc = pp._fix_repeated_probes(fdesc, debug = False)

# Rows 0, 1 and 3 of describe() are presumably count, unique and freq for
# these text columns -- verify if the column dtypes change.
print('\n####### summary #######\n', fdesc.describe().iloc[[0, 1, 3], :], '\n', sep = '\n')
# fdesc.info(verbose = False)

# Number of distinct samples (distinct index entries, NaN keys included).
N = fdesc.index.nunique(dropna = False)

print('number of unique samples: %d' % (N))

In [None]:
# Read the data of every sample listed in fdesc and gather it in one frame.
dfl = []                                  # one dataframe per successfully loaded sample
db = pp.empty_dataframe()
key_names = fdesc.index.names + ['m']
fdesc['loaded'] = False                   # flipped to True per sample once its data is read

for k, r in enumerate(fdesc.index.unique()):
    # all description rows belonging to the current sample
    ftest = fdesc.loc[[r], :]
    # single-line progress indicator ('\r' rewinds so the next print overwrites it)
    print("loaded: %2.2f" % (100 * k / N), r, sep = '\t', end = '\r')
    try:
        dfl.append(pp.read_sample_data(ftest, datapath = datapath))
        fdesc.loc[[r], 'loaded'] = True
    except Exception as err:
        # Best effort: report the sample and the reason, then move on.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long-running load.
        print('\n\ncheck ', fdesc.loc[[r], 'relroot'])
        print('reason: %r' % (err,))

# pd.concat raises an opaque ValueError on an empty list; fail with a
# message that points at the actual problem instead.
if not dfl:
    raise ValueError('no sample data could be loaded from %s' % datapath)
df = pd.concat(dfl)

df.describe()

In [None]:
# # This is needed when index level 5 (side) holds a mix of str and int. The better fix is to convert side at the source, in summary.
# df.reset_index(drop = False, inplace = True)
# df.side = df.side.apply(val_replace)
# df.set_index(keys = key_names, inplace = True)

In [None]:
# Save the data to a single HDF5 file inside datapath.
savefile = os.path.join(datapath, 'autoscan.h5')

# mode='w' means any file already there is overwritten.
df.to_hdf(
    savefile,
    key='data',
    mode='w',
    data_columns=True,
    complevel=9,
)

# mode='a' appends the description to the file written just above.
fdesc.to_hdf(savefile, key='description', mode='a')