In [3]:
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path

# define the current path (notebooks in lab_utils)
currpath = os.getcwd()

# labutilspath is the directory two levels above the current working directory
labutilspath = str(Path(currpath).parents[1])

# register it on sys.path only once, so re-running this cell does not keep
# appending duplicate entries
if labutilspath not in sys.path:
    sys.path.append(labutilspath)

# import the autoscan routines
from autoscan import autoscan

In [4]:
# --- data locations ---------------------------------------------------------
# root directory of the characterization data
basepath = '/home/nuburu/sandbox/data/characterization/'

# sub-directory of basepath that holds the autoscan data
asdatapath = 'autoscan'

# full path to the autoscan data (basepath + asdatapath)
datapath = os.path.join(
    basepath,
    asdatapath,
)

# --- post-processing helper -------------------------------------------------
# build the autoscan postprocess object and switch its debug flag off
pp = autoscan.postprocess(labutilspath=labutilspath)
pp.debug = False

In [5]:
# load the summary dataframe with all measurements
summary = pd.read_csv(os.path.join(datapath, 'summary.csv'))

# remove ztop and zbottom
# (disabled; NOTE(review): `re` is not imported in the visible cells, so it
#  would have to be imported before re-enabling these two lines)
# t = summary['relroot'].apply(lambda x: len(re.findall(r'ztop|zbottom', x))==0)
# summary = summary.loc[t,:].copy()

# shorten the column names
summary = summary.rename(columns = pp.info_columns_short)

# set side to a numeric value (this makes it easy to save data and avoids
# performance hits due to dtype mismatches).
# The result is assigned back to the column instead of calling
# `summary.side.fillna(..., inplace=True)`: an in-place call on a selected
# column is chained assignment, which raises a FutureWarning on pandas 2.x and
# does not modify `summary` at all when Copy-on-Write is enabled.
summary['side'] = summary['side'].fillna('a').apply(pp._val_replace)

# set s0 as the default subsample when none is defined
# (assigned back for the same reason as `side` above)
summary['subtag'] = summary['subtag'].replace(np.nan, 's0')

# sort the rows by these columns, in this specific order
sortcols = ['tag','subtag', 'instance', 'side', 'probe']
summary = summary.sort_values(by = sortcols, ignore_index = True)

# file description table: indexed by family, code and every sort column except
# 'probe'; keeps only the 'probe' and 'relroot' columns
fdesc = summary.set_index(['family', 'code'] + sortcols[:-1]).loc[:, ['probe', 'relroot']].copy()
fdesc = pp._fix_repeated_probes(fdesc, debug = False)

# rows 0, 1 and 3 of describe() are shown (count, unique, freq for object columns)
print('\n####### summary #######\n', fdesc.describe().iloc[[0, 1, 3], :], '\n', sep = '\n')
# fdesc.info(verbose = False)

# number of distinct index entries, i.e. distinct samples
N = fdesc.index.nunique(dropna = False)

print('number of unique samples: %d' % (N))

[]
all good

####### summary #######

       probe relroot
count    749     749
unique     4     749
freq     222       1


number of unique samples: 249


In [6]:
# Load the data of every unique sample listed in fdesc and stack it into one
# dataframe. Samples that fail to load are reported and skipped; the 'loaded'
# column of fdesc records which ones succeeded.
dfl = []
db = pp.empty_dataframe()
key_names = fdesc.index.names + ['m']
fdesc['loaded'] = False

for k, r in enumerate(fdesc.index.unique()):
    # all rows of fdesc that belong to this sample
    ftest = fdesc.loc[[r], :]
    # progress line, rewritten in place through the trailing carriage return
    print("loaded: %2.2f" % (100 * k / N), r, sep = '\t', end = '\r')
    try:
        sample_df = pp.read_sample_data(ftest, datapath = datapath)
    except Exception as err:
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
        # and SystemExit still stop the loop; the error itself is reported so
        # the failure can be diagnosed
        print('\n\ncheck ', fdesc.loc[[r], 'relroot'])
        print('reason: %r' % (err,))
    else:
        dfl.append(sample_df)
        fdesc.loc[[r], 'loaded'] = True

# pd.concat raises an uninformative "No objects to concatenate" on an empty
# list; fail with an explicit message instead (same exception type)
if not dfl:
    raise ValueError('no sample could be loaded from %s' % datapath)
df = pd.concat(dfl)

df.describe()

loaded: 99.60	('sandstone', 'sg', 'wsg_006', 'plugs', 'before', 4))', 4)

Unnamed: 0,x,y,l_1,l_2,l_3,l_4,l_5,l_6,l_7,l_8,...,l_1749,l_1750,l_1751,l_1752,e_star,perm,vp_0,vs_0,vp_1,vs_1
count,69858.0,69858.0,45109.0,45109.0,45109.0,45109.0,45109.0,45109.0,45109.0,45109.0,...,45109.0,45109.0,45109.0,45109.0,27170.0,41050.0,42187.0,42187.0,24659.0,24659.0
mean,116.433057,61.07974,2.206719,2.288839,2.383208,2.454404,2.469428,2.452037,2.432671,2.417937,...,2.138742,2.138171,2.137363,2.136565,19.131374,1262.674,4017.648283,2617.98556,6247.876,26867.86
std,133.272734,61.559122,0.520469,0.515309,0.510911,0.51078,0.515536,0.522515,0.529164,0.536344,...,0.604827,0.604586,0.603905,0.603592,11.944766,49054.13,1883.492838,4019.730304,54564.77,269414.3
min,0.0,0.0,0.70306,0.84283,0.85232,0.85335,0.8551,0.85662,0.85881,0.856,...,1.42681,1.42868,1.42926,1.42761,0.486411,0.469854,330.519,-89552.238806,-1217391.0,-3181818.0
25%,27.0,12.0,2.01393,2.09413,2.18302,2.25204,2.26835,2.24806,2.22156,2.19704,...,1.72795,1.72779,1.72742,1.72716,12.4645,1.765512,3180.806107,1849.201984,2794.597,1852.06
50%,70.0,40.0,2.24698,2.3351,2.43431,2.50897,2.52718,2.50831,2.48264,2.45721,...,1.98054,1.98013,1.97912,1.97839,15.6127,2.18199,3824.614124,2486.457686,4542.849,2585.92
75%,150.0,100.0,2.43029,2.50955,2.60665,2.68861,2.71014,2.69606,2.68284,2.67309,...,2.3013,2.30121,2.30015,2.29914,22.3351,60.2787,4741.556924,2819.416381,4913.804,2780.941
max,782.0,500.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,119.882,5307500.0,69953.364424,7526.477071,4468085.0,35000000.0


In [11]:
# # This is needed because index level 5 (side) holds a mix of str and int values. Ideally this should be fixed from the start, in summary.
# df.reset_index(drop = False, inplace = True)
# df.side = df.side.apply(val_replace)
# df.set_index(keys = key_names, inplace = True)

ftir
perm


In [7]:
# Save the data to a single HDF5 file.
# mode='w' on the first write overwrites any file already at that location;
# mode='a' on the second write adds the description to the same file.
savefile = os.path.join(datapath, 'autoscan.h5')

df.to_hdf(
    savefile,
    key='data',
    mode='w',
    data_columns=True,
    complevel=9,
)
fdesc.to_hdf(
    savefile,
    key='description',
    mode='a',
)