In [1]:
%matplotlib inline
import os, shutil, warnings, string
import pandas as pd
import deepdish as dp
import numpy as np
import re
import fnmatch

# libs for plotting
import bokeh
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.models.tools import HoverTool
from bokeh.models import LinearColorMapper, ColorBar, BasicTicker, Select, PrintfTickFormatter
from bokeh.transform import jitter
from bokeh.sampledata.commits import data
from bokeh.palettes import brewer
import holoviews

# libs for stats
import time
from sklearn.manifold import TSNE

# render bokeh plots inline in the notebook
output_notebook()

In [2]:
# Root of the data tree that the pre-processing cell walks.
datapath = '/Volumes/extreme/repos/lab-data/laser-experiments/'

# fnmatch-style glob patterns; the pre-processing cell translates them to regexes.
folders_include = [
    'oo-nir',
]
files_exclude = [
    '.*',
    '_*',
    '*.asd',
    '*.tcl',
    '*.h5',
]
files_include = [
    '*.txt',
]

# Column labels that _check_columns() assigns when a frame's own labels do not match.
columns = np.load('./columns.npy')

In [3]:
def read_nirdata(file, process_time=True, skiprows=15):
    """Read one tab-separated spectrometer file into a DataFrame.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``.
    process_time : bool
        When True, add a ``rel_time`` column holding the cumulative sum of ``dt``.
    skiprows : int
        Number of leading lines skipped before the table starts.

    Returns
    -------
    pandas.DataFrame
        Columns are labelled with the int-cast values of the first table row
        (that row is removed from the data), plus ``dt`` and, optionally,
        ``rel_time``.
    """
    raw = pd.read_csv(file, sep='\t', skiprows=skiprows, header=None)
    dt = _get_nirdata_dt(file)

    # A two-column table is transposed; otherwise the columns labelled 0 and 1 are dropped.
    spectra = raw.T.copy() if raw.shape[1] == 2 else raw.loc[:, 2:].copy()

    # Row 0 supplies the column labels and is then removed from the data.
    spectra.columns = spectra.loc[0, :].apply(int)
    spectra = spectra.drop(index=0)

    spectra['dt'] = np.float64(dt)
    if process_time:
        spectra['rel_time'] = np.cumsum(spectra.loc[:, 'dt'].values)
    return spectra

def _get_nirdata_dt(file):
    dt = pd.read_csv(file, sep = ':', skiprows=5, nrows=1).iloc[0,1]
    return dt

def _check_columns(df, columns, ix = -3):
    if not ((df.columns[0] == 898) and (df.columns[ix]==2560)):
        df.columns = columns
    return df

## pre-process

In [None]:
# NOTE(review): these three flags are not consulted anywhere in this cell.
dryrun = True
debug  = True
coerce_columns = True

# Translate the fnmatch globs into one alternation regex per list.
# r'$.' can never match and stands in for an empty pattern list.
excludes = r'|'.join([fnmatch.translate(x) for x in files_exclude]) or r'$.'
includes = r'|'.join([fnmatch.translate(x) for x in files_include]) or r'$.'
folders  = r'|'.join([fnmatch.translate(x) for x in folders_include]) or r'$.'

datadepth = len(datapath.split('/'))
tag_keys = ['exp-tag','sample-tag','subsample-tag']
records = []  # one dict per compiled folder; turned into `dn` once after the walk

for root, dirs, files in os.walk(datapath, topdown=False):
    # Only folders whose path matches folders_include hold spectra.
    # The walk is not pruned: editing `dirs` has no effect with topdown=False,
    # and pruning from the top would stop the walk at the data root.
    if not re.search(folders, root):
        continue

    # Keep files that match files_include and are not excluded. os.walk yields
    # names in arbitrary order, so sort to make the concatenation reproducible.
    # NOTE(review): assumes lexicographic order equals acquisition order - confirm.
    files = sorted(f for f in files
                   if re.match(includes, f) and not re.match(excludes, f))
    if not files:
        print('no spectra files found in', root)
        continue

    roots = root.split('/')
    tags = {
        'exp-tag': roots[datadepth-1],
        'sample-tag': roots[datadepth],
        # the subsample name is the parent of the spectra folder
        'subsample-tag': roots[-2] if len(re.findall('subsample', root)) == 1 else '',
    }

    if len(files) > 1:
        # Several acquisitions: read each without a time axis, then build one
        # continuous rel_time over the concatenated frames.
        parts = []
        for f in files:
            fname = os.path.join(root, f)
            part = read_nirdata(fname, process_time=False)
            parts.append(_check_columns(part, columns[:-1], ix=-2))
        df = pd.concat(parts, ignore_index=True)
        df['rel_time'] = np.cumsum(df.loc[:,'dt'].values)
    else:
        fname = os.path.join(root, files[0])
        df = read_nirdata(fname)
        df = _check_columns(df, columns)

    for skey in tag_keys:
        df[skey] = tags[skey]
    print(df.shape)

    sname = os.path.join(root,'oo-nir.h5')
    dp.io.save(sname, df)
    tags['relroot'] = os.path.relpath(sname, start=datapath)
    records.append(tags)

# Build the index table in one go instead of appending row by row.
dn = pd.DataFrame(records, columns=tag_keys + ['relroot'])
dn = dn.sort_values(by=tag_keys).reset_index(drop=True)
dn.to_csv('oo-nir-files-compiled.csv')

## analytics

In [4]:
def _plot_tsne(tsne_results, mapper=None, color_values=None):
    """Scatter-plot the first two t-SNE components with bokeh and show the figure.

    Parameters
    ----------
    tsne_results : array-like of shape (n_samples, >=2)
        Embedding coordinates; an ndarray or the DataFrame returned by ``_tsne``.
    mapper : bokeh colour mapper, optional
        Transform applied to the colour field. When None, a Viridis256
        ``LinearColorMapper`` spanning ``color_values`` is built.
    color_values : array-like of length n_samples, optional
        Value used to colour each point. Defaults to the row position.
        NOTE(review): the original read ``data.index`` from the notebook
        namespace, where ``data`` is the bokeh commits sample dataset -
        confirm that row order is the intended default.

    Returns
    -------
    The bokeh figure that was shown.
    """
    coords = np.asarray(tsne_results)
    if color_values is None:
        color_values = np.arange(coords.shape[0])
    color_values = np.asarray(color_values)

    # `mapper` must be a parameter: assigning it as a local made the
    # `mapper is None` test raise UnboundLocalError on every call.
    if mapper is None:
        mapper = LinearColorMapper(palette='Viridis256',
                                   low=color_values.min(), high=color_values.max())

    p = figure(plot_width = 960, plot_height = 550)
    dtemp = pd.DataFrame({'x': coords[:,0], 'y': coords[:,1], 'c': color_values})
    p.scatter(x='x', y = 'y', source=dtemp, size=2,
              color={'field': 'c', 'transform': mapper})
    show(p)

    return p

def _tsne(df, N, n_components=2, save=True, outdir='./', savename='tsne-out', save_reduced_data=True):
    """Run t-SNE on a random sample of at most ``N`` rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one observation per row.
    N : int
        Maximum number of rows to sample; capped at ``df.shape[0]``.
    n_components : int
        Dimension of the embedding.
    save : bool
        When True, write the embedding to ``<outdir>/tsne-results-<savename>.h5``.
    outdir : str
        Output directory; created if it does not exist.
    savename : str
        Stem shared by the output files.
    save_reduced_data : bool
        When saving, also write the sampled rows to
        ``<outdir>/tsne-data-<savename>.h5``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The embedding (default integer index and columns) and the sampled rows.
    """
    N = min(N, df.shape[0])
    df = df.sample(N).copy()

    time_start = time.time()
    tsne = TSNE(n_components=n_components, verbose=1, perplexity=40, n_iter=1000)
    tsne_results = tsne.fit_transform(df.values)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

    out = pd.DataFrame(tsne_results)

    if save:
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        dtout = {'tsne-results': out}
        if save_reduced_data:
            dtout['tsne-data'] = df
        for sfx, payload in dtout.items():
            # Build each file name from the untouched `savename`; reassigning it
            # in the loop made the second file
            # 'tsne-data-tsne-results-<savename>.h5.h5'.
            sname = os.path.join(outdir, sfx + '-' + savename + '.h5')
            dp.io.save(sname, payload)

    return out, df

def _gen_mapper(df):
    """Build a Viridis256 LinearColorMapper spanning the overall min and max of ``df``."""
    overall_max = df.max().max()
    overall_min = df.min().min()
    return LinearColorMapper(palette='Viridis256', low=overall_min, high=overall_max)

In [5]:
# Index of compiled spectra files; missing tags (read as NaN) become empty strings.
descriptions = pd.read_csv('/sandbox/dev/lab-utils/oo-nir/oo-nir-files-compiled.csv').fillna('')

In [18]:
# Embed up to 10000 spectra per compiled file with a 3-component t-SNE and
# collect the tagged embeddings into one frame.
tag_keys = ['exp-tag','sample-tag','subsample-tag']
dfs = []
for i in range(0,descriptions.shape[0]):
    # Join only the non-empty tags. Slicing off the last character removed the
    # trailing '-' of an empty subsample tag but also cut a non-empty one short.
    savename = '-'.join(t for t in descriptions.loc[i, tag_keys].values if t)
    r = descriptions.loc[i,'relroot']
    fname = os.path.join(datapath,r)
    df = dp.io.load(fname)
    # Drop the five trailing non-spectral columns added during pre-processing
    # (dt, rel_time and the three tag columns).
    df = df.iloc[:,:-5].copy()
    tsne_results = _tsne(df, 10000, save=True, n_components=3, outdir='./_out', savename=savename)[0]
    for key in tag_keys:
        tsne_results[key] = descriptions.loc[i,key]
    dfs.append(tsne_results)
    print(savename, fname, sep='\n')

df = pd.concat(dfs,ignore_index=True)
dp.io.save('./_out/tsne-10000.h5',df)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.292s...
[t-SNE] Computed neighbors for 10000 samples in 48.566s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 39.470736
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.433319
[t-SNE] Error after 1000 iterations: 2.271963
t-SNE done! Time elapsed: 550.

[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 53.542236
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.737152
[t-SNE] Error after 1000 iterations: 0.945960
t-SNE done! Time elapsed: 415.80848002433777 seconds
perforation-SH-00001
/Volumes/extreme/repos/lab-data/laser-experiments/perforation/SH-00001/oo-nir/oo-nir.h5
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.310s...
[t-SNE] Computed neighbors for 10000 samples in 21.798s...
[t

[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.440483
[t-SNE] Error after 1000 iterations: 1.353738
t-SNE done! Time elapsed: 384.871062040329 seconds
underwater-perf-LSsy-00013
/Volumes/extreme/repos/lab-data/laser-experiments/underwater-perf/LSsy-00013/oo-nir/oo-nir.h5


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  filters=filters, idtable=idtable)


In [6]:
# Collects one tagged t-SNE embedding per file that has laser-on spectra.
dfs = list()

# Every file starts flagged as laser-on; the loop below clears the flag for
# files without any row above the intensity threshold.
descriptions = descriptions.assign(**{'laser-on': True})

def _log10(x):
    out = np.zeros(x.shape)
    out[x>0] = np.log10(x[x>0])
    out[x<=0] = 0
    return out

# Rows whose mean intensity exceeds this value are treated as "laser on".
LASER_ON_THRESHOLD = 100
tag_keys = ['exp-tag','sample-tag','subsample-tag']

for i in range(0,descriptions.shape[0]):
    # Join only the non-empty tags; slicing off the last character cut a
    # non-empty subsample tag short.
    savename = '-'.join(t for t in descriptions.loc[i, tag_keys].values if t)
    savename = 'reduced-avg-' + savename

    r = descriptions.loc[i,'relroot']
    fname = os.path.join(datapath,r)
    df = dp.io.load(fname)
    # Drop the five trailing non-spectral columns (dt, rel_time and the tags).
    df = df.iloc[:,:-5].copy()
    # seek the values above threshold (laser on)
    tindex = df.apply(np.average,axis=1).values > LASER_ON_THRESHOLD
    # Filter rows only: the metadata columns are already gone, so a second
    # ':-5' slice here would discard five more columns.
    df = df.iloc[tindex,:].copy()
    if df.shape[0]>0:
        # apply log10 to data
        df = df.apply(_log10).copy()

        tsne_results = _tsne(df, 10000, save=True, n_components=2, outdir='./_out', savename=savename)[0]
        for key in tag_keys:
            tsne_results[key] = descriptions.loc[i,key]
        dfs.append(tsne_results)
        print(savename, fname, sep='\n')
    else:
        descriptions.loc[i,'laser-on'] = False

# pd.concat raises on an empty list, so only combine and save when at least
# one file had laser-on spectra.
if dfs:
    df = pd.concat(dfs,ignore_index=True)
    dp.io.save('./_out/tsne-10000-reduced-avg-2.h5',df)
else:
    print('no file had spectra above the laser-on threshold; nothing saved')

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.213s...
[t-SNE] Computed neighbors for 10000 samples in 49.878s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.074029
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.417229
[t-SNE] Error after 1000 iterations: 2.444390
t-SNE done! Time elapsed: 336.6

[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.068658
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.838799
[t-SNE] Error after 1000 iterations: 1.500588
t-SNE done! Time elapsed: 332.83507108688354 seconds
reduced-avg-scale-removal-ScFeS-00001
/Volumes/extreme/repos/lab-data/laser-experiments/scale-removal/ScFeS-00001/oo-nir/oo-nir.h5
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 8546 samples in 0.181s...
[t-SNE] Computed neighbors for 8546 samples in 57.320s...
[t-SNE] Computed conditional probabilities for sam

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  filters=filters, idtable=idtable)


In [39]:
# Scratch check; relies on `fname` left over from the loop cell above.
df = dp.io.load(fname)
# Drop the five trailing non-spectral columns (dt, rel_time and the tags).
df = df.iloc[:,:-5].copy()
# seek the values above threshold (laser on)
tindex = df.apply(np.average,axis=1).values>50
# Filter rows only; a second ':-5' slice would discard five more columns.
df = df.iloc[tindex,:].copy()

In [43]:
# True when at least one row passed the threshold filter in the previous cell.
len(df.index) > 0

False

In [36]:
# Scratch cell: clears the laser-on flag for description row `i`.
# `i` is whatever value an earlier loop cell left behind in the kernel.
descriptions.loc[i,'laser-on'] = False

In [None]:
# Scratch cell: copy the three tags of description row `i` onto every row of `df`.
# Both `i` and `df` are leftovers from earlier cells.
for key in ('exp-tag', 'sample-tag', 'subsample-tag'):
    df[key] = descriptions.at[i, key]

In [15]:
# Scratch run on 100 sampled rows; keep the embedding and discard the sampled data.
# `df` and `savename` are leftovers from earlier cells.
tsne_results = _tsne(df, 100, save=True, n_components=3, outdir='./_out', savename=savename)[0]

[t-SNE] Computing 99 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 203.886620
[t-SNE] KL divergence after 250 iterations with early exaggeration: 85.415367
[t-SNE] Error after 1000 iterations: 1.287474
t-SNE done! Time elapsed: 1.5822548866271973 seconds


In [16]:
# Scratch cell: tag the embedding with description row `i` and add it to `dfs`.
# `i`, `savename` and `fname` are leftovers from earlier cells.
for key in ('exp-tag', 'sample-tag', 'subsample-tag'):
    tsne_results[key] = descriptions.at[i, key]
dfs += [tsne_results]
print(savename, fname, sep='\n')

cca-remedy-MSt-00005
/Volumes/extreme/repos/lab-data/laser-experiments/cca-remedy/MSt-00005/oo-nir/oo-nir.h5


In [17]:
# Scratch cell: display the list of collected embeddings (prints every frame in full).
dfs

[             0           1           2     exp-tag sample-tag subsample-tag
 0   -57.486706  -27.467167 -123.752319  cca-remedy  MSt-00005              
 1     5.583789   45.571819 -112.640793  cca-remedy  MSt-00005              
 2   157.966751 -120.225006  106.597809  cca-remedy  MSt-00005              
 3   -87.501167 -112.429489  -41.193211  cca-remedy  MSt-00005              
 4  -116.298653  -60.556286   90.623199  cca-remedy  MSt-00005              
 5  -147.398575  -66.437904  -68.440941  cca-remedy  MSt-00005              
 6   -75.179298  141.438416 -106.696106  cca-remedy  MSt-00005              
 7    47.320713  -81.261490   84.402443  cca-remedy  MSt-00005              
 8    14.794386  -11.858595 -200.777603  cca-remedy  MSt-00005              
 9    45.218800  -81.464317  -29.106478  cca-remedy  MSt-00005              
 10  124.375504 -133.048721    7.193214  cca-remedy  MSt-00005              
 11  143.568130  -18.430649 -143.529221  cca-remedy  MSt-00005              