## Setup

In [None]:
# Show the working directory: the relative 'output_pklz' paths used below are resolved against it.
pwd

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth',500)
%matplotlib inline

# dir = {'900':'/data/HCP/HCP_900/s3/hcp/**/*','1200': '/data/HCP/HCP_1200/**/*'}
dirs = list(Path('/data/HCP/HCP_900/s3/hcp').iterdir())
# cols = ['perms','links','user','group','size','year','time','dir']
cols = ['size','date','dir']
pklz_dir = Path('output_pklz')
if not pklz_dir.exists():
    pklz_dir.mkdir()
summary_pklz = pklz_dir.joinpath('summary.pklz')

In [None]:
def get_subdir(d_path):
    """Return the final component of ``d_path`` (its ``.name`` attribute)."""
    subdir_name = d_path.name
    return subdir_name
def get_output_path(d_path):
    """Build the pickle path for ``d_path`` inside the ``output_pklz`` directory.

    The file name is ``files_characterization_<parent name>_<subdir>.pklz``,
    where ``<parent name>`` is the name of ``d_path``'s parent directory and
    ``<subdir>`` comes from ``get_subdir(d_path)``.
    """
    file_name = 'files_characterization_{}_{}.pklz'.format(
        d_path.parent.name, get_subdir(d_path)
    )
    return Path('output_pklz') / file_name
def write_tsv_with_atimes_and_size(d_path):
    """Write a tab-separated listing of everything below ``d_path``.

    One row is written per file, directory or symlink found beneath
    ``d_path``. Each row holds the size in bytes, the last access date as
    ``YYYY-MM-DD`` in local time, and the full path. The listing is written
    to ``get_output_path(d_path)`` with its suffix replaced by ``.tsv``.

    Entries whose name starts with a dot are skipped and such directories are
    not descended into, which matches the default behaviour of the ``**/*``
    shell glob this function previously relied on.

    The tree is walked in Python rather than piping ``ls`` through ``awk`` and
    ``cut``. That keeps paths containing whitespace intact (the old pipeline
    split on whitespace and kept only the first token of the name), avoids the
    shell's argument-length limit on very large trees, and never interpolates
    the path into a shell command.
    """
    import os
    import time

    output_file = get_output_path(d_path)
    print(d_path.as_posix() + '/**/*')
    with open(output_file.with_suffix('.tsv'), 'w', encoding='utf-8') as tsv:
        for root, dirnames, filenames in os.walk(d_path):
            # Prune hidden directories in place so os.walk does not descend into them.
            dirnames[:] = sorted(name for name in dirnames if not name.startswith('.'))
            visible_files = sorted(name for name in filenames if not name.startswith('.'))
            for name in dirnames + visible_files:
                entry = os.path.join(root, name)
                try:
                    # lstat describes a symlink itself rather than its target.
                    info = os.lstat(entry)
                except OSError as err:
                    # Best effort: an entry that vanished or is unreadable is
                    # reported and skipped instead of aborting the whole listing.
                    print('skipping ' + entry + ': ' + str(err))
                    continue
                accessed = time.strftime('%Y-%m-%d', time.localtime(info.st_atime))
                tsv.write(str(info.st_size) + '\t' + accessed + '\t' + entry + '\n')


## Write pickle for every subject

In [None]:
# dir = {'test': '/data/HCP/HCP_1200/download_swarm/**/*'}
for d_path in dirs:
    output_file = get_output_path(d_path)
    # The pickle doubles as a cache marker: subjects that already have one are skipped.
    if output_file.exists():
        continue
    tsv_file = output_file.with_suffix('.tsv')
    write_tsv_with_atimes_and_size(d_path)
    # Sizes are read as int64: int32 cannot represent a file of 2 GiB or more.
    df = pd.read_csv(
        tsv_file,
        sep = '\t',
        names = cols,
        dtype = {'size': np.int64, 'date': str, 'dir': str},
    )
    df.to_pickle(output_file)
    # The TSV is only an intermediate; remove it once the pickle has been written.
    tsv_file.unlink()

## Load 50 random subjects and assess the access times in their file trees.

### Define helper functions:

In [None]:
def get_least_common_value(series):
    """Return the shortest value in ``series``, taking the first one on ties.

    Length is measured with ``len`` on each element. The lookup is positional,
    so a single element is returned even when the index contains duplicate
    labels (a label-based lookup would return a Series in that case).

    Raises ``ValueError`` if ``series`` is empty.
    """
    shortest_pos = series.map(len).values.argmin()
    return series.iloc[shortest_pos]
def get_dir_level_summary(df,level = 7):
    """Summarise ``df`` per directory at path depth ``level``.

    ``df`` must have integer-named columns ``0 .. level + 1`` (one per path
    component) plus ``date``, ``size``, ``file`` and ``parent_dir`` columns.
    Rows with no value in column ``level + 1`` are dropped, and the rest are
    grouped by columns ``0 .. level``.

    Returns a frame indexed by the group keys with these columns:
      * ``most_recent_access`` - maximum of ``date`` in the group
        (NOTE(review): presumably ISO date strings, so the maximum is the
        latest date; confirm against the loader),
      * ``size`` - sum of ``size``,
      * ``num_files`` - number of rows in the group,
      * ``parent_dir`` - shortest ``parent_dir`` value in the group,
      * ``total_size_gb`` - ``size`` divided by 1e9, rounded to 3 decimals,
      * ``tree_depth`` - the ``level`` argument.

    Raises ``KeyError`` if column ``level + 1`` is missing from ``df``.
    """
    group_keys = list(range(level + 1))
    has_deeper_component = pd.notnull(df[level + 1])
    grouped = df.loc[has_deeper_component, :].groupby(group_keys)
    summary = (
        grouped.
        # String names are used for max/sum: passing the Python builtins is
        # deprecated in recent pandas, and the strings select the same
        # group-wise reductions the builtins were mapped to.
        aggregate({'date': 'max',
                   'size': 'sum',
                   'file': len,
                   'parent_dir': get_least_common_value}).
        assign(total_size_gb = lambda frame: round(frame['size'] / 1000000000, 3)).
        rename(columns = {'date': 'most_recent_access',
                          'file': 'num_files'}).
        assign(tree_depth = level)
    )
    return summary
    
# test = pd.concat([df_sub.head(100), df_sub.head(100).file.str.split('/',expand = True)], axis = 1)
# get_dir_level_summary(test, 7)

### Create merged dataframe

In [None]:
# Sample subjects WITHOUT replacement so no subject is loaded (and counted) twice.
# The sample size is capped at the number of available directories.
# Call np.random.seed(...) beforehand if a reproducible sample is needed.
sampled_dirs = np.random.choice(dirs, size = min(50, len(dirs)), replace = False)

subject_frames = []
for d_path in sampled_dirs:
    df_sub = pd.read_pickle(get_output_path(d_path)).rename(columns = {'dir' : 'file'})
    df_sub['subject'] = d_path.name
    # Checked against the live filesystem, so a path that no longer exists reads as False.
    df_sub['is_file'] = df_sub.file.apply(lambda x: Path(x).is_file())
    # Everything before the last '/' of the path.
    df_sub['parent_dir'] = df_sub.file.apply(lambda x:'/'.join(x.split('/')[:-1]))
    subject_frames.append(df_sub)

# Concatenate once instead of growing the frame inside the loop. The result
# always replaces any df_full left over from a previous run of this cell, and
# ignore_index gives every row a unique label.
df_full = pd.concat(subject_frames, axis = 0, ignore_index = True)

In [None]:
# Append one column per '/'-separated path component (named 0, 1, 2, ...) to the merged frame.
df_split = pd.concat(
    [df_full, df_full['file'].str.split('/', expand = True)],
    axis = 'columns',
)

### Generate summary dataframe

In [None]:
# from IPython.core.debugger import Pdb; ipdb=Pdb()
# ipdb.runcall(get_dir_level_summary, df_split, 8)
# Summarise at path depth 10, save the result as a pickle, and preview the first rows.
df_summary = get_dir_level_summary(df_split, level = 10)
df_summary.to_pickle(summary_pklz)
df_summary.head()

In [None]:
# Print the summary frame's column dtypes, non-null counts and memory usage.
df_summary.info()