# Get the size of the datasets on disk

For the manuscript on the fishes, we want to know how much data we produced.
This notebook is based on a copy of `DataWrangling.ipynb` and https://github.com/habi/zmk-tooth-cohort/blob/master/ToothDataSize.ipynb

In [None]:
import platform
import os
import glob
import pandas
from tqdm import notebook

In [None]:
# This notebook is meant to run against the iee research storage, since that is
# the only place holding *all* the data. On Linux the storage is expected under
# the home directory of user 'habi', on any other platform on the 'I:' drive.
Root = (os.path.join(os.sep, 'home', 'habi', 'research-storage-iee')
        if 'Linux' in platform.system()
        else 'I:\\microCTupload')
print('We are loading all the data from %s' % Root)

In [None]:
def get_git_hash():
    '''
    Return the abbreviated hash of the commit that is currently checked out.

    Runs `git rev-parse --short --verify HEAD` against the `.git` directory
    found in the current working directory and returns whatever git wrote to
    stdout, stripped of surrounding whitespace and decoded as UTF-8.
    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    '''
    import os
    import subprocess
    # Point git explicitly at the repository in the current working directory
    git_dir = os.path.join(os.getcwd(), '.git')
    command = ['git', '--git-dir', git_dir, 'rev-parse', '--short', '--verify', 'HEAD']
    # Only stdout is captured; anything git writes to stderr is not intercepted
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    return stdout.strip().decode("utf-8")

In [None]:
# Make directory for output.
# The folder is named after the short git hash returned by get_git_hash(), so
# output produced by different commits ends up in different sub-folders of 'Output'.
OutPutDir = os.path.join(os.getcwd(), 'Output', get_git_hash())
print('We are saving all the output to %s' % OutPutDir)
# exist_ok=True keeps a re-run of this cell from failing if the folder already exists
os.makedirs(OutPutDir, exist_ok=True)

In [None]:
# Make us a dataframe for saving all that we need.
# It starts out empty; the cells below add one column at a time,
# beginning with 'LogFile', which holds one log file path per row.
Data = pandas.DataFrame()

In [None]:
# Recursively collect every '*.log' file below Root.
# The list is ordered by modification time (oldest first), not by file name.
Data['LogFile'] = sorted(
    glob.glob(os.path.join(Root, '**', '*.log'), recursive=True),
    key=os.path.getmtime,
)
print('We have %s log files to work with' % len(Data))

In [None]:
# The folder of a scan is simply the directory its log file lives in
Data['Folder'] = list(map(os.path.dirname, Data['LogFile']))

In [None]:
# Generate us some meaningful columns from the path of each log file.
# Strip the leading 'Root + separator' and split what remains into its components:
# the first component becomes the fish name, everything between it and the
# log file name (joined with '_') becomes the scan name.
relative_parts = [logfile[len(Root) + 1:].split(os.sep) for logfile in Data['LogFile']]
Data['Fish'] = [parts[0] for parts in relative_parts]
Data['Scan'] = ['_'.join(parts[1:-1]) for parts in relative_parts]

In [None]:
# How many fishes did we scan?
# Seven of the top-level names are not individual fishes (six 'buckets of fish'
# and one set of only 'teeth'), so they are subtracted from the unique count.
print('We have %s unique names in our corpus of scan'
      % (len(Data['Fish'].unique()) - 7))
print('We performed %s scans in total'
      % len(Data['Scan']))

In [None]:
# Temporarily drop some data (uncomment the two lines below to work on a small subset while testing)
# Data = Data[:3]
# print('We are currently working with a subset of %s scans' % len(Data))

In [None]:
# Print the first ten folders as a quick sanity check of the collected paths
for folder in Data['Folder'].head(10):
    print(folder)

In [None]:
# Get the projection details.
# The '*.?if' pattern matches both 'tif' *and* 'iif' files (the latter being the
# alignment projections); the file names are sorted within each folder.
Data['Projections'] = [sorted(glob.glob(os.path.join(scan_folder, '*.?if')))
                       for scan_folder in Data['Folder']]
Data['NumberOfProjections'] = list(map(len, Data['Projections']))

In [None]:
# Look up the on-disk size (in bytes) of every single projection file,
# then add those sizes up per folder
Data['ProjectionSize'] = [list(map(os.path.getsize, files)) for files in Data['Projections']]
Data['ProjectionSizeSum'] = list(map(sum, Data['ProjectionSize']))

In [None]:
# Show the projection counts and sizes for each folder
Data.loc[:, ['Folder', 'NumberOfProjections', 'ProjectionSize', 'ProjectionSizeSum']]

To get (nearly) the same size, use
````bash
find . -iname '*.?if' -print0 | du -ch --files0-from=-
````
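in a Linux console. The total differs slightly because `du` reports the disk space used, while the notebook sums the apparent file sizes; adding `--apparent-size` to `du` gives a closer match.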
The command is based on https://askubuntu.com/a/558989/759778

In [None]:
# Total size of all projections, converted from bytes to GB (1 GB = 1024^3 bytes here)
print('In total, all projections are %0.2f GB in size'
      % (Data['ProjectionSizeSum'].sum() / 1024 ** 3))

In [None]:
# Total size of all projections, converted from bytes to TB (1 TB = 1024^4 bytes here)
print('In total, all projections are %0.2f TB in size'
      % (Data['ProjectionSizeSum'].sum() / 1024 ** 4))

----

In [None]:
# Collect the reconstruction files (PNGs matching '*rec0*.png') of every folder,
# sorted by name, and count how many each folder has
Data['Reconstructions'] = [sorted(glob.glob(os.path.join(scan_folder, '*rec0*.png')))
                           for scan_folder in Data['Folder']]
Data['NumberOfReconstructions'] = list(map(len, Data['Reconstructions']))

In [None]:
# Number of reconstruction files summed over all datasets (one dataset per row of Data)
print('In total, we have %s reconstructions for all the %s datasets'
      % (Data.NumberOfReconstructions.sum(), len(Data.index)))

In [None]:
# Mean number of reconstruction files per dataset, rounded to the nearest integer
print('On average, each of the %s datasets has about %s reconstructions.'
      % (len(Data.index), int(round(Data.NumberOfReconstructions.mean()))))

In [None]:
# Report samples which have not been reconstructed yet.
# Nothing is dropped from Data here, the affected folders are only listed.
# Row selection is based on https://stackoverflow.com/a/13851602
# The column is called 'NumberOfReconstructions' (without spaces); looking up
# 'Number of reconstructions' raises a KeyError.
for folder in Data.loc[Data['NumberOfReconstructions'] == 0, 'Folder']:
    print('%s contains no PNG files, we might be currently reconstructing it' % folder)
print('We have %s folders in total' % (len(Data)))
print("Of which %s folders contain reconstructions (Data['NumberOfReconstructions']>0)" % (len(Data[Data['NumberOfReconstructions'] > 0])))

In [None]:
# Look up the on-disk size (in bytes) of every single reconstruction file,
# then add those sizes up per folder
Data['ReconstructionSize'] = [list(map(os.path.getsize, files)) for files in Data['Reconstructions']]
Data['ReconstructionSizeSum'] = list(map(sum, Data['ReconstructionSize']))

In [None]:
# Total size of all reconstructions, converted from bytes to GB (1 GB = 1024^3 bytes here)
print('In total, the reconstructions are %0.2f GB in size'
      % (Data['ReconstructionSizeSum'].sum() / 1024 ** 3))

In [None]:
# Total size of all reconstructions, converted from bytes to TB (1 TB = 1024^4 bytes here)
print('In total, the reconstructions are %0.2f TB in size'
      % (Data['ReconstructionSizeSum'].sum() / 1024 ** 4))

To get (nearly) the same size, use

````bash
find . -iname '*rec0*.png' -print0 | du -ch --files0-from=-
````

in a Linux console.