# Get the size of the datasets on disk
For the manuscript on the fishes, we want to know how much data we produced.
This notebook is based on a copy of `DataWrangling.ipynb` and https://github.com/habi/zmk-tooth-cohort/blob/master/ToothDataSize.ipynb

In [None]:
import platform
import os
import glob
import pandas
from tqdm import notebook

In [None]:
# *This* notebook is meant to run against the iee research storage, the only place where we have *all* the data
# The location of that storage depends on the operating system we are running on
Root = (os.path.join(os.sep, 'home', 'habi', 'research-storage-iee', 'microCT')
        if 'Linux' in platform.system()
        else 'I:\\microCTupload')
print('We are loading all the data from %s' % Root)

In [None]:
# Make us a dataframe for saving all that we need
# It starts out empty, the cells below add one column after the other
Data = pandas.DataFrame()

In [None]:
# Collect the full path of *every* log file below Root
# The files come in the order os.walk delivers them, i.e. unsorted, but this is fast
Data['LogFile'] = [
    os.path.join(directory, filename)
    for directory, _, filenames in os.walk(Root)
    for filename in filenames
    if filename.endswith('.log')
]
print('We have %s log files to work with' % (len(Data)))

In [None]:
# Remember the folder in which each of the log files lives
Data['Folder'] = list(map(os.path.dirname, Data['LogFile']))

In [None]:
# We collected *all* log files, so a folder with several log files shows up several times.
# Keep only the first row of each folder, as further down we look for files in *folders*,
# and renumber the remaining rows.
Data = Data.drop_duplicates(subset=['Folder']).reset_index(drop=True)

In [None]:
# Generate us some meaningful columns from the path of the log file relative to Root
# The first path component is the fish, everything between it and the file name is the scan
RelativeParts = [logfile[len(Root) + 1:].split(os.sep) for logfile in Data['LogFile']]
Data['Fish'] = [parts[0] for parts in RelativeParts]
Data['Scan'] = ['.'.join(parts[1:-1]) for parts in RelativeParts]

In [None]:
def get_machine(logfile):
    """Get the machine we used to scan

    Reads the log file line by line and looks at every line which contains
    'Scanner' and an '='. The text between the first and the second '=' of
    such a line, stripped of surrounding whitespace, is the machine name.
    If several lines match, the last one wins.

    Parameters
    ----------
    logfile : str
        Path to the log file of a scan.

    Returns
    -------
    str or None
        The machine name, or None if no line of the file contains both
        'Scanner' and '='.
    """
    # Start with None so that a log file without scanner line does not
    # leave us with an unbound variable at the return below
    machine = None
    with open(logfile, 'r') as f:
        for line in f:
            # Lines mentioning 'Scanner' without '=' have no value to extract
            if 'Scanner' in line and '=' in line:
                machine = line.split('=')[1].strip()
    return machine

In [None]:
# Read the name of the scanner out of the log file of each folder
Data['Scanner'] = list(map(get_machine, Data['LogFile']))

In [None]:
# The iee research storage folder contains some folders with scans done by Kassandra on a SkyScan1273.
# Exclude those, since they are not part of this study, we just looked at them to help her.
# We select the rows with a boolean mask instead of dropping them from the dataframe
# while iterating over it. Rows without a scanner name never match and are thus kept.
From1273 = pandas.Series(['1273' in str(scanner) for scanner in Data['Scanner']],
                         index=Data.index, dtype=bool)
for logfile in Data.loc[From1273, 'LogFile']:
    print('Dropping %s from our dataframe' % logfile[len(Root)+1:])
# Keep all the other rows and reset dataframe index
Data = Data[~From1273].reset_index(drop=True)

In [None]:
# The iee research storage folder contains folders with scans of only teeth, done as pilot
# Exclude those, since they are not part of this study.
# We select the rows with a boolean mask instead of dropping them from the dataframe
# while iterating over it.
OnlyTeeth = pandas.Series(['Teeth' in folder for folder in Data['Folder']],
                          index=Data.index, dtype=bool)
for logfile in Data.loc[OnlyTeeth, 'LogFile']:
    print('Dropping %s from our dataframe' % logfile[len(Root)+1:])
# Keep all the other rows and reset dataframe index
Data = Data[~OnlyTeeth].reset_index(drop=True)

In [None]:
# How many fishes did we scan?
# We scanned six 'buckets of fish', so subtract those from the unique names :)
NumberOfFishNames = len(Data.Fish.unique()) - 6
print('We have %s unique fish names in our corpus of scans' % NumberOfFishNames)
# Every row of the dataframe is one folder with a log file
print('We have %s different proj and rec folders in total' % len(Data.Scan))

In [None]:
# Temporarily drop some data (uncomment for a quick test run on only the first three datasets)
# Data = Data[:3]
# print('We are currently working with a subset of %s datasets' % len(Data))

In [None]:
# Get the projection details
# Let's look for 'tif' *and* 'iif' files, which are alignment projections
# We match on the full extension, since a bare 'if' suffix also picks up other
# files ending in 'if' (e.g. '.gif'). The match ignores upper/lower case.
# NOTE(review): os.walk also descends into subfolders. If one folder in Data is nested
# inside another one, its files are listed for both of them - verify the folder layout.
ProjectionExtensions = ('.tif', '.iif')
# Get the file names of the projections
Data['Projections'] = [[os.path.join(root, name)
                        for root, dirs, files in os.walk(f)
                        for name in files
                        if name.lower().endswith(ProjectionExtensions)] for f in Data['Folder']]
# Count how many files we have
Data['NumberOfProjections'] = [len(r) for r in Data.Projections]

In [None]:
# Ask the file system for the size of every single projection
Data['ProjectionSize'] = [list(map(os.path.getsize, projections)) for projections in Data['Projections']]
# Add up the sizes of all the projections of each folder
Data['ProjectionSizeSum'] = [sum(sizes) for sizes in Data['ProjectionSize']]

In [None]:
# Scale the total by 1024 three times for the value we print as GB
print('In total, all projections are %0.2f GB in size' % (Data['ProjectionSizeSum'].sum() / 1024 ** 3))

In [None]:
# Scale the total by 1024 four times for the value we print as TB
print('In total, all projections are %0.1f TB in size' % (Data['ProjectionSizeSum'].sum() / 1024 ** 4))

In [None]:
# Show the projection details of the first few folders
ProjectionColumns = ['Folder', 'NumberOfProjections', 'ProjectionSize', 'ProjectionSizeSum']
Data[ProjectionColumns].head()

To get (nearly) the same size, use
````bash
find . -iname '*.?if' -print0 | du -ch --files0-from=-
````
in a Linux console.
The command is based on https://askubuntu.com/a/558989/759778

----

In [None]:
# Get the file names of the reconstructions
# These are all the PNG files with 'rec0' in their name, in each folder and everything below it
Data['Reconstructions'] = [
    [os.path.join(directory, filename)
     for directory, _, filenames in os.walk(folder)
     for filename in filenames
     if 'rec0' in filename and filename.endswith('.png')]
    for folder in Data['Folder']
]
# Count how many files we have
Data['NumberOfReconstructions'] = [len(reconstructions) for reconstructions in Data['Reconstructions']]

In [None]:
# Add up the reconstructions of all folders
TotalReconstructions = Data['NumberOfReconstructions'].sum()
print('We have a total of %s reconstructions on %s' % (TotalReconstructions, Root))

In [None]:
# Only the folders which actually contain reconstructions count as scans here
ScansWithReconstructions = len(Data[Data['NumberOfReconstructions'] > 0])
# The six 'buckets of fish' are subtracted from the number of unique fish names
print('This is about %s reconstructions per scan (%s scans, %s fishes)'
      % (round(Data['NumberOfReconstructions'].sum() / ScansWithReconstructions),
         ScansWithReconstructions,
         len(Data.Fish.unique()) - 6))

In [None]:
# Total number of reconstructions, over all the rows of the dataframe
ReconstructionsAndDatasets = (Data['NumberOfReconstructions'].sum(), len(Data))
print('In total, we have %s reconstructions for all the %s datasets' % ReconstructionsAndDatasets)

In [None]:
# Mean number of reconstructions over *all* datasets, rounded to a whole number
MeanReconstructions = int(round(Data['NumberOfReconstructions'].mean()))
print('On average, each of the %s datasets has about %s reconstructions.' % (len(Data), MeanReconstructions))

In [None]:
# List the folders which have no reconstructions
# Nothing is removed from the dataframe here, we only report the folders without
# reconstructions, so the message must not claim that they are dropped.
# Row selection based on https://stackoverflow.com/a/13851602
HasReconstructions = Data['NumberOfReconstructions'] > 0
for folder in Data.loc[~HasReconstructions, 'Folder']:
    print('%s contains no PNG files' % folder)
print('We have %s folders in total' % (len(Data)))
print("Of which %s folders contain reconstructions (Data['NumberOfReconstructions']>0)" % (HasReconstructions.sum()))

In [None]:
# Ask the file system for the size of every single reconstruction
Data['ReconstructionSize'] = [list(map(os.path.getsize, recs)) for recs in Data['Reconstructions']]
# Add up the sizes of all the reconstructions of each folder
Data['ReconstructionSizeSum'] = [sum(sizes) for sizes in Data['ReconstructionSize']]

In [None]:
# Scale the total by 1024 three times for the value we print as GB
print('In total, the reconstructions are %0.2f GB in size' % (Data['ReconstructionSizeSum'].sum() / 1024 ** 3))

In [None]:
# Scale the total by 1024 four times for the value we print as TB
print('In total, the reconstructions are %0.1f TB in size' % (Data['ReconstructionSizeSum'].sum() / 1024 ** 4))

To get (nearly) the same size, use
````bash
find . -iname '*rec0*.png' -print0 | du -ch --files0-from=-
````
in a Linux console.