# Get total dataset size
This [Jupyter](https://jupyter.org) notebook was used to determine the total size of the datasets that we analyzed in the [publication on the acinar complexity](https://www.authorea.com/274247/47HwqAxume3L2xkLOsg_SQ).

In [1]:
import platform
import glob
import os
import pandas

In [2]:
# Different locations if running either on Linux or Windows.
# `platform.dist()` was deprecated in Python 3.5 and removed in Python 3.8,
# so we ask for the operating system with `platform.system()` instead.
if platform.system() == 'Linux':
    drive = os.path.join(os.sep, 'home', 'habi', 'nas_gruppe_schittny')
else:
    # Everything that is not Linux falls back to the UNC path of the NAS
    drive = os.path.join('\\\\nas.ana.unibe.ch\\', 'gruppe_schittny', 'Data')
# Load the data from this folder
RootPath = os.path.join(drive, 'doc', 'David')
print('We are loading all the data from %s' % RootPath)

We are loading all the data from /home/habi/nas_gruppe_schittny/doc/David


In [3]:
# Collect *all* Excel files that Eveline exported from the STEPanizer.
# The '**' wildcard together with recursive=True also descends into subfolders,
# see https://stackoverflow.com/a/14798263
StepanizerFiles = glob.glob(os.path.join(RootPath, 'Eveline', '**/*.xls'), recursive=True)
StepanizerFiles.sort()

In [4]:
# The animal ID is the first three characters of whatever follows '_R108C'
# in the file name (cut off before 'mrg-'); a set keeps every ID only once.
Animals = {os.path.basename(f).split('_R108C')[1].split('mrg-')[0][:3] for f in StepanizerFiles}
# Sorted list of the unique animals: https://stackoverflow.com/a/27305828/323100
Animals = sorted(Animals)

In [5]:
# The message reports the number of exported STEPanizer files as the number of acini
print('Eveline counted the alveoli in %s acini' % (len(StepanizerFiles),))

Eveline counted the alveoli in 285 acini


In [6]:
# Report how many unique animals were found and list their IDs, one per line
print('Eveline assessed %s animals' % (len(Animals),))
for name in Animals:
    print('-', name)

Eveline assessed 13 animals
- 04A
- 04B
- 04C
- 10A
- 10B
- 10C
- 21B
- 21D
- 21E
- 60B
- 60C
- 60D
- 60E


Look for the reconstructions on `anatera4`, where we still have all the original files.

In [7]:
# Different locations if running either on Linux or Windows.
# `platform.dist()` was deprecated in Python 3.5 and removed in Python 3.8,
# so we ask for the operating system with `platform.system()` instead.
if platform.system() == 'Linux':
    # GVFS mount point of the SMB share; the share name is appended below
    drive = '/run/user/1000/gvfs/smb-share:server=anatera4,share='
else:
    # Everything that is not Linux falls back to the UNC path of the server
    drive = '\\\\anatera4\\'
# Load the data from this folder.
# Plain string concatenation is intentional: on Linux `drive` ends in 'share='
# and must be directly followed by the share name, without a path separator.
terastation = drive + os.path.join('share', 'SLS')
print('We are loading all the data from %s' % terastation)

We are loading all the data from /run/user/1000/gvfs/smb-share:server=anatera4,share=share/SLS


In [8]:
# Read the data from other notebook, where we have the information on all the assessed original DICOM files.
# Look for the pickle explicitly, so that only a genuinely missing file triggers the hint below.
# (Catching `ValueError` around the whole call also hid errors raised while *reading* the pickle.)
PickleFiles = glob.glob('VolumesFromDisk*.pkl')
if PickleFiles:
    # Several dumps may match the pattern; use the one with the largest `os.path.getctime` value.
    # NOTE: unpickling can execute arbitrary code, only load files written by our own "Analysis.ipynb".
    VolumesFromDisk = pandas.read_pickle(max(PickleFiles, key=os.path.getctime))
else:
    print('I was not able to find "VolumesFromDisk.pkl". '
          'Please run "Analysis.ipynb" where this file is generated...')

In [9]:
# The parent folder of every volume location is taken as its sample directory;
# collecting them in a set keeps each one only once
# (see https://stackoverflow.com/a/26032781/323100)
SampleDirectories = set(map(os.path.dirname, VolumesFromDisk.Location_Volume.unique()))

In [10]:
# One level above the sample directories are the beamtime folders, again without duplicates
BeamtimeDirectories = set(map(os.path.dirname, SampleDirectories))

In [11]:
# Get *all* sample folders
# A folder belongs to an animal if its name contains the animal ID prefixed with 'C'
AnimalTags = ['C' + animal for animal in Animals]
SampleFolders = []
# For each of the relevant beamtimes...
for beamtime in BeamtimeDirectories:
    # ...get all folders and subfolders
    for root, directories, _ in os.walk(beamtime):
        # Search in each found directory...
        for directory in directories:
            # ...and keep it if its name matches any of the Animals.
            # `any()` adds the folder only once, even if several tags match,
            # so that its files are not counted more than once later on.
            if any(tag in directory for tag in AnimalTags):
                SampleFolders.append(os.path.join(root, directory))

In [12]:
# Collect the information in a dataframe: one row per sample folder
DataDetails = pandas.DataFrame({'Location': SampleFolders})
# The sample name is the last component of the folder path
DataDetails['Sample'] = list(map(os.path.basename, DataDetails['Location']))

In [13]:
# For every sample folder, list the TIF files inside subfolders matching '*rec*bit*'
DataDetails['Reconstructions'] = [
    glob.glob(os.path.join(folder, '*rec*bit*', '*.tif'))
    for folder in DataDetails['Location']
]

In [14]:
# Get the size (in bytes) of each of the original TIF files...
DataDetails['Sizes'] = [list(map(os.path.getsize, recs)) for recs in DataDetails['Reconstructions']]
# ...and add them up for each sample
DataDetails['TotalSize'] = list(map(sum, DataDetails['Sizes']))

In [15]:
# Sum over all samples and convert from bytes to GB (factor 1e-9)
TotalBytes = float(DataDetails.TotalSize.sum())
print('In total, all reconstructions are %0.2f GB in size' % (1e-9 * TotalBytes))

In total, all reconstructions are 335.96 GB in size


In [16]:
# Show the first five rows of the table as a quick sanity check
DataDetails.head(n=5)

Unnamed: 0,Location,Sample,Reconstructions,Sizes,TotalSize
0,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Ab-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
1,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C10Ab-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
2,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C21Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
3,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04At-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
4,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8596822, 8596822, 8596822, 8596822, 8596822, ...",8803145728
