# Get dataset size

In [222]:
import platform
import glob
import os
import pandas

In [223]:
def get_git_hash():
    """
    Return the abbreviated hash of the commit that `HEAD` points to.

    `git rev-parse --short --verify HEAD` is run against the `.git` folder
    located in the current working directory, and whatever the command wrote
    to its standard output is returned as stripped, UTF-8 decoded text.

    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    """
    import subprocess
    git_directory = os.path.join(os.getcwd(), '.git')
    # First tell git where the repository lives, then ask for the short hash
    command = ['git', '--git-dir', git_directory]
    command += ['rev-parse', '--short', '--verify', 'HEAD']
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() returns (stdout, stderr); only stdout is piped here
    raw_hash = process.communicate()[0]
    return raw_hash.strip().decode("utf-8")

In [224]:
# Store the short hash of the current commit and print it, so that the
# results below can be traced back to a version of the repository
the_current_git_hash = get_git_hash()
print('We are working with version %s of the analyis notebook'
      % the_current_git_hash)

We are working with version 3d9f4fe of the analyis notebook


Now we load the 'Count' data from Eveline.

In [226]:
# Different locations if running either on Linux or Windows.
# `platform.dist()` was removed in Python 3.8 (calling it raises an
# AttributeError there), so the operating system is detected with
# `platform.system()` instead. Any Linux now takes the first branch,
# everything else uses the UNC path of the NAS.
if platform.system() == 'Linux':
    drive = os.path.join(os.sep, 'home', 'habi', 'nas_gruppe_schittny')
else:
    drive = os.path.join('\\\\nas.ana.unibe.ch\\', 'gruppe_schittny', 'Data')
# Load the data from this folder
RootPath = os.path.join(drive, 'doc', 'David')
print('We are loading all the data from %s' % RootPath)

We are loading all the data from /home/habi/nas_gruppe_schittny/doc/David


In [227]:
# Collect *every* Excel file that Eveline exported from the STEPanizer by
# searching her folder and all of its subfolders
# (recursive glob, see https://stackoverflow.com/a/14798263)
StepanizerFiles = glob.glob(os.path.join(RootPath, 'Eveline', '**/*.xls'), recursive=True)
# glob makes no promise about the order of its results, so sort them in place
StepanizerFiles.sort()

In [228]:
# Every Excel file found above is reported as one counted acinus
print('Eveline counted the alveoli in %s acini' % len(StepanizerFiles))

Eveline counted the alveoli in 291 acini


In [246]:
# The animal is encoded in the file name: take the text between '_R108C' and
# 'mrg-' and keep (at most) its first three characters. The set comprehension
# keeps every animal only once (https://stackoverflow.com/a/27305828/323100),
# `sorted` turns the result into an ordered list.
Animals = sorted({
    os.path.basename(f).split('_R108C')[1].partition('mrg-')[0][:3]
    for f in StepanizerFiles
})

In [247]:
# Print a headline followed by one '- <animal>' line per assessed animal
print('\n'.join(['Eveline assessed these animals'] + ['- ' + name for name in Animals]))

Eveline assessed these animals
- 04A
- 04B
- 04C
- 10A
- 10B
- 10C
- 21B
- 21D
- 21E
- 60B
- 60C
- 60D
- 60E


Look for the reconstructions on `anatera4`, where we still have all the original files.

In [248]:
# Different locations if running either on Linux or Windows.
# `platform.dist()` was removed in Python 3.8 (calling it raises an
# AttributeError there), so the operating system is detected with
# `platform.system()` instead. Any Linux now takes the first branch,
# everything else uses the UNC path of the server.
if platform.system() == 'Linux':
    # NOTE(review): the '1000' in this gvfs mount point looks like the id of
    # the logged-in user - confirm before running under another account.
    drive = '/run/user/1000/gvfs/smb-share:server=anatera4,share='
else:
    drive = '\\\\anatera4\\'
# Load the data from this folder.
# `drive` is a prefix and not a directory (on Linux it ends in 'share='), which
# is why it is concatenated with the rest of the path instead of joined.
terastation = drive + os.path.join('share', 'SLS')
print('We are loading all the data from %s' % terastation)

We are loading all the data from /run/user/1000/gvfs/smb-share:server=anatera4,share=share/SLS


In [249]:
# Read the data from other notebook; the file name is relative, so the pickle
# is looked up in the current working directory.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load a 'VolumesFromDisk.pkl' that comes from a trusted source.
VolumesFromDisk = pandas.read_pickle('VolumesFromDisk.pkl')

In [250]:
# Every distinct volume location is reduced to the folder containing it; the
# set drops the duplicates (see https://stackoverflow.com/a/26032781/323100)
SampleDirectories = set(map(os.path.dirname, VolumesFromDisk.Location_Volume.unique()))

In [251]:
# Go up one more level: the parent folders of the sample directories are the
# beamtime folders, again collected in a set to keep each of them only once
BeamtimeDirectories = {os.path.split(sample)[0] for sample in SampleDirectories}

In [121]:
# Get *all* sample folders: walk through every beamtime folder and keep each
# directory whose name contains 'C' followed by one of the assessed animals.
SampleFolders = []
# `BeamtimeDirectories` is a set, and the iteration order of a set of strings
# may change between kernel restarts. Sorting it makes the order of
# `SampleFolders` (and of everything derived from it) reproducible.
for beamtime in sorted(BeamtimeDirectories):
    for root, directories, files in os.walk(beamtime):
        for directory in sorted(directories):
            # `any` adds a folder only once, even if its name matches several
            # animals; otherwise its files would be counted multiple times.
            if any('C' + animal in directory for animal in Animals):
                SampleFolders.append(os.path.join(root, directory))

In [189]:
# Build the table from its two columns in one go: the full path of every
# sample folder and the name of that folder (the last path component)
DataDetails = pandas.DataFrame({
    'Location': SampleFolders,
    'Sample': [os.path.basename(folder) for folder in SampleFolders],
})

In [191]:
# For every sample folder, list the TIFF files found in its subfolders whose
# name matches '*rec*bit*'
DataDetails['Reconstructions'] = DataDetails['Location'].map(
    lambda folder: glob.glob(os.path.join(folder, '*rec*bit*', '*.tif')))

In [192]:
# Look up the size in bytes of every single reconstruction file, keeping one
# list of sizes per sample
DataDetails['Sizes'] = [list(map(os.path.getsize, recs))
                        for recs in DataDetails['Reconstructions']]

In [200]:
# Add up the file sizes of each sample to get its total size in bytes
DataDetails['TotalSize'] = list(map(sum, DataDetails['Sizes']))

In [252]:
# Display the first rows of the table to check the collected details
DataDetails.head()

Unnamed: 0,Location,Sample,Reconstructions,Sizes,TotalSize
0,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Ab-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
1,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04At-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
2,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8596822, 8596822, 8596822, 8596822, 8596822, ...",8803145728
3,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Bt-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8596822, 8596822, 8596822, 8596822, 8596822, ...",8803145728
4,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Cb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296


In [253]:
# Sum the sizes of all samples and convert bytes to (decimal) gigabytes,
# rounded to two digits
print('Totally, all reconstructions are',
      round(float(DataDetails['TotalSize'].sum()) * 1e-9, 2),
      'GB in size')

Totally, all reconstructions are 407.52 GB in size
