# Get total dataset size

In [1]:
import platform
import glob
import os
import pandas

In [2]:
def get_git_hash():
    """
    Get the current git hash from the repository.
    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    """
    from subprocess import Popen, PIPE
    import os
    gitprocess = Popen(['git', '--git-dir', os.path.join(os.getcwd(), '.git'),
                        'rev-parse', '--short', '--verify', 'HEAD'],
                       stdout=PIPE)
    (output, _) = gitprocess.communicate()
    return output.strip().decode("utf-8")

In [3]:
the_current_git_hash = get_git_hash()
print('We are working with version %s of the analyis notebook'
      % the_current_git_hash)

We are working with version c0d391b of the analyis notebook


Now we load the 'Count' data from Eveline.

In [4]:
# Different locations if running either on Linux or Windows
if 'debian' in platform.dist():
    drive = os.path.join(os.sep, 'home', 'habi', 'nas_gruppe_schittny')
else:
    drive = os.path.join('\\\\nas.ana.unibe.ch\\', 'gruppe_schittny', 'Data')
# Load the data from this folder
RootPath = os.path.join(drive, 'doc', 'David')
print('We are loading all the data from %s' % RootPath)

We are loading all the data from /home/habi/nas_gruppe_schittny/doc/David


In [5]:
# Get a list of *all* excel files that Eveline exported from the STEPanizer
# Based on https://stackoverflow.com/a/14798263
StepanizerFiles = sorted(glob.glob(os.path.join(RootPath, 'Eveline', '**/*.xls'), recursive=True))

In [6]:
print('Eveline counted the alveoli in %s acini' % len(StepanizerFiles))

Eveline counted the alveoli in 291 acini


In [7]:
Animals = [os.path.basename(f).split('_R108C')[1].split('mrg-')[0][:3] for f in StepanizerFiles]  # all animals
Animals = sorted(list(set(Animals)))  # unique ones: https://stackoverflow.com/a/27305828/323100

In [8]:
print('Eveline assessed %s animals' % len(Animals))
for anml in Animals:
    print('-', anml)

Eveline assessed 13 animals
- 04A
- 04B
- 04C
- 10A
- 10B
- 10C
- 21B
- 21D
- 21E
- 60B
- 60C
- 60D
- 60E


Look for the reconstructions on `anatera4`, where we still have all the original files.

In [9]:
# Different locations if running either on Linux or Windows
if 'debian' in platform.dist():
    drive = '/run/user/1000/gvfs/smb-share:server=anatera4,share='
else:
    drive = '\\\\anatera4\\'
# Load the data from this folder
terastation = drive + os.path.join('share', 'SLS')
print('We are loading all the data from %s' % terastation)

We are loading all the data from /run/user/1000/gvfs/smb-share:server=anatera4,share=share/SLS


In [10]:
# Read the data from other notebook
VolumesFromDisk = pandas.read_pickle('VolumesFromDisk.pkl')

In [11]:
# Get the unique sample directories in one step (see https://stackoverflow.com/a/26032781/323100)
SampleDirectories = {os.path.dirname(i) for i in VolumesFromDisk.Location_Volume.unique()}

In [12]:
# Get the unique beamtime folders in one step
BeamtimeDirectories = {os.path.dirname(i) for i in SampleDirectories}

In [38]:
# Get *all* sample folders
SampleFolders=[]
# For each of the relevant beamtimes...
for i in BeamtimeDirectories:
    # ...get all folders and subfolders
    for root, directories, files in os.walk(i):
        # Search in each found directory...
        for directory in directories:
            # ...if we find a folder which name matches one of the Animals
            for animal in Animals:
                if str('C' + animal) in directory:
                    SampleFolders.append(os.path.join(root, directory))

In [44]:
DataDetails = pandas.DataFrame()
DataDetails['Location'] = SampleFolders
DataDetails['Sample'] = [os.path.basename(s) for s in DataDetails['Location']]

In [45]:
DataDetails['Reconstructions'] = [glob.glob(os.path.join(l, '*rec*bit*', '*.tif')) for l in DataDetails['Location']]

In [46]:
DataDetails['Sizes'] = [[os.stat(rec).st_size for rec in recs] for recs in DataDetails['Reconstructions']]

In [47]:
DataDetails['TotalSize'] = [sum(sizes) for sizes in DataDetails['Sizes']]

In [19]:
print('Totally, all reconstructions are', round(1e-9 * float(DataDetails.TotalSize.sum()), 2), 'GB in size')

Totally, all reconstructions are 407.52 GB in size


In [49]:
DataDetails

Unnamed: 0,Location,Sample,Reconstructions,Sizes,TotalSize
0,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Ab-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
1,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C10Ab-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
2,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C21Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
3,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04At-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056
4,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8596822, 8596822, 8596822, 8596822, 8596822, ...",8803145728
5,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Bt-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8596822, 8596822, 8596822, 8596822, 8596822, ...",8803145728
6,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Cb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
7,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C04Ct-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
8,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C10At-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8608554, 8608554, 8608554, 8608554, 8608554, ...",8815159296
9,"/run/user/1000/gvfs/smb-share:server=anatera4,...",R108C10Bb-mrg,[/run/user/1000/gvfs/smb-share:server=anatera4...,"[8620294, 8620294, 8620294, 8620294, 8620294, ...",8827181056


In [53]:
DataDetails.Sample.unique

<bound method Series.unique of 0            R108C04Ab-mrg
1            R108C10Ab-mrg
2            R108C21Bb-mrg
3            R108C04At-mrg
4            R108C04Bb-mrg
5            R108C04Bt-mrg
6            R108C04Cb-mrg
7            R108C04Ct-mrg
8            R108C10At-mrg
9            R108C10Bb-mrg
10           R108C10Bt-mrg
11           R108C21Bm-mrg
12           R108C21Bt-mrg
13         R108C60C_B1-mrg
14         R108C60C_B2-mrg
15         R108C60C_B3-mrg
16        R108C04Aa_B1_mrg
17        R108C04Aa_B2_mrg
18         R108C10C_B1_mrg
19         R108C10C_B2_mrg
20         R108C60B_B1_mrg
21         R108C60B_B2_mrg
22     R108C04ArolA_B2_10c
23     R108C04ArolA_B1_10c
24     R108C10C_B1_mrg_10c
25     R108C10C_B2_mrg_10c
26     R108C21D_B1_mrg_10c
27     R108C21D_B2_mrg_10c
28     R108C21D_B3_mrg_10c
29     R108C21E_B1_mrg_10c
30     R108C21E_B2_mrg_10c
31     R108C21E_B3_mrg_10c
32     R108C60B_B1_mrg_10c
33     R108C60B_B2_mrg_10c
34     R108C60B_B3_mrg_10c
35     R108C10ArolA_B1_1