# Handle the 'data' of the fishes
Wrestle with the data, check the scan parameters and generate some helper files

In [1]:
import platform
import os
import glob
import pandas
import imageio
import numpy
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn
import dask
import dask_image.imread
from dask.distributed import Client, LocalCluster
import skimage
from tqdm import notebook

In [2]:
# Set dask temporary folder
# Do this before creating a client: https://stackoverflow.com/a/62804525/323100
import tempfile
if 'Linux' in platform.system():
    # Check if we mounted the FastSSD, otherwise go to the standard tmp directory
    if os.path.exists(os.path.join(os.sep, 'media', 'habi', 'Fast_SSD')):
        tmp = os.path.join(os.sep, 'media', 'habi', 'Fast_SSD', 'tmp')
    else:
        tmp = tempfile.gettempdir()
elif 'Darwin' in platform.system():
    # On macOS we always use the standard tmp directory
    tmp = tempfile.gettempdir()
else:
    # Neither Linux nor macOS: the paths below are drive-letter paths,
    # the drive is chosen based on the host name
    if 'anaklin' in platform.node():
        tmp = os.path.join('F:\\tmp')
    else:
        tmp = os.path.join('D:\\tmp')
# Tell dask where to put its temporary files and print the value back from the config as confirmation
dask.config.set({'temporary_directory': tmp})
print('Dask temporary files go to %s' % dask.config.get('temporary_directory'))

Dask temporary files go to /media/habi/Fast_SSD/tmp


In [3]:
# Create a dask client without any arguments
# NOTE(review): `Client` is already imported in the first cell, so this import is redundant
from dask.distributed import Client
client = Client()

In [4]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 32,Total memory: 125.80 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34829,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 32
Started: Just now,Total memory: 125.80 GiB

0,1
Comm: tcp://127.0.0.1:42863,Total threads: 4
Dashboard: http://127.0.0.1:40017/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:35755,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-qq9ahy4q,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-qq9ahy4q

0,1
Comm: tcp://127.0.0.1:37755,Total threads: 4
Dashboard: http://127.0.0.1:46199/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:43653,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-xs0ly3yp,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-xs0ly3yp

0,1
Comm: tcp://127.0.0.1:39281,Total threads: 4
Dashboard: http://127.0.0.1:41413/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:34941,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-ianiey7s,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-ianiey7s

0,1
Comm: tcp://127.0.0.1:33061,Total threads: 4
Dashboard: http://127.0.0.1:38789/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:44739,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-unqu2iyw,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-unqu2iyw

0,1
Comm: tcp://127.0.0.1:35935,Total threads: 4
Dashboard: http://127.0.0.1:44649/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:37115,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-_hl9t6hc,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-_hl9t6hc

0,1
Comm: tcp://127.0.0.1:39867,Total threads: 4
Dashboard: http://127.0.0.1:36265/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:35567,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-5m72te0r,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-5m72te0r

0,1
Comm: tcp://127.0.0.1:34949,Total threads: 4
Dashboard: http://127.0.0.1:39601/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:44031,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-ghjngzu8,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-ghjngzu8

0,1
Comm: tcp://127.0.0.1:35715,Total threads: 4
Dashboard: http://127.0.0.1:35215/status,Memory: 15.73 GiB
Nanny: tcp://127.0.0.1:36159,
Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-6ralqv52,Local directory: /media/habi/Fast_SSD/tmp/dask-worker-space/worker-6ralqv52


In [5]:
# Tell the user where to find the dashboard, using the port reported by the scheduler
print('You can see what DASK is doing at "http://localhost:%s/status"' % client.scheduler_info()['services']['dashboard'])

You can seee what DASK is doing at "http://localhost:8787/status"


In [6]:
# # Ignore warnings in the notebook
# import warnings
# warnings.filterwarnings("ignore")

In [7]:
# Set up figure defaults
plt.rc('image', cmap='gray', interpolation='nearest')  # Display all images in b&w and with 'nearest' interpolation
plt.rcParams['figure.figsize'] = (16, 9)  # Size up figures a bit
plt.rcParams['figure.dpi'] = 200

In [8]:
# Setup scale bar defaults
plt.rcParams['scalebar.location'] = 'lower right'
plt.rcParams['scalebar.frameon'] = False
plt.rcParams['scalebar.color'] = 'white'

In [9]:
# Display all plots identically
lines = 3
# And then do something like
# plt.subplot(lines, numpy.ceil(len(Data) / float(lines)), c + 1)

In [10]:
# Different locations if running either on Linux or Windows
# On Linux the flags below are checked in the order FastSSD, overthere, nanoct;
# the first one that is True decides where BasePath points to
FastSSD = True
overthere = False  # Load the data directly from the iee-research_storage drive
nanoct = True  # Load the data directly from the 2214
# to speed things up significantly
if 'Linux' in platform.system():
    if FastSSD:
        BasePath = os.path.join(os.sep, 'media', 'habi', 'Fast_SSD')
    elif overthere:
        BasePath = os.path.join(os.sep, 'home', 'habi', 'research-storage-iee')
    elif nanoct:
        BasePath = os.path.join(os.path.sep, 'home', 'habi', '2214')
    else:
        BasePath = os.path.join(os.sep, 'home', 'habi', '1272')
elif 'Darwin' in platform.system():
    # On macOS there is no FastSSD, the data lives in a fixed local folder
    FastSSD = False
    BasePath = os.path.join('/Users/habi/Dev/EAWAG/Data')
elif 'Windows' in platform.system():
    if FastSSD:
        BasePath = os.path.join('F:\\')
    else:
        # Without FastSSD the location depends on the host name
        if 'ana' in platform.node():
            BasePath = os.path.join('\\\\resstore.unibe.ch', 'iee_aqua', 'microCTupload')
        else:
            BasePath = os.path.join('D:\\Results')
# NOTE(review): BasePath stays undefined if the platform is none of Linux, Darwin or Windows
# NOTE(review): `overthere` only influences BasePath on Linux with FastSSD=False,
# but it drops the 'EAWAG' subfolder from Root whenever it is True
if overthere:
    Root = BasePath
else:
    Root = os.path.join(BasePath, 'EAWAG')
print('We are loading all the data from %s' % Root)

We are loading all the data from /media/habi/Fast_SSD/EAWAG


In [11]:
def get_pixelsize(logfile):
    """
    Read the pixel size from a scan log file.

    Every line containing 'Image Pixel' (but not 'Scaled') is parsed,
    the value after the '=' of the last such line is returned.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    float or None
        The pixel size, or None if no matching line is present.
    """
    sizes = []
    with open(logfile, 'r') as handle:
        for entry in handle:
            # Skip everything that is not an unscaled pixel size line
            if 'Scaled' in entry or 'Image Pixel' not in entry:
                continue
            sizes.append(float(entry.split('=')[1]))
    return sizes[-1] if sizes else None

In [12]:
def get_projectionsize(logfile):
    """
    How big did we set the camera?

    Reads the 'Number Of Rows' and 'Number Of Columns' lines of the log file
    and multiplies the two values.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        Rows times columns, or None if either of the two lines is missing
        (consistent with `get_pixelsize`, instead of an UnboundLocalError).
    """
    x = None
    y = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Number Of Rows' in line:
                y = int(line.split('=')[1])
            if 'Number Of Columns' in line:
                x = int(line.split('=')[1])
    if x is None or y is None:
        return None
    return x * y

In [13]:
def get_voltage(logfile):
    """
    Get the x-ray voltage from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        The value after the '=' of the last line containing '(kV)',
        or None if the log file has no such line.
    """
    voltage = None
    with open(logfile, 'r') as f:
        for line in f:
            if '(kV)' in line:
                voltage = int(line.split('=')[1])
    return voltage

In [14]:
def get_current(logfile):
    """
    Get the x-ray current from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        The value after the '=' of the last line containing '(uA)',
        or None if the log file has no such line.
    """
    current = None
    with open(logfile, 'r') as f:
        for line in f:
            if '(uA)' in line:
                current = int(line.split('=')[1])
    return current

In [15]:
def get_filter(logfile):
    """
    Get the filter we used while scanning from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    str or None
        The stripped text after the '=' of the last line containing 'Filter=',
        or None if the log file has no such line.
    """
    whichfilter = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Filter=' in line:
                whichfilter = line.split('=')[1].strip()
    return whichfilter

In [16]:
def get_exposuretime(logfile):
    """
    Get the exposure time from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        The value after the '=' of the last line containing 'Exposure',
        or None if the log file has no such line.
    """
    exposuretime = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Exposure' in line:
                exposuretime = int(line.split('=')[1])
    return exposuretime

In [17]:
def get_ringartefact(logfile):
    """
    Get the ring artefact correction from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        The value after the '=' of the last line containing 'Ring Artifact',
        or None if the log file has no such line.
    """
    ringartefactcorrection = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Ring Artifact' in line:
                ringartefactcorrection = int(line.split('=')[1])
    return ringartefactcorrection

In [18]:
def get_reconstruction_grayvalue(logfile):
    """
    How did we map the brightness of the reconstructions?

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    float or None
        The value after the '=' of the last line containing 'Maximum for',
        or None if the log file has no such line.
    """
    # The docstring has to be the first statement of the function,
    # otherwise it does not end up in `__doc__`
    grayvalue = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Maximum for' in line:
                grayvalue = float(line.split('=')[1])
    return grayvalue

In [19]:
def get_beamhardening(logfile):
    """
    Get the beam hardening correction from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int or None
        The value after the '=' of the last line containing 'Hardening',
        or None if the log file has no such line.
    """
    beamhardeningcorrection = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Hardening' in line:
                beamhardeningcorrection = int(line.split('=')[1])
    return beamhardeningcorrection

In [20]:
def get_rotationstep(logfile):
    """
    Get the rotation step from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    float or None
        The value after the '=' of the last line containing 'Rotation Step',
        or None if the log file has no such line.
    """
    rotstep = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Rotation Step' in line:
                rotstep = float(line.split('=')[1])
    return rotstep

In [21]:
def get_frameaveraging(logfile):
    """
    Get the frame averaging from the scan log file.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    str or None
        The text after the '=' of the last line containing 'Averaging',
        or None if the log file has no such line.
        Surrounding whitespace (including the trailing newline of the line)
        is stripped, like `get_filter` and `get_machine` do.
    """
    avg = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Averaging' in line:
                # Strip the value, otherwise the line break ends up in the dataframe
                avg = line.split('=')[1].strip()
    return avg

In [22]:
def get_machine(logfile):
    """
    Get the machine we used to scan.

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    str or None
        The stripped text after the '=' of the last line containing 'Scanner',
        or None if the log file has no such line.
    """
    machine = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Scanner' in line:
                machine = line.split('=')[1].strip()
    return machine

In [23]:
def get_scantime(logfile):
    """
    How long did we scan?

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    pandas.Timedelta or None
        The text after the '=' of the last line containing 'Scan duration',
        parsed with `pandas.to_timedelta`, or None if the log file has no
        such line.
    """
    # Named `duration` so that we do not shadow the name of the `time` module
    duration = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Scan duration' in line:
                duration = line.split('=')[1].strip()
    if duration is None:
        return None
    return pandas.to_timedelta(duration)

In [24]:
def get_stacks(logfile):
    """
    How many stacks/connected scans did we make?

    Parameters
    ----------
    logfile : str
        Path to the log file.

    Returns
    -------
    int
        The value after the '=' of the last line containing 'conn',
        or 1 if the log file has no such line.
    """
    with open(logfile, 'r') as handle:
        counts = [int(entry.split('=')[1]) for entry in handle if 'conn' in entry]
    # Without any matching line we have a single stack
    return counts[-1] if counts else 1

In [25]:
def get_scandate(logfile, verbose=False):
    """
    When did we scan the fish?

    Parameters
    ----------
    logfile : str
        Path to the log file.
    verbose : bool, optional
        Print the found line, the extracted date string and the parsed date.

    Returns
    -------
    str or None
        The date of the last line containing 'Study Date and Time' in ISO
        format, or None if the log file has no such line.
    """
    date = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Study Date and Time' in line:
                if verbose:
                    print('Found "date" line: %s' % line.strip())
                # Collapse double spaces so that the string matches the format below
                datestring = line.split('=')[1].strip().replace('  ', ' ')
                if verbose:
                    print('The date string is: %s' % datestring)
                date = pandas.to_datetime(datestring, format='%d %b %Y %Hh:%Mm:%Ss')
                if verbose:
                    print('Parsed to: %s' % date)
    if date is None:
        return None
    return date.isoformat()

In [26]:
def get_git_hash():
    '''
    Return the short git hash of HEAD of the repository in the current
    working directory.
    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    '''
    import os
    import subprocess
    # Point git explicitly to the '.git' folder of the current working directory
    repository = os.path.join(os.getcwd(), '.git')
    command = ['git', '--git-dir', repository,
               'rev-parse', '--short', '--verify', 'HEAD']
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # Only stdout is captured, it contains the hash
    stdout, _ = process.communicate()
    return stdout.strip().decode("utf-8")

In [27]:
# # Make directory for output
# OutPutDir = os.path.join(os.getcwd(), 'Output', get_git_hash())
# print('We are saving all the output to %s' % OutPutDir)
# os.makedirs(OutPutDir, exist_ok=True)

In [28]:
# Make us a dataframe for saving all that we need
Data = pandas.DataFrame()

In [29]:
# Get *all* log files, unsorted but fast
# os.walk() visits every folder below Root, we keep each file whose name ends in '.log'
Data['LogFile'] = [os.path.join(root, name)
                   for root, dirs, files in os.walk(Root)
                   for name in files
                   if name.endswith((".log"))]

In [30]:
# # Let's try to optimize the timing, based on https://stackoverflow.com/a/27565420/323100
# n, t = 0, time.time()
# LogFiles = [os.path.join(root, name)
#             for root, dirs, files in os.walk(Root)
#             for name in files
#             if name.endswith((".log"))]
# t = time.time() - t
# print("os.walk: %.4fs, %d files found" % (t, len(LogFiles)))

# n, t = 0, time.time()
# globfiles = [f for f in glob.glob(os.path.join(Root, '**', '*.log'),
#                                   recursive=True)]
# t = time.time() - t
# print("glob.glob, unsorted: %.4fs, %d files found" % (t, len(globfiles)))

# n, t = 0, time.time()
# globfiles = [f for f in sorted(glob.glob(os.path.join(Root, '**', '*.log'),
#                                          recursive=True),
#                                key=os.path.getmtime)]
# t = time.time() - t
# print("glob.glob, sorted: %.4fs, %d files found" % (t, len(globfiles)))

In [31]:
# On the FastSSD, sorted glob is about half as fast as os.walk, and unsorted glob is still substantially slower than walk!
# glob.glob: 0.7773s, 1206 files found
# os.walk: 0.3810s, 1206 files found

In [32]:
# If we found no log files below Root, fall back to the log files shipped in the `logfiles` subfolder
# and remember this in `running_on_binder`, which the cells below use to skip steps needing the full data
if not len(Data):
    # Our dataframe is empty.
    # We might be running on Binder, e.g. load the logfiles from the subfolder in this repository
    print(10 * ' -', 'CAVEAT', 10 * ' -')
    print('You are most probably running the notebook on binder.')
    print('And thus do not have access to the log files on the research storage')
    print('We are using a "local" copy of the data in the `logfiles` subfolder')
    print('This gives correct, but possibly outdated results...')
    print(10 * ' -', 'CAVEAT', 10 * ' -')
    # Change root folder
    Root = 'logfiles'
    # Load log files again, this time sorted by their modification time
    Data['LogFile'] = [f for f in sorted(glob.glob(os.path.join(Root, '**', '*.log'),
                                                   recursive=True),
                                         key=os.path.getmtime)]
    running_on_binder = True
else:
    running_on_binder = False

In [33]:
# Get all folders
Data['Folder'] = [os.path.dirname(f) for f in Data['LogFile']]

In [34]:
if not running_on_binder:
    # Check for samples which are not yet reconstructed
    for c, row in Data.iterrows():
        # Iterate over every 'proj' folder
        if 'proj' in row.Folder:
            # Skip folders with 'TScopy' or 'PR' in their path
            if 'TScopy' not in row.Folder and 'PR' not in row.Folder:
                # Replace every 'proj' in the folder path with 'rec' and look for exactly that path
                # (the pattern has no wildcard); if it does not exist, then tell us
                if not glob.glob(row.Folder.replace('proj', 'rec')):
                    # print(glob.glob(row.Folder.replace('proj', 'rec')))
                    # Print the log file path relative to Root
                    print('- %s is missing matching reconstructions' % row.LogFile[len(Root) + 1:])

- 103761/proj_oj/103761.log is missing matching reconstructions
- 104671_156645/proj/104671_156645.log is missing matching reconstructions
- 104671_156645/proj/104671_156645~00.log is missing matching reconstructions
- 104671_156645/proj/104671_156645~01.log is missing matching reconstructions
- 104671_156645/proj/104671_156645~02.log is missing matching reconstructions
- 105005_104015/proj/105005_104015.log is missing matching reconstructions
- 105005_104015/proj/105005_104015~00.log is missing matching reconstructions
- 105005_104015/proj/105005_104015~01.log is missing matching reconstructions
- 105005_104015/proj/105005_104015~02.log is missing matching reconstructions
- 105005_104015/proj/105005_104015~03.log is missing matching reconstructions
- 105005_104015/proj/105005_104015~04.log is missing matching reconstructions


In [35]:
# Get rid of all non-rec logfiles
# We keep a log file only if its folder contains 'rec' and it is not a 'rectmp.log'.
# Select the rows with a boolean mask instead of dropping rows from the
# dataframe while we are iterating over it.
keep = numpy.array([('rec' in folder) and ('rectmp.log' not in logfile)
                    for folder, logfile in zip(Data['Folder'], Data['LogFile'])],
                   dtype=bool)
# Keep the selected rows and reset dataframe index
Data = Data.loc[keep].reset_index(drop=True)

In [None]:
# Generate us some meaningful columns, based on the path of the log file relative to Root
# 'Fish' is the first folder below Root
Data['Fish'] = [l[len(Root) + 1:].split(os.sep)[0] for l in Data['LogFile']]
# 'Scan' are all the folders between the fish folder and the log file, joined with a '.'
Data['Scan'] = ['.'.join(l[len(Root) + 1:].split(os.sep)[1:-1]) for l in Data['LogFile']]

In [None]:
# Get parameters to doublecheck from logfiles
# Every column is filled by calling one of the `get_*` functions above on each log file
Data['Voxelsize'] = [get_pixelsize(log) for log in Data['LogFile']]
Data['Voltage'] = [get_voltage(log) for log in Data['LogFile']]
Data['Current'] = [get_current(log) for log in Data['LogFile']]
Data['Filter'] = [get_filter(log) for log in Data['LogFile']]
Data['Exposuretime'] = [get_exposuretime(log) for log in Data['LogFile']]
Data['Scanner'] = [get_machine(log) for log in Data['LogFile']]
Data['Averaging'] = [get_frameaveraging(log) for log in Data['LogFile']]
Data['ProjectionSize'] = [get_projectionsize(log) for log in Data['LogFile']]
Data['RotationStep'] = [get_rotationstep(log) for log in Data['LogFile']]
# Square root of the number of pixels of the projections, rounded to the nearest hundred
Data['CameraWindow'] = [round((ps ** 0.5) / 100) * 100 for ps in Data['ProjectionSize']]
Data['Grayvalue'] = [get_reconstruction_grayvalue(log) for log in Data['LogFile']]
Data['RingartefactCorrection'] = [get_ringartefact(log) for log in Data['LogFile']]
Data['BeamHardeningCorrection'] = [get_beamhardening(log) for log in Data['LogFile']]
Data['Scan date'] = [get_scandate(log) for log in Data['LogFile']]
Data['Scan time'] = [get_scantime(log) for log in Data['LogFile']]
Data['Stacks'] = [get_stacks(log) for log in Data['LogFile']]

In [38]:
# The iee research storage folder also has some scans which were done by Kassandra on a SkyScan1273
# Exclude those, since they are not 'in' this study
for c, row in Data.iterrows():
    if 'SkyScan1273' in row.Scanner:
        Data.drop([c], inplace=True)
# Reset dataframe index
Data = Data.reset_index(drop=True)

In [39]:
# Sort dataframe on fishes and scans
Data.sort_values(by=['Fish', 'Scan'], inplace=True)
# Reset dataframe index
Data = Data.reset_index(drop=True)

In [40]:
# How many fishes did we scan?
# We scanned six 'BucketOfFish' and one set of only 'Teeth', so subtract those :)
# NOTE(review): the hard-coded 7 assumes that all seven of these names are present in `Data.Fish`
print('We have %s unique names in our corpus of scans' % (len(Data.Fish.unique()) - 7))
print('We performed %s scans in total' % len(Data.Scan))

We have 130 unique names in our corpus of scans
We performed 368 scans in total


In [41]:
# Get the file names of the reconstructions
# For every folder, walk through it (including subfolders) and collect all
# files with 'rec0' in their name which end in '.png'
Data['Reconstructions'] = [[os.path.join(root, name)
                            for root, dirs, files in os.walk(f)
                            for name in files
                            if 'rec0' in name and name.endswith((".png"))] for f in Data['Folder']]
# Count how many files we have
Data['Number of reconstructions'] = [len(r) for r in Data.Reconstructions]

In [42]:
if not running_on_binder:
    # Let's see if we're missing some data
    # Only look at the rows for which we found no reconstructions at all
    for c, row in Data[Data['Number of reconstructions'] == 0].iterrows():
        # `c` is the index of the row in the full dataframe, print it one-based
        print('%s/%s: Folder %s does not contain any reconstructions and '
              'will be removed in the next step' % (c + 1,
                                                    len(Data),
                                                    os.path.join(row.Fish, row.Scan)))

2/368: Folder 10151/rec does not contain any reconstructions and will be removed in the next step
3/368: Folder 10151/rec_oj does not contain any reconstructions and will be removed in the next step
4/368: Folder 10151/rec_pj does not contain any reconstructions and will be removed in the next step
6/368: Folder 103375/rec does not contain any reconstructions and will be removed in the next step
7/368: Folder 103375/rec_stuck does not contain any reconstructions and will be removed in the next step
9/368: Folder 103571/rec does not contain any reconstructions and will be removed in the next step
10/368: Folder 103634/rec does not contain any reconstructions and will be removed in the next step
12/368: Folder 103635/jaw_rec does not contain any reconstructions and will be removed in the next step
13/368: Folder 103635/pharynx_rec does not contain any reconstructions and will be removed in the next step
14/368: Folder 103637/rec does not contain any reconstructions and will be removed in

In [43]:
# 103761/rec_rereconstruct is a folder where we tried to salvage a scan where the sample holder touched the source
# MA31/moved_rec/ is a folder where the fish moved during the acquisition
# MA31/stuck_rec/ is a folder where we've lost air pressure in the building and the stage got stuck

In [44]:
# Report how many folders we have, before and after removing the ones without reconstructions
print('We have %s folders in total' % (len(Data)))
if not running_on_binder:
    # Drop samples which have not been reconstructed yet
    # Based on https://stackoverflow.com/a/13851602
    # for c, row in Data.iterrows():
    #     if not row['Number of reconstructions']:
    #         print('%s contains no PNG files, we might be currently reconstructing it' % row.Folder)
    # Keep only the rows with at least one reconstruction and renumber the index
    Data = Data[Data['Number of reconstructions'] > 0]
    Data.reset_index(drop=True, inplace=True)
    print('Of which %s folders do contain reconstructions' % (len(Data)))

We have 368 folders in total
Of which 111 folders do contain reconstructions


In [45]:
Data['Scan time total'] = [st * stk for st, stk in zip(Data['Scan time'], Data['Stacks'])]

In [46]:
# Show five smallest voxelsizes and scans
for c, vs in enumerate(sorted(Data.Voxelsize.unique())[:5]):
    print('-----vs: %s-----' % vs)
    print(Data[Data.Voxelsize == vs][['Fish', 'Scan', 'Voxelsize']])

-----vs: 4.399916-----
      Fish      Scan  Voxelsize
38  106985  2214_rec   4.399916
-----vs: 5.000018-----
    Fish Scan  Voxelsize
96  IG92  rec   5.000018
-----vs: 5.001124-----
      Fish Scan  Voxelsize
39  106985  rec   5.001124
-----vs: 5.499452-----
      Fish    Scan  Voxelsize
19  104621     rec   5.499452
97    IG92  rec_oj   5.499452
98    IG92  rec_pj   5.499452
-----vs: 5.999331-----
      Fish      Scan  Voxelsize
12  103908  head_rec   5.999331


In [47]:
# Show five largest voxelsizes and scans
for c, vs in enumerate(sorted(Data.Voxelsize.unique())[-5:]):
    print('-----vs: %s-----' % vs)
    print(Data[Data.Voxelsize == vs][['Fish', 'Scan', 'Voxelsize']])

-----vs: 35.998895-----
     Fish      Scan  Voxelsize
68  13115  head_rec  35.998895
-----vs: 39.999499-----
     Fish                 Scan  Voxelsize
30  10619  head_rec_4xbin_40um  39.999499
-----vs: 40.000954-----
     Fish           Scan  Voxelsize
62  11965  rec_head_40um  40.000954
-----vs: 49.998527-----
     Fish           Scan  Voxelsize
65  12319  head_50um_rec  49.998527
-----vs: 188.212-----
     Fish            Scan  Voxelsize
32  10628  full_188um_rec    188.212


In [48]:
Data.Filter.unique()

array(['No Filter', 'Al 0.5mm', 'Al 1mm', 'Al 0.25mm'], dtype=object)

In [49]:
sorted(Data.Voltage.unique())

[55, 60, 65, 70, 80]

In [50]:
sorted(Data.Current.unique())

[115, 125, 134, 135, 136, 137, 138, 139, 140, 142, 145, 166, 200]

In [51]:
sorted(Data.RingartefactCorrection.unique())

[0, 13, 14]

In [52]:
sorted(Data.BeamHardeningCorrection.unique())

[0]

In [53]:
# Generate a text file in each rec-folder, in which we can note what's going on with the fish
# Generate filename
# The file is named '<Fish>.<Scan>.md' and is placed in the parent folder of the folder with the log file
for c, row in Data.iterrows():
    Data.at[c, 'CommentFile'] = os.path.join(os.path.dirname(row.Folder),
                                             row.Fish + '.' + row.Scan + '.md')
# Create actual file on disk
for c, row in Data.iterrows():
    # Only do this if the file does not already exist
    # Existing files are left untouched
    if not os.path.exists(row.CommentFile):
        with open(row.CommentFile, 'w', encoding='utf-8') as f:
            # Write a title, one sentence with the scan details and an empty 'Comments' section
            f.write('# Fish %s, Scan %s\n\n' % (row.Fish, row.Scan))
            f.write('This fish was scanned on %s on the %s, with a voxel size of %s μm.\n\n'
                    % (row['Scan date'], row.Scanner, numpy.round(row.Voxelsize, 2)))
            f.write('## Comments')

In [54]:
# # https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
# columns = list(Data)
# columns.remove('Folder')
# columns.remove('Fish')
# columns.remove('LogFile')
# columns.remove('Reconstructions')
# columns.remove('Number of reconstructions')
# columns.remove('Grayvalue')
# columns.remove('Scan time')
# columns.remove('Scan time total')
# columns.remove('Scan date')
# print(columns)
# for col in columns:
#     print(col)
#     print(Data[col].unique())
#     print(80 * '-')

In [55]:
# Data[['Fish', 'Scan',
#       'Voxelsize', 'Scanner',
#       'Scan date', 'CameraWindow', 'RotationStep', 'Averaging',
#       'Scan time', 'Stacks', 'Scan time total']]

In [56]:
# Get an overview over the total scan time
# Nice output based on https://stackoverflow.com/a/8907407/323100
# Split the summed up scan time into whole hours and remaining whole minutes
total_seconds = int(Data['Scan time total'].sum().total_seconds())
hours, remainder = divmod(total_seconds, 60 * 60)
minutes, seconds = divmod(remainder, 60)
print('In total, we scanned for %s hours and %s minutes' % (hours, minutes))
# Do the same for each machine separately
for machine in Data['Scanner'].unique():
    # Select the scans of this machine once, we need them for the time and for the count
    scans_on_machine = Data[Data['Scanner'] == machine]
    total_seconds = int(scans_on_machine['Scan time total'].sum().total_seconds())
    hours, remainder = divmod(total_seconds, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    print('\t - Of these, we scanned %s hours and %s minutes on the %s,'
          ' for %s scans' % (hours,
                             minutes,
                             machine,
                             len(scans_on_machine)))

In total, we scanned for 239 hours and 56 minutes)
	 - Of these, we scanned 215 hours and 27 minutes on the SkyScan2214, for 108 scans
	 - Of these, we scanned 24 hours and 29 minutes on the SkyScan1272, for 3 scans


In [57]:
print('We scanned %0.f fishes' % (len(Data.Fish.unique()) - 7))

We scanned 80 fishes


In [58]:
print('We did a total of %s scans' % len(Data))

We did a total of 111 scans


In [59]:
print('We perfomed %s scans with "head" in their folder name' % len(Data[Data['Scan'].str.contains('head')]))

We perfomed 96 scans with "head" in their folder name


In [60]:
Data[['Fish', 'Scan', 'LogFile',
      'Voxelsize', 'Scanner',
      'Scan date', 'CameraWindow',
      'RotationStep', 'Averaging',
      'Scan time', 'Stacks']].to_excel('Details.xlsx')

In [61]:
if not running_on_binder:
    Data[['Fish', 'Scan',
          'Voxelsize', 'Scanner',
          'Scan date', 'CameraWindow',
          'RotationStep', 'Averaging',
          'Scan time', 'Stacks']].to_excel(os.path.join(Root, 'Details.xlsx'))

In [62]:
# Save 'data' file for manuscript: github.com/habi/eawag-manuscript
# Since the manuscript is in a subfolder, we can simply write the output there
if not running_on_binder:
    Data[['Fish', 'Scan', 'Scanner', 'Scan date',
          'Voxelsize', 'Voltage', 'Current', 'Filter', 'Exposuretime', 'Averaging',
          'RotationStep', 'ProjectionSize', 'CameraWindow', 'Scan time', 'Stacks', 'Scan time total',
          'RingartefactCorrection', 'BeamHardeningCorrection', 'Grayvalue',
          ]].to_csv(os.path.join('manuscript', 'content', 'data', 'ScanningDetails.csv'))

In [65]:
if not running_on_binder:
    # Read Mikkis datafile
    # If several files match, we use the first one in alphabetical order
    # NOTE(review): with date-prefixed file names this would be the oldest file - confirm that this is intended
    MikkisFile = sorted(glob.glob(os.path.join(Root, 'X_ArchiveFiles', '*CTscanFishList.xlsx')))[0]
    # Read excel file
    # NOTE(review): no `index_col` is passed, so the first column is *not* used as index
    print('Reading in %s' % MikkisFile)
    DataMikki = pandas.read_excel(MikkisFile)

Reading in /media/habi/Fast_SSD/EAWAG/X_ArchiveFiles/2022.12.13_CTscanFishList.xlsx


In [70]:
if not running_on_binder:
    DataMikki.head()

In [71]:
DataMikki

Unnamed: 0,Fishec,FieldID,OtherID,ReplacementID,Fishec_ScannedSpecimens,Length(cm),TemporaryJar,Genus,Species,Ecology,...,UpperOralJaw,LowerOralJaw,PharyngealJawScan,UpperPharyngealJaw,LowerPharyngealJaw,ScanComments,QualityChecked,Unnamed: 21,SpecimenReturned,Comments
0,130786,,,need replacement?,3221 - head/jaws - Houston,7.25,,Prognathochromis,perrieri,piscivore,...,load and enter info,,yes,load and enter info,,big underbite,,,,we need to scan a nice large aquarium stock fi...
1,Ppell,,,Prognathochromis. cf. pellegrini (WGS spreadsh...,3144 - head/jaws - Houston,13.25,,Pyxichromis,orthostoma,piscivore,...,load and enter info,,yes,load and enter info,,"Pyxichromis orthostoma (UPDATED SPECIES NAME, ...",,,,
2,22406,Ole: male,,,22406,,Mark1,Macropleurodus super-lineage (Platytaeniodus),degeni,oral sheller,...,load and enter info,,yes,load and enter info,,wide gape,,,,
3,130785,,,,12194 - head/jaws - Houston,10,,Lipochromis,matumbi hunter,paedophage,...,load and enter info,,yes,load and enter info,,,,,,
4,21322,,,,testfish,,testfish,Neochromis,omnicaeruleus,epilithic algae scraper,...,,,yes in 21um,,,test fish,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,103718,,,,,,,Enterochromis I,paropius,,...,,,,,,,,,,
144,103641,,,,,,,"""Astatotilapia""",nubila swamp red,,...,,,,,,,,,,
145,103637,,,,,,,"""Astatotilapia""",nubila swamp blue,,...,,,,,,,,,,
146,103634,,,,,,,"""Astatotilapia""",nubila swamp blue,,...,,,,,,,,,,


In [72]:
# Find the fish we look at and display all the info we know about it
# Set a substring you're looking for to the variable below
# In which jar can we find it?
fish = '104061'

In [84]:
if not running_on_binder:
    # In which jar should it be/go?
    # We look for `fish` as case-insensitive substring in any of the four ID columns
    needle = str(fish).lower()
    foundfishes = []  # IDs of the last matching row
    anymatch = False  # Did we find at least one matching row?
    for d, row in DataMikki.iterrows():
        candidates = (row.Fishec, row.FieldID, row.OtherID, row.ReplacementID)
        if any(needle in str(candidate).lower() for candidate in candidates):
            anymatch = True
            # remove nan from the list of hits
            foundfishes = [str(x).lower() for x in candidates if not pandas.isnull(x)]
            # Join the IDs with a '/' *between* them, so we get no trailing separator
            print('*%s*: The fish %s should now go in jar "length=%s cm" (%s)'
                  % (fish,
                     '/'.join(found.upper() for found in foundfishes),
                     row['Length(cm)'],
                     row['TemporaryJar']))
    if not anymatch:
        print('*%s*: Nothing found in %s' % (fish, MikkisFile))

*AN33*: The fish 78773/AN33/ should now go in jar "length=11 cm" (Mark3))


In [74]:
if not running_on_binder:
    # Do we have something from this fish on disk?
    # Look for entries directly below Root which contain `fish` in their name
    ondisk = glob.glob(os.path.join(Root, '*%s*' % fish))
    if len(ondisk):
        for found in ondisk:
            print('*%s*: Found on disk in %s' % (fish, found))
            foundondisk = 1
    else:
        print('*%s*: Nothing found in %s' % (fish, Root))
        foundondisk = 0
    # `foundondisk` and `ondisk` are used again in the next cell

*104061*: Found on disk in /media/habi/Fast_SSD/EAWAG/104061


In [75]:
if not running_on_binder:
    # Did we scan it already?
    # `found` is set as soon as a row of the dataframe has `fish` as substring of its fish name
    found = 0
    for c, row in Data.iterrows():
        if fish in row.Fish:
            print('*%s*: Sample %s/%s was scanned on %s' % (fish, row['Fish'], row['Scan'], row['Scan date']))
            found = 1
    if not found:
        # Nothing in the dataframe; `foundondisk` and `ondisk` come from the previous cell
        if foundondisk:
            print('*%s*: We have a folder (%s) for this sample, but nothing in the dataframe, so it probably is all good' % (fish, ondisk[0]))
            print('Check the folder to be shure')
        else:
            print('*%s*: Nothing about this sample is found in our dataframe' % fish)

*104061*: Sample 104061/head_rec was scanned on 2022-02-18T10:03:36


In [76]:
# Can we find it in FullHeadList.txt?
def findinFullHeadList(sample):
    '''Look for the sample in the FullHeadList.* file found in `Root`.

    Every line of the file which contains `str(sample)` is printed (stripped).

    Parameters
    ----------
    sample : str or anything that can be converted with str()
        The ID to look for; a plain substring match is used.

    Returns
    -------
    None if at least one line matched, otherwise a string with a
    'Nothing found' message. The message is *returned*, not printed.

    Raises
    ------
    FileNotFoundError
        If there is no file matching 'FullHeadList.*' in `Root`.
    '''
    candidates = glob.glob(os.path.join(Root, 'FullHeadList.*'))
    if not candidates:
        # Fail with a clear message instead of an IndexError from candidates[0]
        raise FileNotFoundError('No FullHeadList.* file found in %s' % Root)
    fullheadlist = candidates[0]
    found = False
    # Explicit encoding, so that reading does not depend on the locale of the machine
    with open(fullheadlist, 'r', encoding='utf-8') as f:
        for line in f:
            if str(sample) in line:
                print(line.strip())
                found = True
    if not found:
        return '*%s*: Nothing found in %s' % (sample, fullheadlist)
    return None

In [77]:
if not running_on_binder:
    # findinFullHeadList() prints matching lines itself, but only *returns* the
    # 'Nothing found' message. Inside an if-block the notebook does not display
    # return values, so we print the message explicitly.
    notfound = findinFullHeadList(fish)
    if notfound is not None:
        print(notfound)

104061, Labrochromis sp. "stone" (pharyngeal mollusc crusher), head cropped


In [78]:
if not running_on_binder:
    # Do we need to rescan this fish?
    # Collect every markdown file below the folder(s) matching this fish
    commentfiles = glob.glob(os.path.join(Root, '*%s*' % fish, '**', '*.md'), recursive=True)
    print('We found these comment files in our dataframe')
    for c, row in Data.iterrows():
        if fish not in row.Fish:
            continue
        print('\t-', row.CommentFile)
        found = 1  # flag is set here, but not read in this cell
    print(80 * '-')
    # Dump each comment file found on disk and beep on every line mentioning 'rescan'
    for commentfile in commentfiles:
        print('-', commentfile)
        print(10 * '-')
        with open(commentfile, 'r', encoding='utf-8') as file:
            for line in file:
                print(line.strip())
                if 'rescan' in line:
                    print('BEEEEP!')
        print(80 * '-')

We found these comment files in our dataframe
	- /media/habi/Fast_SSD/EAWAG/104061/head/104061.head_rec.md
--------------------------------------------------------------------------------
- /media/habi/Fast_SSD/EAWAG/104061/104061.rec.md
----------
# Fish 104061, Scan rec

This fish was scanned on 2021-07-15T15:23:55 on the SkyScan2214, with a voxel size of 8.95 μm.

## Comments
--------------------------------------------------------------------------------
- /media/habi/Fast_SSD/EAWAG/104061/104061.rec_rescan.md
----------
# Fish 104061, Scan rec_rescan
BEEEEP!

This fish was scanned on 2021-08-20T11:06:31 on the SkyScan2214, with a voxel size of 10.0 μm.

## Comments
--------------------------------------------------------------------------------
- /media/habi/Fast_SSD/EAWAG/104061/head/104061.head_rec.md
----------
# Fish 104061, Scan head_rec

This fish was scanned on 2022-02-18T10:03:36 on the SkyScan2214, with a voxel size of 29.0 μm.

## f
--------------------------------------

60 of the fishes need complete head scans.
Let's go through Mikki's/Kassandra's list and see how far we have progressed through it.

In [79]:
if not running_on_binder:
    # Read in the full head list and collect every fish on it for which the
    # dataframe does not (yet) contain a head scan
    fullheadlist = glob.glob(os.path.join(Root, '*Head*.txt'))[0]
    HeadsToBeScanned = []
    with open(fullheadlist, 'r', encoding='utf-8') as file:
        for ln, line in enumerate(file):
            if not line.strip():  # skip empty lines
                continue
            # The first 'item' on the line should be the fish ID
            fish = line.strip().split()[0].replace(',', '').upper()
            # Let's ignore some lines which don't start with a fish ID
            # The set here removes duplicate characters from the string (e.g. =====, !! and ::)
            if len(set(fish)) <= 2:
                continue
            # Decide per fish, looking at *all* of its scans at once.
            # A flag carried over from the previous fish (when this fish has no
            # rows in the dataframe) or reset by a later non-head scan of the
            # same fish would give wrong answers.
            headdone = any('head' in scan for scan in Data.loc[Data.Fish == fish, 'Scan'])
            if not headdone:
                print('%s is missing a head-scan on disk, but is found on line %s of the full head list' % (fish, ln + 1))
                HeadsToBeScanned.append(fish)

WHERE is missing a head-scan on disk, but is found on line 12 of the full head list
WHERE is missing a head-scan on disk, but is found on line 13 of the full head list
104621 is missing a head-scan on disk, but is found on line 35 of the full head list
2801 is missing a head-scan on disk, but is found on line 44 of the full head list
2800 is missing a head-scan on disk, but is found on line 45 of the full head list
N/A is missing a head-scan on disk, but is found on line 46 of the full head list
N/A is missing a head-scan on disk, but is found on line 47 of the full head list
11447 is missing a head-scan on disk, but is found on line 54 of the full head list
104671 is missing a head-scan on disk, but is found on line 56 of the full head list
AN33 is missing a head-scan on disk, but is found on line 105 of the full head list


In [80]:
if not running_on_binder:
    # Fish 10448 does have a head scan (another scan was done after it), so it
    # does not need to be scanned again. Drop it from the list if it is on it.
    if '10448' in HeadsToBeScanned:
        HeadsToBeScanned.remove('10448')

In [81]:
if not running_on_binder:
    # In which jar should we look for the fishes we still need to scan the head of?
    for fish in HeadsToBeScanned:
        foundfishes = 0
        needle = str(fish).lower()
        for d, row in DataMikki.iterrows():
            # A fish can be listed under any of these four ID columns
            identifiers = (row.Fishec, row.FieldID, row.OtherID, row.ReplacementID)
            if not any(needle in str(identifier).lower() for identifier in identifiers):
                continue
            # Keep only the IDs which are actually filled in (drop nan)
            foundfishes = [str(x).lower() for x in identifiers if not pandas.isnull(x)]
            print('*%s*: A fish called ' % fish, end='')
            if len(foundfishes) > 1:
                for found in foundfishes:
                    print(found.upper(), end='/')
            else:
                print(foundfishes[0].upper(), end='')
            print(' should be found in jar "length=%s cm" (%s))' % (row['Length(cm)'],
                                                                    row['TemporaryJar']))
        if not foundfishes:
            print('*%s*: Nothing found in %s' % (fish, MikkisFile))

*WHERE*: Nothing found in /media/habi/Fast_SSD/EAWAG/X_ArchiveFiles/2022.12.13_CTscanFishList.xlsx
*WHERE*: Nothing found in /media/habi/Fast_SSD/EAWAG/X_ArchiveFiles/2022.12.13_CTscanFishList.xlsx
*104621*: A fish called 104621 should be found in jar "length=7.5 cm" (Mark1))
*2801*: A fish called 2801/NOT YET FOUND?/ should be found in jar "length=11.25 cm" (nan))
*2800*: A fish called 2800/NOT YET FOUND?/ should be found in jar "length=8 cm" (nan))
*N/A*: Nothing found in /media/habi/Fast_SSD/EAWAG/X_ArchiveFiles/2022.12.13_CTscanFishList.xlsx
*N/A*: Nothing found in /media/habi/Fast_SSD/EAWAG/X_ArchiveFiles/2022.12.13_CTscanFishList.xlsx
*11447*: A fish called 11447 should be found in jar "length=7.9 cm" (Mark2))
*104671*: A fish called 158571/104671/ should be found in jar "length=7.5 cm" (Mark5))
*AN33*: A fish called 78773/AN33/ should be found in jar "length=11 cm" (Mark3))


In [82]:
# Some of the reconstructions might need another look;
# Mikki wrote notes about this into the comment files.
# Collect *all* markdown comment files below Root, in any subfolder
comment_pattern = os.path.join(Root, '**', '*.md')
CommentFiles = glob.glob(comment_pattern, recursive=True)

In [83]:
# Print every line of every comment file which mentions one of our keywords,
# prefixed with the file counter and the file path relative to Root
print('Going through all the %s comments files we find' % len(CommentFiles))
for c, cf in enumerate(CommentFiles):
    with open(cf, 'r', encoding='utf-8') as file:
        for line in file:
            if any(keyword in line for keyword in ('Mikki', 'ML', 'realign')):
                print('%03s/%s: %s: %s' % (c, len(CommentFiles), cf[len(Root) + 1:], line.strip()))

Going through all the 378 comments files we find
  0/378: 10151/10151.rec.md: 08.02.2022 - ML - Quality check PNG uploaded to the folder. The artifacts are pretty bad, not sure if re alignment will fix it entirely
  1/378: 10151/10151.rec_oj.md: 2022.08.12 ML - OJ has many artifacts, please do an alignment check
 26/378: 103754/103754.rec_pj.md: 2022.08.12 ML: this rec_pj scan does not contain a full pharyngeal jaw.
 29/378: 103761/103761.rec.md: ML 16.11.2021:  OJ and PJ need rescan at lower voxel where possible.
 30/378: 103761/103761.rec_oj_2.md: 2022.08.13 ML - The upper right jaw teeth are cropped. Can this be salvaged from a re reconstruction of rec_oj_2?
 37/378: 103767/103767.rec.md: ML 16.11.2021 - Tried re reconstructing, OJ has artifacts connected the jaws. Please rescan just the OJ at lower voxel size if possible.
 61/378: 10448/10448.rec.md: - According to Mikki "try re reconstructing to better later segmentation, OJ artifacts, PJ is good", so we 'only' optimize on oral ja

In [None]:
# Sort the dataframe in place by the 'Scan date' column, in descending order
Data.sort_values(['Scan date'], ascending=False, inplace=True)