# Handle and check the 'data' of all the scans we did
Wrestle with the data, check parameters and generate some helper files

In [1]:
import platform
import os
import glob
import pandas
import imageio
import numpy
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn
import dask
import dask_image.imread
from dask.distributed import Client, LocalCluster
from numcodecs import Blosc
import skimage
from tqdm import notebook

In [2]:
# Point Dask at a scratch directory that depends on the machine we run on.
# This has to happen before a client is created: https://stackoverflow.com/a/62804525/323100
import tempfile
system_name = platform.system()
if 'Linux' in system_name:
    tmp = os.path.join(os.sep, 'media', 'habi', 'Fast_SSD')
elif 'Darwin' in system_name:
    tmp = tempfile.gettempdir()
elif 'anaklin' in platform.node():
    # Windows machine whose host name contains 'anaklin'
    tmp = 'F:\\'
else:
    # Any other machine
    tmp = 'D:\\'
scratch_directory = os.path.join(tmp, 'tmp')
dask.config.set({'temporary_directory': scratch_directory})
print('Dask temporarry files go to %s' % dask.config.get('temporary_directory'))

Dask temporarry files go to /media/habi/Fast_SSD/tmp


In [3]:
# Start cluster and client now, after setting tempdir
# The local cluster gets eight workers; the client is then created from that cluster
cluster = LocalCluster(n_workers=8)
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44045 instead


In [4]:
# The dashboard port is read from the scheduler info, since it is not necessarily the default one
# (see the "Hosting the HTTP server on port ... instead" message printed when the cluster started)
print('You can seee what DASK is doing at "http://localhost:%s/status"' % client.scheduler_info()['services']['dashboard'])

You can seee what DASK is doing at "http://localhost:44045/status"


In [5]:
# # Ignore warnings in the notebook
# import warnings
# warnings.filterwarnings("ignore")

In [6]:
# Figure defaults used by every plot in this notebook
plt.rcParams.update({
    'image.cmap': 'gray',  # Display all images in b&w...
    'image.interpolation': 'nearest',  # ...and with 'nearest' interpolation
    'figure.figsize': (16, 9),  # Size up figures a bit
    'figure.dpi': 200,
})

In [7]:
# Scale bar defaults: frameless and white, in the lower right corner
plt.rcParams.update({
    'scalebar.location': 'lower right',
    'scalebar.frameon': False,
    'scalebar.color': 'white',
})

In [8]:
# Display all plots identically
# `lines` is the number of subplot rows; the number of columns then follows from the number of datasets
lines = 3
# And then do something like
# plt.subplot(lines, numpy.ceil(len(Data) / float(lines)), c + 1)

In [11]:
# Where the data is read from depends on `Archive` and on the operating system.
# Archive = True:  read from the mounted research-storage archive (Linux mount point or the R: drive on Windows)
# Archive = False: read from the local `Data` folder in the current working directory
Archive = False
if Archive:
    if 'Linux' in platform.system():
        BasePath = os.path.join(os.sep, 'home', 'habi', 'research-storage-uct', 'Archiv_Tape')
    elif 'Windows' in platform.system():
        BasePath = os.path.join('R:\\Archiv_Tape')
    else:
        # Fail with a clear message instead of a NameError on `BasePath` a few lines further down
        raise RuntimeError('No archive location is known for platform "%s"' % platform.system())
else:
    BasePath = os.path.join(os.getcwd(), 'Data')
# All scans of this project live in one folder below the base path
Root = os.path.join(BasePath, 'Liver-Semela')
print('We are loading all the data from %s' % Root)

We are loading all the data from /home/habi/P/Documents/Semela-Liver/Data/Liver-Semela


In [12]:
def get_pixelsize(logfile):
    """
    Read the pixel size from a scan log file.

    Every line that contains 'Image Pixel' but not 'Scaled' is parsed;
    the value of the last such line is returned, or None if there is none.
    """
    with open(logfile, 'r') as handle:
        sizes = [float(entry.split('=')[1])
                 for entry in handle
                 if 'Image Pixel' in entry and 'Scaled' not in entry]
    return sizes[-1] if sizes else None

In [13]:
def get_projectionsize(logfile):
    """
    How big did we set the camera?

    Reads the 'Number Of Rows' and 'Number Of Columns' entries of the log file
    and returns their product (the number of pixels per projection).
    Returns None if either entry is missing from the log file.
    """
    x = None
    y = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Number Of Rows' in line:
                y = int(line.split('=')[1])
            if 'Number Of Columns' in line:
                x = int(line.split('=')[1])
    # Without both values there is no projection size to report
    if x is None or y is None:
        return None
    return x * y

In [14]:
def get_filter(logfile):
    """
    Get the filter we used while scanning from the scan log file.

    Returns the stripped text after the '=' of the last line containing 'Filter=',
    or None if the log file has no such line.
    """
    whichfilter = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Filter=' in line:
                whichfilter = line.split('=')[1].strip()
    return whichfilter

In [15]:
def get_exposuretime(logfile):
    """
    Get the exposure time from the scan log file.

    Returns the integer after the '=' of the last line containing 'Exposure',
    or None if the log file has no such line.
    """
    exposuretime = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Exposure' in line:
                exposuretime = int(line.split('=')[1])
    return exposuretime

In [16]:
def get_ringartefact(logfile):
    """
    Get the ring artefact correction from the scan log file.

    Returns the integer after the '=' of the last line containing 'Ring Artifact',
    or None if the log file has no such line.
    """
    ringartefactcorrection = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Ring Artifact' in line:
                ringartefactcorrection = int(line.split('=')[1])
    return ringartefactcorrection

In [17]:
def get_reconstruction_grayvalue(logfile):
    """
    How did we map the brightness of the reconstructions?

    Returns the float after the '=' of the last line containing 'Maximum for',
    or None if the log file has no such line.
    """
    grayvalue = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Maximum for' in line:
                grayvalue = float(line.split('=')[1])
    return grayvalue

In [18]:
def get_beamhardening(logfile):
    """
    Get the beam hardening correction from the scan log file.

    Returns the integer after the '=' of the last line containing 'Hardening',
    or None if the log file has no such line.
    """
    beamhardeningcorrection = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Hardening' in line:
                beamhardeningcorrection = int(line.split('=')[1])
    return beamhardeningcorrection

In [19]:
def get_rotationstep(logfile):
    """
    Get the rotation step from the scan log file.

    Returns the float after the '=' of the last line containing 'Rotation Step',
    or None if the log file has no such line.
    """
    rotstep = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Rotation Step' in line:
                rotstep = float(line.split('=')[1])
    return rotstep

In [20]:
def get_frameaveraging(logfile):
    """
    Get the frame averaging from the scan log file.

    Returns the text after the '=' of the last line containing 'Averaging'
    (for example 'ON (3)'), with surrounding whitespace and the line break removed,
    or None if the log file has no such line.
    """
    avg = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Averaging' in line:
                # Strip the value, otherwise the trailing newline ends up in the dataframe
                avg = line.split('=')[1].strip()
    return avg

In [21]:
def get_machine(logfile):
    """
    Get the machine we used to scan.

    Returns the stripped text after the '=' of the last line containing 'Scanner',
    or None if the log file has no such line.
    """
    machine = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Scanner' in line:
                machine = line.split('=')[1].strip()
    return machine

In [46]:
def get_operator(logfile):
    """
    Read the name of the operator who scanned the sample.

    The value of the last line containing 'User Name' is returned (stripped),
    or None if the log file has no such line.
    """
    with open(logfile, 'r') as handle:
        names = [entry.split('=')[1].strip() for entry in handle if 'User Name' in entry]
    return names[-1] if names else None

In [84]:
def get_experiment(i):
    '''
    Categorize a sample name into 'Notch' or 'Control'.

    'notch' is checked before 'ctrl'; names containing neither give None.
    '''
    for marker, experiment in (('notch', 'Notch'), ('ctrl', 'Control')):
        if marker in i:
            return experiment
    return None

In [22]:
def get_scantime(logfile):
    """
    How long did we scan?

    Returns the value of the last line containing 'Scan duration' as a pandas
    Timedelta, or pandas.NaT if the log file has no such line.
    """
    duration = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Scan duration' in line:
                duration = line.split('=')[1].strip()
    # A missing entry becomes "not a time" instead of an UnboundLocalError
    if duration is None:
        return pandas.NaT
    return pandas.to_timedelta(duration)

In [23]:
def get_stacks(logfile):
    """
    Count the stacks/connected scans of a scan.

    The value of the last line containing 'conn' is returned as an integer;
    log files without such a line count as a single stack.
    """
    with open(logfile, 'r') as handle:
        counts = [int(entry.split('=')[1]) for entry in handle if 'conn' in entry]
    return counts[-1] if counts else 1

In [24]:
def get_scandate(logfile, verbose=False):
    """
    When did we scan the sample?

    Parses the last line containing 'Study Date and Time' with the format
    '%d %b %Y %Hh:%Mm:%Ss' (double spaces are collapsed first) and returns
    the date as an ISO 8601 string. Returns None if the log file has no such line.

    With verbose=True the matching line, the date string and the parsed date are printed.
    """
    date = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Study Date and Time' in line:
                if verbose:
                    print('Found "date" line: %s' % line.strip())
                datestring = line.split('=')[1].strip().replace('  ', ' ')
                if verbose:
                    print('The date string is: %s' % datestring)
                date = pandas.to_datetime(datestring, format='%d %b %Y %Hh:%Mm:%Ss')
                if verbose:
                    print('Parsed to: %s' % date)
    # No date line in the log file: nothing to format
    if date is None:
        return None
    return date.isoformat()

In [25]:
def get_git_hash():
    '''
    Return the short git hash of HEAD of the repository in the current working directory.

    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    '''
    import os
    import subprocess
    command = ['git',
               '--git-dir', os.path.join(os.getcwd(), '.git'),
               'rev-parse', '--short', '--verify', 'HEAD']
    # Only stdout is captured; whatever git writes to stderr is passed through
    finished = subprocess.run(command, stdout=subprocess.PIPE)
    return finished.stdout.strip().decode("utf-8")

In [26]:
# # Make directory for output
# OutPutDir = os.path.join(os.getcwd(), 'Output', get_git_hash())
# print('We are saving all the output to %s' % OutPutDir)
# os.makedirs(OutPutDir, exist_ok=True)

Mario Novkovic told us that 
> We have used the ds17 livers in the paper, specifically ctrl4 and notch1_2 in the first batch (training dataset), while the second batch consisted of 3 datasets from each mouse type: ctrl1, ctrl2, ctrl5 and notch1_1, notch1_3, notch1_4.

So let's only use *those* folders for the remainder of the notebook.
We copied all the relevant data from the archive to the `Data`-subfolder here with
````bash
rsync --verbose --recursive --times --update --omit-dir-times --include="*/" --include="*.?og" --include="*.c?v" --include="*.?oi" --include="*.?at" --include="*_spr*.bmp" --include="*.txt" --include="*.md" --include="*.mp" --include="*.sb" --include="*.info" --include="*.?nc" --include="*.bkp" --exclude="*" ~/research-storage-uct/Archiv_Tape/Liver-Semela/ /home/habi/P/Documents/Semela-Liver/Data/

````
(which is our standard `rsync` blurb for putting stuff *to* the archive, but without the `*.tif` files, so we get back all the relevant things :).

We then delete all non-`ds17*`-folders with
````bash
find . -path './ds*' -prune -o -name '*' -delete -depth
````
(which does issue a warning because of `-depth`, but leaves us with only the `ds17_*`-folders :) )

In [60]:
# These are the folders that were used according to Mario.
# The first two are the training datasets he mentions, the remaining six are the second batch.
# NOTE(review): `whichones` is not referenced by any later cell visible here — TODO confirm it is still needed
whichones = ['ctrl4', 'notch1_2', 'ctrl1', 'ctrl2', 'ctrl5', 'notch1_1', 'notch1_3', 'notch1_4']

Now that we have *all* the necessary data (and some more), let's get to work!

In [55]:
# Make us a dataframe for saving all that we need
# It starts out empty; the columns are added one by one in the cells below
Data = pandas.DataFrame()

In [56]:
# Collect every *.log file below Root (at any depth), sorted by path
logfile_pattern = os.path.join(Root, '**', '*.log')
Data['LogFile'] = sorted(glob.glob(logfile_pattern, recursive=True))

In [96]:
# Derive folder, sample, scan, subfolder and experiment name from the path of each log file
Data['Folder'] = [os.path.dirname(logfile) for logfile in Data['LogFile']]
# Path components below Root; index 0 is the empty string in front of the first separator
path_parts = [folder[len(Root):].split(os.path.sep) for folder in Data['Folder']]
Data['Sample'] = [parts[1].replace('ds17_','') for parts in path_parts]
Data['Scan'] = [parts[2] for parts in path_parts]
Data['Subfolder'] = [parts[3] for parts in path_parts]
Data['Experiment'] = [get_experiment(sample) for sample in Data['Sample']]

In [97]:
# Check what we did there...
# Print every column value of the row at position 33
# NOTE(review): 33 looks like an arbitrarily chosen example row — TODO confirm
for i in Data.iloc[33]:
    print(i)

/home/habi/P/Documents/Semela-Liver/Data/Liver-Semela/ds17_notch1_1/highresolution/proj/ds17_notch1_1~00.log
/home/habi/P/Documents/Semela-Liver/Data/Liver-Semela/ds17_notch1_1/highresolution/proj
Notch
notch1_1
highresolution
proj


In [98]:
# Display the dataframe we have built so far
Data

Unnamed: 0,LogFile,Folder,Experiment,Sample,Scan,Subfolder
0,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal,highresolution,proj
1,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal,highresolution,proj
2,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal,highresolution,proj
3,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal,highresolution,rec
4,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal_rescan,highresolution,proj
5,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal_rescan,highresolution,proj
6,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal_rescan,highresolution,proj
7,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal_rescan,highresolution,rec
8,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl2_portal_rescan,highresolution,proj
9,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl2_portal_rescan,highresolution,proj


In [91]:
# Keep only the scans whose name contains 'high' (the 'highresolution' scans)
is_highresolution = pandas.Series(['high' in scan for scan in Data['Scan']],
                                  index=Data.index,
                                  dtype=bool)
# Number the remaining rows from zero again, as if the other scans had never been loaded
Data = Data.loc[is_highresolution].reset_index(drop=True)

In [104]:
# Show us a subset of the data
# No random_state is passed, so a different set of five rows is shown on every run
Data.sample(n=5)

Unnamed: 0,LogFile,Folder,Experiment,Sample,Scan,Subfolder
19,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl5_cava,highresolution,proj_20um_1172
2,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal,highresolution,proj
4,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl1_portal_rescan,highresolution,proj
38,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Notch,notch1_1_rescan,highresolution,proj
44,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Notch,notch1_3_rescan,highresolution,rec


In [111]:
# What do we have?
# Print each distinct sample name once, in order of first appearance in the dataframe
for i in Data.Sample.unique():
    print(i)

ctrl1_portal
ctrl1_portal_rescan
ctrl2_portal_rescan
ctrl4_cava
ctrl5_cava
ctrl5_cava_rescan
notch1_1
notch1_1_rescan
notch1_3_rescan
notch1_4
notch1_4_rescan


In [113]:
# Print the sorted names of the subset of samples we are interested in
# NOTE(review): `Subset` is not defined in any cell visible here; the printed output
# matches the sorted entries of `whichones` — TODO confirm where it comes from
for i in sorted(Subset):
    print(i)

ctrl1
ctrl2
ctrl4
ctrl5
notch1_1
notch1_2
notch1_3
notch1_4


So it seems that Mario et al. simply used the `highresolution` scans.

In [None]:
# List the distinct subfolder names of every sample
for sample in Data['Sample'].unique():
    subfolders_of_sample = Data.loc[Data['Sample'] == sample, 'Subfolder']
    print(subfolders_of_sample.unique())

In [115]:
# Read the operator from every log file and show which operators there are
Data['Operator'] = [get_operator(log) for log in Data['LogFile']]
print(Data.Operator.unique())

['haberthu']


All scanned by me :)

In [124]:
# Get parameters to doublecheck from logfiles
# Each column is filled by calling the matching get_* helper once per log file.
# NOTE(review): the 'ProjectionSize', 'CameraWindow', 'RingartefactCorrection', 'BeamHardeningCorrection'
# and 'Scan date' columns are disabled here, but 'Scan date' and 'CameraWindow' are used by later cells
Data['Voxelsize'] = [get_pixelsize(log) for log in Data['LogFile']]
Data['Filter'] = [get_filter(log) for log in Data['LogFile']]
Data['Exposuretime'] = [get_exposuretime(log) for log in Data['LogFile']]
Data['Scanner'] = [get_machine(log) for log in Data['LogFile']]
Data['Averaging'] = [get_frameaveraging(log) for log in Data['LogFile']]
# Data['ProjectionSize'] = [get_projectionsize(log) for log in Data['LogFile']]
Data['RotationStep'] = [get_rotationstep(log) for log in Data['LogFile']]
# Data['CameraWindow'] = [round((ps ** 0.5)/100)*100  for ps in Data['ProjectionSize']]
Data['Grayvalue'] = [get_reconstruction_grayvalue(log) for log in Data['LogFile']]
# Data['RingartefactCorrection'] = [get_ringartefact(log) for log in Data['LogFile']]
# Data['BeamHardeningCorrection'] = [get_beamhardening(log) for log in Data['LogFile']]
# Data['Scan date'] = [get_scandate(log) for log in Data['LogFile']]
Data['Scan time'] = [get_scantime(log) for log in Data['LogFile']]
Data['Stacks'] = [get_stacks(log) for log in Data['LogFile']]
# Total scan time is the time of one stack multiplied by the number of stacks
Data['Scan time total'] = [ st * stk  for st, stk in zip(Data['Scan time'], Data['Stacks'])]

In [125]:
# Show five randomly picked rows, now including the parameters read from the log files
Data.sample(n=5)

Unnamed: 0,LogFile,Folder,Experiment,Sample,Scan,Subfolder,Operator,Voxelsize,Filter,Exposuretime,Scanner,Averaging,RotationStep,Grayvalue,Scan time,Stacks,Scan time total
9,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl2_portal_rescan,highresolution,proj,haberthu,5.000018,Cu 0.11mm,4400,SkyScan1272,ON (3)\n,0.1,0.033987,0 days 06:57:02,3,0 days 20:51:06
10,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl2_portal_rescan,highresolution,proj,haberthu,5.000018,Cu 0.11mm,4400,SkyScan1272,ON (3)\n,0.1,,0 days 06:57:02,3,0 days 20:51:06
28,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl5_cava_rescan,highresolution,proj,haberthu,5.000018,Cu 0.11mm,4400,SkyScan1272,ON (3)\n,0.1,,0 days 06:57:02,2,0 days 13:54:04
30,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Control,ctrl5_cava_rescan,highresolution,rec,haberthu,5.000018,Cu 0.11mm,4400,SkyScan1272,ON (3)\n,0.1,0.115389,0 days 06:57:02,2,0 days 13:54:04
50,/home/habi/P/Documents/Semela-Liver/Data/Liver...,/home/habi/P/Documents/Semela-Liver/Data/Liver...,Notch,notch1_4_rescan,highresolution,proj,haberthu,5.000018,Cu 0.11mm,4400,SkyScan1272,ON (3)\n,0.1,,0 days 06:57:02,2,0 days 13:54:04


In [126]:
# NOTE(review): this line is not valid Python and raises a SyntaxError;
# presumably it is a deliberate stop so that 'Run All' halts before the cells that follow — TODO confirm
asdfasdf==

SyntaxError: invalid syntax (263294990.py, line 1)

In [36]:
# Generate a text file for each rec-folder, in which we can note what's going on with the fish
# NOTE(review): this cell reads `row.Fish` and `row['Scan date']`; neither column is created
# in the cells visible here ('Scan date' is commented out) — TODO confirm before running
# Generate filename
# The comment file is placed one level above the folder of the log file and named <Fish>.<Scan>.md
for c,row in Data.iterrows():
    Data.at[c, 'CommentFile'] = os.path.join(os.path.dirname(row.Folder),
                                              row.Fish + '.' + row.Scan + '.md')
# Create actual file on disk
for c,row in Data.iterrows():
    # Only do this if the file does not already exist
    if not os.path.exists(row.CommentFile):
        with open(row.CommentFile, 'w', encoding='utf-8') as f:
            f.write('# Fish %s, Scan %s\n\n' % (row.Fish, row.Scan))
            f.write('This fish was scanned on %s on the %s, with a voxel size of %s μm.\n\n'
                    % (row['Scan date'], row.Scanner, numpy.round(row.Voxelsize, 2)))
            # NOTE(review): the existing comment files printed further down have a '## Comments'
            # heading, so '## f' looks truncated — TODO confirm
            f.write('## f')

In [37]:
# # https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
# columns = list(Data)
# columns.remove('Folder') 
# columns.remove('Fish')
# columns.remove('LogFile')
# columns.remove('Reconstructions')
# columns.remove('Number of reconstructions')
# columns.remove('Grayvalue')
# columns.remove('Scan time')
# columns.remove('Scan time total')
# columns.remove('Scan date')
# print(columns)
# for col in columns:
#     print(col)
#     print(Data[col].unique())
#     print(80*'-')    

In [38]:
# # Check voxel sizes (*rounded* to two after-comma values)
# # If different, spit out which values
# roundto = 2
# if len(Data['Voxelsize'].round(roundto).unique()) > 1:
#     print('We scanned all datasets with %s different voxel sizes' % len(Data['Voxelsize'].round(roundto).unique()))
#     for vs in sorted(Data['Voxelsize'].round(roundto).unique()):
#         print('-', vs, 'um for ', end='')
#         for c, row in Data.iterrows():
#             if float(vs) == round(row['Voxelsize'], roundto):
#                 print(os.path.join(row['Fish'], row['Scan']), end=', ')
#         print('')
# else:
#     print('We scanned all datasets with equal voxel size, namely %s um.' % float(Data['Voxelsize'].round(roundto).unique()))

In [39]:
# if len(Data['Grayvalue'].unique()) > 1:
#     print('We reconstructed the datasets with different maximum gray values, namely')
#     for gv in Data['Grayvalue'].unique():
#         print(gv, 'for Samples ', end='')
#         for c, row in Data.iterrows():
#             if float(gv) == row['Grayvalue']:
#                 print(os.path.join(row['Fish'], row['Scan']), end=', ')
#         print('')
# else:
#     print('We reconstructed all datasets with equal maximum gray value, namely %s.' % Data['Grayvalue'].unique()[0])

In [40]:
# Data[['Fish', 'Scan',
#       'Voxelsize', 'Scanner',
#       'Scan date', 'CameraWindow', 'RotationStep', 'Averaging',
#       'Scan time', 'Stacks', 'Scan time total']]

In [41]:
# Summarize the total scan time, overall and per scanner
# Nice output based on https://stackoverflow.com/a/8907407/323100
def _hours_and_minutes(duration):
    """Split a timedelta into whole hours and the remaining whole minutes"""
    whole_seconds = int(duration.total_seconds())
    full_hours, leftover_seconds = divmod(whole_seconds, 60 * 60)
    return full_hours, leftover_seconds // 60


print('In total, we scanned for %s hours and %s minutes)' % _hours_and_minutes(Data['Scan time total'].sum()))
for machine in Data['Scanner'].unique():
    scans_on_machine = Data[Data['Scanner'] == machine]
    hours, minutes = _hours_and_minutes(scans_on_machine['Scan time total'].sum())
    print('\t - Of these, we scanned %s hours and %s minutes on the %s,'
          'for %s scans' % (hours, minutes, machine, len(scans_on_machine)))

In total, we scanned for 345 hours and 36 minutes)
	 - Of these, we scanned 157 hours and 38 minutes on the SkyScan2214,for 71 scans
	 - Of these, we scanned 187 hours and 57 minutes on the SkyScan1272,for 19 scans


In [42]:
# Write the most important scan details to 'Details.xlsx' in the current working directory
# NOTE(review): 'Fish', 'Scan date' and 'CameraWindow' are not created in the cells visible here,
# so this column selection fails unless they are added first — TODO confirm
Data[['Fish', 'Scan',
      'Voxelsize', 'Scanner',
      'Scan date', 'CameraWindow', 'RotationStep', 'Averaging', 'Scan time', 'Stacks' ]].to_excel('Details.xlsx')

In [43]:
# Write the same scan details to 'Details.xlsx' inside the Root data folder
# NOTE(review): 'Fish', 'Scan date' and 'CameraWindow' are not created in the cells visible here,
# so this column selection fails unless they are added first — TODO confirm
Data[['Fish', 'Scan',
      'Voxelsize', 'Scanner',
      'Scan date', 'CameraWindow',
      'RotationStep', 'Averaging', 'Scan time', 'Stacks' ]].to_excel(os.path.join(Root,'Details.xlsx'))

In [44]:
# Read Mikki's datafile
# Of all '*CTscanFishList.xlsx' files in the 'X_ArchiveFiles' folder, the first one in sorted order is used
MikkisFile = sorted(glob.glob(os.path.join(Root, 'X_ArchiveFiles', '*CTscanFishList.xlsx')))[0]
# Read the excel file with the default settings (no index column is specified)
print('Reading in %s' % MikkisFile)
DataMikki = pandas.read_excel(MikkisFile)

Reading in D:\Results\EAWAG\X_ArchiveFiles\02.07.2021_CTscanFishList.xlsx


In [45]:
# Show the first rows of the list we just read
DataMikki.head()

Unnamed: 0,Fishec,FieldID,OtherID,ReplacementID,Length(cm),TemporaryJar,Genus,Species,Ecology,Scan date,HeadScan,OralJawScan,PharyngealJawScan,OperculumVisible,DataUploaded,QualityChecked,ScanComments,SpecimenReturned,Comments
0,103635,,,,< 7,< 7cm,"""Astatotilapia""",nubila swamp blue,insectivore,2021-02-08T12:25:19,no 20um headscan,yes,yes,no 20um headscan,,,2-3 inner row of tricuspid teeth,,
1,103635,,,,< 7,< 7cm,"""Astatotilapia""",nubila swamp blue,insectivore,2021-02-08T14:24:12,no 20um headscan,yes,yes,no 20um headscan,,,2-3 inner row of tricuspid teeth,,
2,104016,,,,< 7,< 7cm,Enterochromis I,cinctus (St. E),detritivore,2021-02-04T11:21:23,no,yes,not complete,no,,,pharyngeal jaw not complete,,
3,104016,,,,< 7,< 7cm,Enterochromis I,cinctus (St. E),detritivore,2021-02-04T13:30:11,,,,,,,,,
4,14298,,,,< 7,< 7cm,Incertae sedis,thick skin,insectivore,,no,yes,,no,,,bad segmentation quality,,


In [75]:
# Find the fish we look at and display all the info we know about it
# Set a substring you're looking for to the variable below
# The cells below match it as a case-insensitive substring against the ID columns of DataMikki
# In which jar can we find it?
fish = '104061'

In [76]:
# In which jar should it be/go?
# Every row in which one of the ID columns contains the search string is reported
needle = str(fish).lower()
id_columns = ('Fishec', 'FieldID', 'OtherID', 'ReplacementID')
foundfishes = 0
for d, row in DataMikki.iterrows():
    identifiers = tuple(row[column] for column in id_columns)
    if not any(needle in str(identifier).lower() for identifier in identifiers):
        continue
    # Keep only the IDs that are actually filled in
    foundfishes = [str(identifier).lower() for identifier in identifiers if not pandas.isnull(identifier)]
    print('*%s*: The fish ' % fish, end='')
    if len(foundfishes) > 1:
        # Several IDs are printed one after the other, each followed by a slash
        print(''.join(name.upper() + '/' for name in foundfishes), end='')
    else:
        print(foundfishes[0].upper(), end='')
    print(' should now go in jar "length=%s cm" (%s))' % (row['Length(cm)'],
                                                          row['TemporaryJar']))
if not foundfishes:
    print('*%s*: Nothing found in %s' % (fish, MikkisFile))

*104061*: The fish 13405/104061/ should now go in jar "length=14.5 cm" (Mark5))


In [77]:
# Look for folders/files below Root whose name contains the fish ID
ondisk = glob.glob(os.path.join(Root, '*%s*' % fish))
# Remember as a flag (1/0) whether anything was found
foundondisk = 1 if len(ondisk) else 0
if foundondisk:
    for found in ondisk:
        print('*%s*: Found on disk in %s' % (fish, found))
else:
    print('*%s*: Nothing found in %s' % (fish, Root))

*104061*: Found on disk in D:\Results\EAWAG\104061


In [78]:
# Did we scan it already?
# NOTE(review): this cell reads the 'Fish' and 'Scan date' columns, which are not created
# in the cells visible here — TODO confirm
found = 0
for c, row in Data.iterrows():
    if fish in row.Fish:
        print('*%s*: Sample %s/%s was scanned on %s' % (fish, row['Fish'], row['Scan'], row['Scan date']))
        found = 1
if not found:
    # Nothing in the dataframe; tell apart whether there is at least a folder on disk
    if foundondisk:
        print('*%s*: We have a folder (%s) for this sample, but nothing in the dataframe, so it probably is all good' % (fish, ondisk[0]))
        print('Check the folder to be shure')
    else:
        print('*%s*: Nothing about this sample is found in our dataframe' % fish)

*104061*: We have a folder (D:\Results\EAWAG\104061) for this sample, but nothing in the dataframe, so it probably is all good
Check the folder to be shure


In [79]:
# Can we find it in FullHeadList.txt?
def findinFullHeadList(sample):
    '''
    Look for the sample in the first 'FullHeadList.*' file found in Root.

    Every line containing the sample is printed (stripped).
    Returns None if at least one line matched, otherwise a 'Nothing found' message.
    '''
    fullheadlist = glob.glob(os.path.join(Root, 'FullHeadList.*'))[0]
    needle = str(sample)
    hits = 0
    with open(fullheadlist, 'r') as listing:
        for entry in listing:
            if needle not in entry:
                continue
            print(entry.strip())
            hits += 1
    if hits:
        return None
    return '*%s*: Nothing found in %s' % (sample, fullheadlist)
# Look up the currently selected fish; the 'Nothing found' message, if any, is shown as the cell result
findinFullHeadList(fish)

104061, Labrochromis sp. "stone" (pharyngeal mollusc crusher), head cropped


In [88]:
# Do we need to rescan this fish
# Find all relevant comment files
# (all *.md files at any depth below the folders in Root whose name contains the fish ID)
commentfiles = glob.glob(os.path.join(Root, '*%s*' % fish, '**', '*.md'), recursive=True)
print('We found these comment files in our dataframe')
# NOTE(review): 'Fish' and 'CommentFile' are only available if the corresponding cells above ran — TODO confirm
for c, row in Data.iterrows():
    if fish in row.Fish:
        print('\t-', row.CommentFile)
        # `found` is set here but not read again in this cell
        found = 1
print(80*'-')
if len(commentfiles):
    for commentfile in commentfiles:
        print('-', commentfile)
        print(10*'-')
        with open(commentfile, 'r', encoding='utf-8') as file:
            for line in file:
                print(line.strip())
                # Flag every line mentioning 'rescan' (this also matches the 'rec_rescan' scan name in a heading)
                if 'rescan' in line:
                    print('BEEEEP!')
        print(80*'-')

We found these comment files in our dataframe
--------------------------------------------------------------------------------
- D:\Results\EAWAG\104061\104061.rec.md
----------
# Fish 104061, Scan rec

This fish was scanned on 2021-07-15T15:23:55 on the SkyScan2214, with a voxel size of 8.95 μm.

## Comments
--------------------------------------------------------------------------------
- D:\Results\EAWAG\104061\104061.rec_rescan.md
----------
# Fish 104061, Scan rec_rescan
BEEEEP!

This fish was scanned on 2021-08-20T11:06:31 on the SkyScan2214, with a voxel size of 10.0 μm.

## Comments
--------------------------------------------------------------------------------


60 of the fishes need complete head scans.
Let's go through Mikki's/Kassandra's list and see how far we have progressed through it.

In [61]:
# Read in the full head list, go through all the scans we already did and see what still needs to be done
# The first '*Head*.txt' file found in Root is used as the list
fullheadlist = glob.glob(os.path.join(Root, '*Head*.txt'))[0]
HeadsToBeScanned = []
with open(fullheadlist, 'r', encoding='utf-8') as file:
    headdone = False
    for ln, line in enumerate(file):
        if not line.strip():  # skip empty lines
            continue
        # The first 'item' on the line should be the fish ID
        fish = line.strip().split()[0].replace(',', '').upper()
        # Let's ignore some lines which don't start with a fish ID
        # The set-join here removes duplicate characters from the string (e.g. =====, !! and ::)
        if len(''.join(set(fish))) <= 2:
            continue
        # A fish is done as soon as *any* of its scans is a head scan.
        # This is evaluated freshly for every fish, so the result neither depends on
        # the order of the scans nor carries over from the previous fish in the list.
        headdone = any('head' in scan for scan in Data[Data.Fish == fish].Scan)
        if not headdone:
            print('%s is missing a head-scan on disk, but is found on line %s of the full head list' % (fish, ln + 1 ))
            HeadsToBeScanned.append(fish)

WHERE is missing a head-scan on disk, but is found on line 12 of the full head list
WHERE is missing a head-scan on disk, but is found on line 13 of the full head list
105105 is missing a head-scan on disk, but is found on line 94 of the full head list


In [62]:
# Fish 10448 is taken off the list by hand: it does have a head scan, but another scan was done afterwards
# We could probably do it in a more clever way, but already spent too much time on this part :)
# Only remove it if it is on the list, so that this cell can be run more than once
if '10448' in HeadsToBeScanned:
    HeadsToBeScanned.remove('10448')
    # HeadsToBeScanned.remove('105515')

In [63]:
# Repeat the jar lookup for every fish that still needs a head scan
# (same logic as the single-fish jar lookup further up, with slightly different wording)
for fish in HeadsToBeScanned:
    # In which jar should we look for the fishes we still need to scan the head of?
    foundfishes = 0
    for d, row in DataMikki.iterrows():
        # Case-insensitive substring match against each of the four ID columns
        if (str(fish).lower() in str(row.Fishec).lower()) or \
        (str(fish).lower() in str(row.FieldID).lower()) or \
        (str(fish).lower() in str(row.OtherID).lower()) or \
        (str(fish).lower() in str(row.ReplacementID).lower()):
            foundfishes = (row.Fishec, row.FieldID, row.OtherID, row.ReplacementID)
            # remove nan from the list of hits
            foundfishes = [str(x).lower() for x in foundfishes if pandas.isnull(x) == False]
            print('*%s*: A fish called ' % fish, end='')        
            if len(foundfishes) > 1:
                # Several IDs are printed one after the other, each followed by a slash
                for found in foundfishes:
                    print(found.upper(), end='/')
            else:
                print(foundfishes[0].upper(), end='')
            print(' should be found in jar "length=%s cm" (%s))' % (row['Length(cm)'],
                                                                    row['TemporaryJar']))
    # `foundfishes` is still 0 if no row matched this fish
    if not foundfishes:
        print('*%s*: Nothing found in %s' % (fish, MikkisFile))

*WHERE*: Nothing found in D:\Results\EAWAG\X_ArchiveFiles\02.07.2021_CTscanFishList.xlsx
*WHERE*: Nothing found in D:\Results\EAWAG\X_ArchiveFiles\02.07.2021_CTscanFishList.xlsx
*105105*: A fish called 105105 should be found in jar "length=15 cm" (Mark1))


In [97]:
# Some of the reconstructions need to be looked at?
# Mikki wrote something about this into the files.
# Get a list of *all* comment files
# (every *.md file at any depth below Root; the order is whatever glob returns)
CommentFiles = glob.glob(os.path.join(Root, '**', '*.md'), recursive=True)

- 10448\10448.rec.md - According to Mikki "try re reconstructing to better later segmentation, OJ artifacts, PJ is good", so we 'only' optimize on oral jaw.
- 10628\head_13um\10628.rec.md Needs to be rescanned, according to XLS sheet from Mikki
- 109188\109188.rec.md 2.11.21, DH: The fish was not aligned nicely perpendicular in the sample holder. I re-reconstructed the data *without* a ROI. Parts of the OJ might still be outside of the visible region. @Mikki, can you double-check?
- 11807\11807.rec.md - Mikki and David need to discuss this in detail.
- IG92\IG92.rec.md Needs to be rescanned, according to Mikki not better through re-reconstruction.
- IG96\IG96.rec.md According to Mikki not good, even if re-reconstructed.
- ZuOS148\ZuOS148.rec.md which Mikki will update in the encompassing Excel sheet as ZuOS148. 21.10.2021 DH


In [118]:
# Print every line of every comment file that mentions one of our keywords
print('Going through all the %s comments files we find' % len(CommentFiles))
keywords = ('Mikki', 'ML', 'realign')
for c, cf in enumerate(CommentFiles):
    with open(cf, 'r', encoding='utf-8') as file:
        for line in file:
            # A line is printed once, no matter how many of the keywords it contains
            if any(keyword in line for keyword in keywords):
                print('%03s/%s: %s: %s' % (c, len(CommentFiles), cf[len(Root)+1:], line.strip()))

Going through all the 239 comments files we find
  0/239: 10151\10151.rec.md: 08.02.2022 - ML - Quality check PNG uploaded to the folder. The artifacts are pretty bad, not sure if re alignment will fix it entirely
 20/239: 103761\103761.rec.md: ML 16.11.2021:  OJ and PJ need rescan at lower voxel where possible.
 23/239: 103767\103767.rec.md: ML 16.11.2021 - Tried re reconstructing, OJ has artifacts connected the jaws. Please rescan just the OJ at lower voxel size if possible.
 38/239: 10448\10448.rec.md: - According to Mikki "try re reconstructing to better later segmentation, OJ artifacts, PJ is good", so we 'only' optimize on oral jaw.
 64/239: 10618\10618.head_rec.md: ML 15.11.2021: Edge of operculum missing in the head scan
 67/239: 10619\10619.rec.md: ML 15.11.2021: Needs a full head scan
 75/239: 10628\head_13um\10628.rec.md: Needs to be rescanned, according to XLS sheet from Mikki
 75/239: 10628\head_13um\10628.rec.md: ML 15.11.2021:  Reconfirming, needs rescanning - for full h

In [112]:
# Sort the dataframe in place, newest scan first
# NOTE(review): no 'Scan date' column is created in the cells visible here (its assignment is
# commented out), so this fails unless that line is re-enabled — TODO confirm
Data.sort_values(['Scan date'], ascending=False, inplace=True)