# Extract all the data of all the scans

Wrestle with all the log files of all the scans.
We double-check all scanning and reconstruction parameters to look for inconsistencies to be corrected.
At the end, we generate some helper files which we need for our collaboration.

First set up the notebook with some imports and defaults.

In [None]:
# Load the python modules we need
import platform
import os
import glob
import pandas
import imageio
import numpy
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn
import dask
import dask_image.imread
from dask.distributed import Client, LocalCluster
import skimage
from tqdm import notebook

In [None]:
# Load our own log file parsing code
from BrukerSkyScanLogfileRuminator.parsing_functions import *

In [None]:
# Point dask at a directory for its temporary files
# This has to happen before a client is created: https://stackoverflow.com/a/62804525/323100
import tempfile
operating_system = platform.system()
fast_ssd = os.path.join(os.sep, 'media', 'habi', 'Fast_SSD')
if 'Linux' in operating_system and os.path.exists(fast_ssd):
    # The Fast_SSD is mounted, so put the temporary files there
    tmp = os.path.join(fast_ssd, 'tmp')
elif 'Linux' in operating_system or 'Darwin' in operating_system:
    # Use the standard temporary directory
    tmp = tempfile.gettempdir()
elif 'anaklin' in platform.node():
    tmp = 'F:\\tmp'
else:
    tmp = 'D:\\tmp'
dask.config.set({'temporary_directory': tmp})
print('Dask temporary files go to %s' % dask.config.get('temporary_directory'))

In [None]:
from dask.distributed import Client
client = Client()

In [None]:
client

In [None]:
print('You can seee what DASK is doing at "http://localhost:%s/status"' % client.scheduler_info()['services']['dashboard'])

In [None]:
# Set up figure defaults
plt.rc('image', cmap='gray', interpolation='nearest')  # Display all images in b&w and with 'nearest' interpolation
plt.rcParams['figure.figsize'] = (16, 9)  # Size up figures a bit
plt.rcParams['figure.dpi'] = 200

In [None]:
# Setup scale bar defaults
plt.rcParams['scalebar.location'] = 'lower right'
plt.rcParams['scalebar.frameon'] = False
plt.rcParams['scalebar.color'] = 'white'

Since the (tomographic) data can reside on different drives, we set the folder to use below.

In [None]:
# Choose where to load the data from, depending on the operating system and the switches below
FastSSD = False
nanoct = False  # Load the data directly from the 2214
overthere = True  # Load the data directly from the iee-research_storage drive
# to speed things up significantly
system = platform.system()
if 'Linux' in system:
    # All Linux locations are absolute paths, only their components differ
    if FastSSD:
        linux_parts = ('media', 'habi', 'Fast_SSD')
    elif overthere:
        linux_parts = ('home', 'habi', 'research-storage-iee', 'microCT')
    elif nanoct:
        linux_parts = ('home', 'habi', '2214')
    else:
        linux_parts = ('home', 'habi', '1272')
    BasePath = os.path.join(os.sep, *linux_parts)
elif 'Darwin' in system:
    FastSSD = False
    BasePath = '/Users/habi/Dev/EAWAG/Data'
elif 'Windows' in system:
    if FastSSD:
        BasePath = 'F:\\'
    elif overthere:
        BasePath = os.path.join('\\\\resstore.unibe.ch', 'iee_aqua', 'microCTupload')
    elif nanoct:
        BasePath = 'N:\\'
    else:
        BasePath = 'D:\\Results'
# On the research storage the data sits directly in the base folder, otherwise in an 'EAWAG' subfolder
Root = BasePath if overthere else os.path.join(BasePath, 'EAWAG')
print('We are loading all the data from %s' % Root)

We generate some output in this notebook.
To make all the data completely reproducible, save the output to a directory named according to the current `git` hash of the repository.

In [None]:
def get_git_hash():
    '''
    Return the abbreviated hash of `HEAD` of the git repository
    found in the current working directory, as a string.
    The string is whatever `git rev-parse` writes to its standard output,
    stripped of surrounding whitespace.
    The return code of `git` is not checked.
    Based on http://stackoverflow.com/a/949391/323100 and
    http://stackoverflow.com/a/18283905/323100
    '''
    import os
    import subprocess
    command = ['git',
               '--git-dir', os.path.join(os.getcwd(), '.git'),
               'rev-parse', '--short', '--verify', 'HEAD']
    # Only the standard output is captured, error messages of git still go to the terminal
    finished = subprocess.run(command, stdout=subprocess.PIPE)
    return finished.stdout.strip().decode("utf-8")

In [None]:
# Make directory for output
OutPutDir = os.path.join(os.getcwd(), 'Output', get_git_hash())
print('We are saving all the output to %s' % OutPutDir)
os.makedirs(OutPutDir, exist_ok=True)

Now that we are set up, actually start to load/ingest the data.

In [None]:
# Make us a dataframe for saving all that we need
Data = pandas.DataFrame()

In [None]:
# Get *all* log files, unsorted but fast
Data['LogFile'] = [os.path.join(root, name)
                   for root, dirs, files in os.walk(Root)
                   for name in files
                   if name.endswith((".log"))]

In [None]:
# # Let's try to optimize the timing, based on https://stackoverflow.com/a/27565420/323100
# n, t = 0, time.time()
# LogFiles = [os.path.join(root, name)
#             for root, dirs, files in os.walk(Root)
#             for name in files
#             if name.endswith((".log"))]
# t = time.time() - t
# print("os.walk: %.4fs, %d files found" % (t, len(LogFiles)))

# n, t = 0, time.time()
# globfiles = [f for f in glob.glob(os.path.join(Root, '**', '*.log'),
#                                   recursive=True)]
# t = time.time() - t
# print("glob.glob, unsorted: %.4fs, %d files found" % (t, len(globfiles)))

# n, t = 0, time.time()
# globfiles = [f for f in sorted(glob.glob(os.path.join(Root, '**', '*.log'),
#                                          recursive=True),
#                                key=os.path.getmtime)]
# t = time.time() - t
# print("glob.glob, sorted: %.4fs, %d files found" % (t, len(globfiles)))

In [None]:
# On the FastSSD, sorted glob is about half as fast as os.walk, and unsorted glob is still substantially slower than walk!
# glob.glob: 0.7773s, 1206 files found
# os.walk: 0.3810s, 1206 files found

The notebook might not be running locally on our machines, but on Binder.
There, the user has no access to the log files, so we fall back to a local copy of them.
This also means that no reconstructions are available, and we thus cannot count them.
We thus set a variable which skips looking for parameters related to the reconstructions.

In [None]:
# If we found no log files at all, we are most probably running on Binder
running_on_binder = len(Data) == 0
if running_on_binder:
    # Tell the user that we use the log files from the subfolder in this repository instead
    caveat = (10 * ' -', 'CAVEAT', 10 * ' -')
    print(*caveat)
    for message in ('You are most probably running the notebook on binder.',
                    'And thus do not have access to the log files on the research storage',
                    'We are using a "local" copy of the data in the `logfiles` subfolder',
                    'This gives correct, but possibly outdated results...'):
        print(message)
    print(*caveat)
    # Change root folder
    Root = 'logfiles'
    # Load the log files again, sorted by modification time (oldest first)
    Data['LogFile'] = sorted(glob.glob(os.path.join(Root, '**', '*.log'), recursive=True),
                             key=os.path.getmtime)

In [None]:
# Get all folders
Data['Folder'] = [os.path.dirname(f) for f in Data['LogFile']]

In [None]:
if not running_on_binder:
    # Check for samples which are not yet reconstructed
    for c, row in Data.iterrows():
        # Iterate over every 'proj' folder
        if 'proj' in row.Folder:
            if 'TScopy' not in row.Folder and 'PR' not in row.Folder:
                # If there's nothing with 'rec*' on the same level, then tell us
                if not glob.glob(row.Folder.replace('proj', 'rec')):
                    # print(glob.glob(row.Folder.replace('proj', 'rec')))
                    print('- %s is missing matching reconstructions' % row.LogFile[len(Root) + 1:])

In [None]:
# Keep only the log files of reconstructions:
# drop every log file whose folder does not contain 'rec', and every 'rectmp.log'
unwanted = [index for index, entry in Data.iterrows()
            if 'rec' not in entry.Folder or 'rectmp.log' in entry.LogFile]
Data.drop(unwanted, inplace=True)
# Renumber the remaining rows
Data = Data.reset_index(drop=True)

In [None]:
# Generate some meaningful columns from the path of each log file relative to `Root`:
# the first path component is the fish, the components between fish and file name make up the scan
relative_parts = [logfile[len(Root) + 1:].split(os.sep) for logfile in Data['LogFile']]
Data['Fish'] = [parts[0] for parts in relative_parts]
Data['Scan'] = ['.'.join(parts[1:-1]) for parts in relative_parts]

In [None]:
# # Debug output
# for log in Data['LogFile']:
#     try:
#         (pixelsize(log))
#     except:
#         print(log)

In [None]:
# Get parameters related to scan from logfiles
Data['Voxelsize'] = [pixelsize(log) for log in Data['LogFile']]
Data['Voltage'] = [voltage(log) for log in Data['LogFile']]
Data['Current'] = [current(log) for log in Data['LogFile']]
Data['Filter'] = [whichfilter(log) for log in Data['LogFile']]
Data['Exposuretime'] = [exposuretime(log) for log in Data['LogFile']]
Data['Scanner'] = [scanner(log) for log in Data['LogFile']]
Data['Averaging'] = [averaging(log) for log in Data['LogFile']]
Data['Scan date'] = [scandate(log) for log in Data['LogFile']]
Data['ProjectionSize'] = [projection_size(log) for log in Data['LogFile']]
Data['RotationStep'] = [rotationstep(log) for log in Data['LogFile']]
Data['ThreeSixty'] = [threesixtyscan(log) for log in Data['LogFile']]

In [None]:
# Get parameters related to reconstruction from logfiles
Data['ReconstructionSize'] = [reconstruction_size(log) for log in Data['LogFile']]
Data['Grayvalue'] = [reconstruction_grayvalue(log) for log in Data['LogFile']]
Data['RingartefactCorrection'] = [ringremoval(log) for log in Data['LogFile']]
Data['BeamHardeningCorrection'] = [beamhardening(log) for log in Data['LogFile']]
Data['ROI'] = [region_of_interest(log) for log in Data['LogFile']]
Data['Duration'] = [duration(log) for log in Data['LogFile']]
Data['Stacks'] = [stacks(log) for log in Data['LogFile']]

In [None]:
# Some folders on the iee research storage contain scans done by Kassandra on a SkyScan1273.
# They are not part of this study (we only looked at them to help her), so we exclude them.
from_1273 = [index for index, entry in Data.iterrows() if '1273' in entry.Scanner]
for index in from_1273:
    print('Dropping %s from our dataframe' % Data.at[index, 'LogFile'][len(Root)+1:])
Data.drop(from_1273, inplace=True)
# Renumber the remaining rows
Data = Data.reset_index(drop=True)

In [None]:
# Some folders on the iee research storage contain scans of only teeth, done as a small pilot study.
# They are not part of this study, so we exclude them.
teeth_only = [index for index, entry in Data.iterrows() if 'Teeth' in entry.Folder]
for index in teeth_only:
    print('Dropping %s from our dataframe' % Data.at[index, 'LogFile'][len(Root)+1:])
Data.drop(teeth_only, inplace=True)
# Renumber the remaining rows
Data = Data.reset_index(drop=True)

In [None]:
# Sort dataframe on fishes and scans
Data.sort_values(by=['Fish', 'Scan'], inplace=True)
# Reset dataframe index
Data = Data.reset_index(drop=True)

In [None]:
# How many fishes did we scan?
# We scanned six 'BucketOfFish' so subtract those :)
print('We have %s unique names in our corpus of scans' % (len(Data.Fish.unique()) - 6))
print('We performed %s scans in total' % len(Data.Scan))

In [None]:
# Get the file names of the reconstructions
Data['Reconstructions'] = [[os.path.join(root, name)
                            for root, dirs, files in os.walk(f)
                            for name in files
                            if 'rec0' in name and name.endswith((".png"))] for f in Data['Folder']]
# Count how many files we have
Data['Number of reconstructions'] = [len(r) for r in Data.Reconstructions]

In [None]:
if not running_on_binder:
    # Let's see if we're missing some data
    for c, row in Data[Data['Number of reconstructions'] == 0].iterrows():
        print('%s/%s: Folder %s does not contain any reconstructions and '
              'will be removed in the next step' % (c + 1,
                                                    len(Data),
                                                    os.path.join(row.Fish, row.Scan)))

In [None]:
# 103761/rec_rereconstruct is a folder where we tried to salvage a scan where the sample holder touched the source
# MA31/moved_rec/ is a folder where the fish moved during the acquisition
# MA31/stuck_rec/ is a folder where we've lost air pressure in the building and the stage got stuck

In [None]:
print('We have %s folders in total' % (len(Data)))
if not running_on_binder:
    # Drop samples which have not been reconstructed yet
    # Based on https://stackoverflow.com/a/13851602
    # for c, row in Data.iterrows():
    #     if not row['Number of reconstructions']:
    #         print('%s contains no PNG files, we might be currently reconstructing it' % row.Folder)
    Data = Data[Data['Number of reconstructions'] > 0]
    Data.reset_index(drop=True, inplace=True)
    print('Of which %s folders do contain reconstructions' % (len(Data)))

In [None]:
Data['Total Duration'] = [st * stk for st, stk in zip(Data['Duration'], Data['Stacks'])]

In [None]:
# Show five smallest voxelsizes and scans
for c, vs in enumerate(sorted(Data.Voxelsize.unique())[:5]):
    print('-----vs: %s-----' % vs)
    print(Data[Data.Voxelsize == vs][['Fish', 'Scan', 'Voxelsize']])

In [None]:
# Show five largest voxelsizes and scans
for c, vs in enumerate(sorted(Data.Voxelsize.unique())[-5:]):
    print('-----vs: %s-----' % vs)
    print(Data[Data.Voxelsize == vs][['Fish', 'Scan', 'Voxelsize']])

In [None]:
Data.Filter.unique()

In [None]:
sorted(Data.Voltage.unique())

In [None]:
sorted(Data.Current.unique())

In [None]:
sorted(Data.RingartefactCorrection.unique())

In [None]:
sorted(Data.BeamHardeningCorrection.unique())

In [None]:
# Generate a text (markdown) file for each scan, in which Mikki and I can note what's going on with the fish
# The file is placed in the parent directory of the rec-folder and is named '<Fish>.<Scan>.md'
# Generate filename
for c, row in Data.iterrows():
    Data.at[c, 'CommentFile'] = os.path.join(os.path.dirname(row.Folder),
                                             row.Fish + '.' + row.Scan + '.md')
# Create actual file on disk
for c, row in Data.iterrows():
    # Only do this if the file does not already exist, so existing comments are never overwritten
    if not os.path.exists(row.CommentFile):
        with open(row.CommentFile, 'w', encoding='utf-8') as f:
            # Write a small header with fish, scan, scan date, scanner and (rounded) voxel size
            f.write('# Fish %s, Scan %s\n\n' % (row.Fish, row.Scan))
            f.write('This fish was scanned on %s on the %s, with a voxel size of %s μm.\n\n'
                    % (row['Scan date'], row.Scanner, numpy.round(row.Voxelsize, 2)))
            # The actual comments are added by hand below this heading
            f.write('## Comments')

In [None]:
# Get an overview of the total scanning time, overall and per scanner
# Nice output based on https://stackoverflow.com/a/8907407/323100
total_seconds = int(Data['Total Duration'].sum().total_seconds())
hours, remainder = divmod(total_seconds, 60 * 60)
minutes, seconds = divmod(remainder, 60)
print('In total, we scanned for %s hours and %s minutes' % (hours, minutes))
for machine in Data['Scanner'].unique():
    scans_on_machine = Data[Data['Scanner'] == machine]
    # Sum up the same 'Total Duration' column as above, but only for the scans of this machine
    total_seconds = int(scans_on_machine['Total Duration'].sum().total_seconds())
    hours, remainder = divmod(total_seconds, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    print('\t - Of these, we scanned %s hours and %s minutes on the %s,'
          ' for %s scans' % (hours,
                             minutes,
                             machine,
                             len(scans_on_machine)))

In [None]:
# We scanned six 'buckets of fish', so subtract those :)
print('We scanned %0.f fishes' % (len(Data.Fish.unique()) - 6))

In [None]:
print('We did a total of %s scans' % len(Data))

In [None]:
print('We perfomed %s scans with "head" in their folder name' % len(Data[Data['Scan'].str.contains('head')]))

In [None]:
Data[['Fish', 'Scan', 'LogFile',
      'Voxelsize', 'Scanner',
      'Scan date', 'ProjectionSize',
      'ThreeSixty', 'RotationStep', 'Averaging',
      'Duration', 'Stacks', 'Total Duration']].to_excel(os.path.join(OutPutDir, 'Details.xlsx'))
print('Saved XLS sheet with some scanning details to', os.path.join(OutPutDir, 'Details.xlsx'))

In [None]:
if not running_on_binder:
    # Also save the scanning details next to the data itself
    details_file = os.path.join(Root, 'Details.xlsx')
    Data[['Fish', 'Scan',
          'Voxelsize', 'Scanner',
          'Scan date', 'ProjectionSize',
          'ThreeSixty', 'RotationStep', 'Averaging',
          'Duration', 'Stacks', 'Total Duration']].to_excel(details_file)
    # Only report the file if we actually wrote it
    print('Saved XLS sheet with some scanning details to', details_file)

In [None]:
# Save 'data' file for manuscript: github.com/habi/eawag-manuscript
# Since the manuscript is in a subfolder, we can simply write the output there
if not running_on_binder:
    manuscript_file = os.path.join('manuscript', 'content', 'data', 'ScanningDetails.csv')
    # The `header` list renames the columns for the CSV file, so it has to match the order of the selected columns
    Data[['Fish', 'Scan', 'Scanner', 'Scan date',
          'Voxelsize', 'Voltage', 'Current', 'Filter', 'Exposuretime', 'Averaging',
          'ThreeSixty', 'RotationStep', 'ProjectionSize', 'Duration', 'Stacks', 'Total Duration',
          'RingartefactCorrection', 'BeamHardeningCorrection', 'Grayvalue',
          ]].to_csv(manuscript_file,
                    index=False,
                    header=['Fish', 'Scan', 'Scanner', 'Scan date',
                            'Voxelsize [μm]', 'Source voltage [kV]', 'Source current [μA]',
                            'Filter', 'Exposure time [ms]', 'Frame averaging', '360° scan',
                            'Rotation step [°]', 'Projection size', 'Scan duration [s]', 'Stacked scans', 'Total scan duration [s]',
                            'Ring removal correction', 'Beam hardening correction', 'Gray value mapping'])
    # Only report the file if we actually wrote it
    print('Saved CSV file with all relevant scanning and reconstruction parameters to',
          manuscript_file,
          'for using as supplementary material in the manuscript')

In [None]:
if not running_on_binder:
    # Read Mikkis datafile
    MikkisFile = sorted(glob.glob(os.path.join(Root, 'X_ArchiveFiles', '*CTscanFishList.xlsx')))[0]
    # Read excel file and use the first column as index
    print('Reading in %s' % MikkisFile)
    DataMikki = pandas.read_excel(MikkisFile)

In [None]:
if not running_on_binder:
    DataMikki.head()

In [None]:
DataMikki

In [None]:
# Find the fish we look at and display all the info we know about it
# Set a substring you're looking for to the variable below
# In which jar can we find it?
fish = '104061'

In [None]:
if not running_on_binder:
    # In which jar should it be/go?
    foundfishes = 0
    for d, row in DataMikki.iterrows():
        if (str(fish).lower() in str(row.Fishec).lower()) \
        or (str(fish).lower() in str(row.FieldID).lower()) \
        or (str(fish).lower() in str(row.OtherID).lower()) \
        or (str(fish).lower() in str(row.ReplacementID).lower()):
            foundfishes = (row.Fishec, row.FieldID, row.OtherID, row.ReplacementID)
            # remove nan from the list of hits
            foundfishes = [str(x).lower() for x in foundfishes if not pandas.isnull(x)]
            print('*%s*: The fish ' % fish, end='')
            if len(foundfishes) > 1:
                for found in foundfishes:
                    print(found.upper(), end='/')
            else:
                print(foundfishes[0].upper(), end='')
            print(' should now go in jar "length=%s cm" (%s))' % (row['Length(cm)'],
                                                                  row['TemporaryJar']))
    if not foundfishes:
        print('*%s*: Nothing found in %s' % (fish, MikkisFile))

In [None]:
if not running_on_binder:
    # Do we have something from this fish on disk?
    ondisk = glob.glob(os.path.join(Root, '*%s*' % fish))
    if len(ondisk):
        for found in ondisk:
            print('*%s*: Found on disk in %s' % (fish, found))
            foundondisk = 1
    else:
        print('*%s*: Nothing found in %s' % (fish, Root))
        foundondisk = 0

In [None]:
if not running_on_binder:
    # Did we scan it already?
    found = 0
    for c, row in Data.iterrows():
        if fish in row.Fish:
            print('*%s*: Sample %s/%s was scanned on %s' % (fish, row['Fish'], row['Scan'], row['Scan date']))
            found = 1
    if not found:
        if foundondisk:
            print('*%s*: We have a folder (%s) for this sample, but nothing in the dataframe, so it probably is all good' % (fish, ondisk[0]))
            print('Check the folder to be shure')
        else:
            print('*%s*: Nothing about this sample is found in our dataframe' % fish)

In [None]:
# Can we find it in FullHeadList.txt?
def findinFullHeadList(sample):
    '''
    Look for the sample in the FullHeadList.txt file.
    Every line of the first file matching `FullHeadList.*` in `Root`
    which contains `sample` (as a string) is printed.
    Returns None if at least one line matched,
    otherwise a message stating that nothing was found.
    '''
    fullheadlist = glob.glob(os.path.join(Root, 'FullHeadList.*'))[0]
    found = False
    # Read the list explicitly as UTF-8, independent of the locale of the machine we run on
    with open(fullheadlist, 'r', encoding='utf-8') as f:
        for line in f:
            if str(sample) in line:
                print(line.strip())
                found = True
    if found:
        return None
    return '*%s*: Nothing found in %s' % (sample, fullheadlist)

In [None]:
if not running_on_binder:
    # The function prints the hits itself, but only *returns* the message if nothing was found.
    # A bare expression inside an `if` is not displayed by the notebook, so print the message explicitly.
    nothing_found = findinFullHeadList(fish)
    if nothing_found:
        print(nothing_found)

In [None]:
if not running_on_binder:
    # Do we need to rescan this fish
    # Find all relevant comment files
    commentfiles = glob.glob(os.path.join(Root, '*%s*' % fish, '**', '*.md'), recursive=True)
    print('We found these comment files in our dataframe')
    for c, row in Data.iterrows():
        if fish in row.Fish:
            print('\t-', row.CommentFile)
            found = 1
    print(80 * '-')
    if len(commentfiles):
        for commentfile in commentfiles:
            print('-', commentfile)
            print(10 * '-')
            with open(commentfile, 'r', encoding='utf-8') as file:
                for line in file:
                    print(line.strip())
                    if 'rescan' in line:
                        print('BEEEEP!')
            print(80 * '-')

60 of the fishes need complete head scans.
Let's try to go through Mikki's/Kassandra's list and see how far we have progressed through that list.

In [None]:
if not running_on_binder:
    # Read in full head list, go through all the scans we already did and see what needs to be done
    fullheadlist = glob.glob(os.path.join(Root, '*Head*.txt'))[0]
    HeadsToBeScanned = []
    with open(fullheadlist, 'r', encoding='utf-8') as file:
        headdone = False
        for ln, line in enumerate(file):
            if not line.strip():  # skip empty lines
                continue
            # The first 'item' on the line should be the fish ID
            fish = line.strip().split()[0].replace(',', '').upper()
            # Let's ignore some lines which don't start with a fish ID
            # The set-join here removes duplicate characters from the string (e.g. =====, !! and ::)
            if len(''.join(set(fish))) <= 2:
                continue
            # Decide for each fish on its own: the head is done if *any* of its scans is a head-scan.
            # A fish without any scan in the dataframe thus always counts as 'not done',
            # and a later non-head scan of the same fish does not undo an earlier head-scan.
            headdone = any('head' in scan for scan in Data[Data.Fish == fish].Scan)
            if not headdone:
                print('%s is missing a head-scan on disk, but is found on line %s of the full head list' % (fish, ln + 1))
                HeadsToBeScanned.append(fish)

In [None]:
if not running_on_binder:
    # Fish 10448 can be ignored because we did another scan after the head-scan, so we reset "headdone" in the loop above
    # We could probably do it in a more clever way, but already spent too much time on this part :)
    try:
        HeadsToBeScanned.remove('10448')
        # HeadsToBeScanned.remove('105515')
    except ValueError:
        # Nothing to see here, pass along
        pass

In [None]:
if not running_on_binder:
    for fish in HeadsToBeScanned:
        # In which jar should we look for the fishes we still need to scan the head of?
        foundfishes = 0
        for d, row in DataMikki.iterrows():
            if (str(fish).lower() in str(row.Fishec).lower()) \
            or (str(fish).lower() in str(row.FieldID).lower()) \
            or (str(fish).lower() in str(row.OtherID).lower()) \
            or (str(fish).lower() in str(row.ReplacementID).lower()):
                foundfishes = (row.Fishec, row.FieldID, row.OtherID, row.ReplacementID)
                # remove nan from the list of hits
                foundfishes = [str(x).lower() for x in foundfishes if not pandas.isnull(x)]
                print('*%s*: A fish called ' % fish, end='')
                if len(foundfishes) > 1:
                    for found in foundfishes:
                        print(found.upper(), end='/')
                else:
                    print(foundfishes[0].upper(), end='')
                print(' should be found in jar "length=%s cm" (%s))' % (row['Length(cm)'],
                                                                        row['TemporaryJar']))
        if not foundfishes:
            print('*%s*: Nothing found in %s' % (fish, MikkisFile))

In [None]:
# Some of the reconstructions need to be looked at?
# Mikki wrote something about this into the files.
# Get a list of *all* comment files
CommentFiles = glob.glob(os.path.join(Root, '**', '*.md'), recursive=True)

In [None]:
# Print every line of every comment file which mentions one of the keywords below
keywords = ('Mikki', 'ML', 'realign')
print('Going through all the %s comments files we find' % len(CommentFiles))
for c, cf in enumerate(CommentFiles):
    with open(cf, 'r', encoding='utf-8') as file:
        for line in file:
            if any(keyword in line for keyword in keywords):
                # Prefix the line with a counter and the path of the file, relative to `Root`
                print('%03s/%s: %s: %s' % (c, len(CommentFiles), cf[len(Root) + 1:], line.strip()))

In [None]:
Data.sort_values(['Scan date'], ascending=False, inplace=True)