In [1]:
import platform
import os
import glob
import fnmatch
import pandas
from tqdm import notebook

In [2]:
# Show the platform identifier string of the machine this notebook runs on
platform.platform()

'Windows-7-6.1.7601-SP1'

All the data is on `\\resstore.unibe.ch\ana_rs_myct\`

In [3]:
# Pick the mount point of the research storage depending on the operating system.
# platform.system() returns exactly 'Linux', 'Windows' or 'Darwin', while the
# string from platform.platform() starts with 'Linux-...' (never 'LINUX') and
# with 'macOS-...' on recent Python versions, so substring tests on it fail.
system = platform.system()
if system == 'Linux':
    ResStore = os.path.join(os.sep, 'home', 'habi', 'resstorage')
elif system == 'Windows':
    ResStore = os.path.join('R://')
elif system == 'Darwin':
    ResStore = os.path.join('/Volumes/ana_rs_myct', 'data')
else:
    # Fail with a clear message instead of a NameError further down
    raise RuntimeError('Do not know where the research storage is mounted on %s' % platform.platform())
print('We are loading all the data from %s' % ResStore)

We are loading all the data from R://


In [4]:
# The list of all log files is cached in 'alllogfiles.txt' (one path per line),
# since collecting it from the research storage is slow.
if not os.path.isfile('alllogfiles.txt'):
    # No cache yet: get *all* logfiles from disk
    # https://stackoverflow.com/a/14798263/323100
    print('Generating a list of all found logfiles found on the research storage')
    # This takes at least half an hour..
    import datetime
    print('Started looking at %s' % datetime.datetime.now())
    searchpattern = os.path.join(ResStore, 'Archiv_Tape', '*', '**', '*.log')
    alllogfiles = glob.glob(searchpattern, recursive=True)
    print('Finished looking at %s' % datetime.datetime.now())
    # Save to disk for next use
    with open('alllogfiles.txt', "w") as cachefile:
        cachefile.writelines(str(entry) + '\n' for entry in alllogfiles)
else:
    # Read the previously saved list from disk
    print('Reading previously saved file with a list of all found logfiles')
    with open('alllogfiles.txt', 'r') as cachefile:
        # https://stackoverflow.com/a/15233379/323100
        alllogfiles = cachefile.read().splitlines()

Reading previously saved file with a list of all found logfiles


In [5]:
#for l in alllogfiles:
#    print(l)

In [6]:
# len(alllogfiles)

In [7]:
# Work on a separate list, so that `alllogfiles` keeps every log file we found
logfiles = list(alllogfiles)

In [8]:
print('We found %s logfiles' % len(logfiles))
# Shell-style patterns (fnmatch) of the log files that we don't want.
# NOTE(review): '_voi_.*.log' has no leading '*', so it can only match a name that
# *starts* with '_voi_.' - confirm whether '*_voi_.*.log' was intended.
# NOTE(review): the patterns with '\\' assume backslash-separated paths - confirm
# that this filter is only ever run on Windows.
findstring = ['*_rec.log', '*_rectmp.log', '*_rec_???_.log', '*_rec_???.log', '*_rec_???_voi_.log',
              '*voi__voi_.log',  '_voi_.*.log', '*.ctan.log', '*.batman.log',
              '*\\STEPanizerizer.log', '*\\settings.log', '*\\example.log',
              '*\\rec\\*', '*\\VOI\\*']
# Build a new list with only the names that match none of the patterns
# (instead of removing items from the list while iterating over it)
logfiles = [name for name in logfiles
            if not any(fnmatch.fnmatch(name, pattern) for pattern in findstring)]
print('After removing all')
for pattern in findstring:
    print("\t- '%s'" % pattern)
print('files, we have %s logfiles left to work with' % len(logfiles))

We found 5035 logfiles
After removing all
	- '*_rec.log'
	- '*_rectmp.log'
	- '*_rec_???_.log'
	- '*_rec_???.log'
	- '*_rec_???_voi_.log'
	- '*voi__voi_.log'
	- '_voi_.*.log'
	- '*.ctan.log'
	- '*.batman.log'
	- '*\STEPanizerizer.log'
	- '*\settings.log'
	- '*\example.log'
	- '*\rec\*'
	- '*\VOI\*'
files, we have 3051 logfiles left to work with


In [9]:
# Show every remaining log file that still has 'rec' somewhere in its path
leftovers = [name for name in logfiles if 'rec' in name]
for name in leftovers:
    print(name)

R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC02\proj\rec02~00.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC02\proj\rec02.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC02\proj\rec02~02.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC02\proj\rec02~01.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC03\proj\rec03.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC03\proj\rec03~01.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC03\proj\rec03~00.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC03\proj\rec03~02.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC01\proj\rec01~01.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC01\proj\rec01~00.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC01\proj\rec01~02.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Fishes\REC01\proj\rec01.log
R://Archiv_Tape\Zebra-Fish_Ines\OrganVolume\Thorax\REC02\proj\rec02~00.log
R://Archiv_Tape\Zebra-Fish_Ines\Or

In [10]:
def get_numberofprojectionsfromlog(logfile, verbose=False):
    """
    Get the number of projections stated in the logfile.

    Looks for lines containing both 'Number' and 'f Files' and reads the
    integer after the '='. If several lines match, the last one wins.

    logfile: path of the log file to read
    verbose: also print what was found

    Returns the number as int. If no such line is found, a message is printed
    and an empty tuple is returned (the sentinel this function always used).
    """
    numrec = None
    with open(logfile, 'r') as f:
        for line in f:
            # Sometimes Bruker writes 'of Files', sometimes 'Of Files'...
            # Both substrings have to be tested explicitly: `'Number' and 'f Files' in line`
            # only tests the second one, since a non-empty string is always true.
            if 'Number' in line and 'f Files' in line:
                try:
                    numrec = int(line.split('=')[1])
                except (IndexError, ValueError):
                    # Matching line without a usable '=<integer>' part, keep looking
                    continue
    if numrec is None:
        print('No "number of files" found in %s' % logfile)
        return ()
    if verbose:
        print('In %s we found that there should be %s projections' % (logfile, numrec))
    return numrec

In [11]:
def get_numberofprojectionsfromdir(logfile, verbose=False):
    """
    Get the number of projections from the folder the logfile is in.

    Only files named 'prefix' + NUMERALS + '.tif' are counted. Otherwise we
    would also get the '_arc*.tif' and '_pp*.tif' files. The prefix is read
    from the logfile (last line containing both 'Filen' and 'fix').

    logfile: path of the log file; its directory is searched
    verbose: also print what is being looked for and what was found

    Returns the number of matching files as int. If the logfile contains no
    prefix, a message is printed and an empty tuple is returned.
    """
    import re  # for regex searching
    folder = os.path.dirname(logfile) or os.curdir
    if verbose:
        print('Based on %s' % logfile)
        print('Checking number of TIF files in %s' % folder)
    prefix = ''
    with open(logfile, 'r') as f:
        for line in f:
            # Get the prefix of the image files, which is handy for oversize scans
            if 'Filen' in line and 'fix' in line:
                # Everything after the first '=' (empty if there is no '=')
                prefix = line.partition('=')[2].strip()
    if not prefix:
        print('No "prefix" found in %s' % logfile)
        return ()
    if verbose:
        print('We are only looking for files with %sNUMERALS.tif' % prefix)
    # The prefix is escaped, since it is plain text and not a regular expression
    # ('.', '+', '(' and the like are common in sample names), and the whole file
    # name has to match, hence excluding prefix_pp*.tif and prefix_ar*.tif as well
    # as files that merely contain the pattern somewhere in their name.
    wanted = re.compile(re.escape(prefix) + r'[0-9]*\.tif')
    tiffiles = [name for name in os.listdir(folder) if wanted.fullmatch(name)]
    if verbose:
        print('And found %s such files' % len(tiffiles))
    return len(tiffiles)

In [12]:
def get_listofprojections(logfile, verbose=False):
    """
    Get the actual projection names from the folder the logfile is in.

    Only files named 'prefix' + NUMERALS + '.tif' are listed, where the prefix
    is read from the logfile (last line containing both 'Filen' and 'fix').

    logfile: path of the log file; its directory is searched
    verbose: also print what is being looked for and what was found

    Returns a sorted list with the full paths of the matching files. If the
    logfile contains no prefix, a message is printed and an empty tuple is
    returned.
    """
    import re  # for regex searching
    folder = os.path.dirname(logfile) or os.curdir
    if verbose:
        print('Based on %s' % logfile)
        print('Checking number of TIF files in %s' % folder)
    prefix = ''
    with open(logfile, 'r') as f:
        for line in f:
            # Get the prefix of the image files, which is handy for oversize scans.
            # Same test ('Filen' and 'fix') as the other functions reading the prefix.
            if 'Filen' in line and 'fix' in line:
                # Everything after the first '=' (empty if there is no '=')
                prefix = line.partition('=')[2].strip()
    if not prefix:
        print('No "prefix" found in %s' % logfile)
        return ()
    if verbose:
        print('We are only looking for files with %sNUMERALS.tif' % prefix)
    # The prefix is escaped, since it is plain text and not a regular expression,
    # and the whole file name has to match, hence excluding prefix_pp*.tif and
    # prefix_ar*.tif as well as files that merely contain the pattern.
    wanted = re.compile(re.escape(prefix) + r'[0-9]*\.tif')
    # Sorted, so that the result does not depend on the order os.listdir() uses
    tiffiles = [os.path.join(folder, name) for name in sorted(os.listdir(folder)) if wanted.fullmatch(name)]
    if verbose:
        print('And found %s such files' % len(tiffiles))
    return tiffiles

In [13]:
#for l in logfiles:
#    print(l)
#    print(get_numberofprojectionsfromdir(l),
#          get_numberofprojectionsfromlog(l))

In [14]:
# Start with an empty dataframe, in which we collect everything we need
Data = pandas.DataFrame()

In [15]:
# Put the (filtered) list of log files from above into the dataframe, one row per log file
Data['LogFile'] = logfiles

In [16]:
# Folder of each log file, and the name of the innermost folder alone
Data['Folder'] = list(map(os.path.dirname, Data['LogFile']))
Data['LastFolder'] = list(map(os.path.basename, Data['Folder']))

In [17]:
#for f in Data['LogFile']:
#    get_numberofprojectionsfromlog(f)

In [18]:
# TEMPORARY
# logfiles = [l for l in logfiles if not fnmatch.fnmatch(l, '*Ganz_Mumifiziert_18um_Cu0*')]

In [19]:
# Get the number of projections as stated in the log file
Data['NumProjFromLog'] = [get_numberofprojectionsfromlog(f) for f in Data['LogFile']]
#for f in logfiles:
#    get_numberofprojectionsfromlog(f, verbose=False)

In [20]:
# Get the number of projections that are actually present in the directory of each log file
Data['NumProjFromDirectory'] = [get_numberofprojectionsfromdir(f) for f in Data['LogFile']]
#for f in logfiles:
#    get_listofprojections(f, verbose=False)

In [21]:
def get_machine(logfile, verbose=False):
    """
    Get the machine name from the logfile.

    Reads the text after the '=' of the last line containing 'Scanner='.

    logfile: path of the log file to read
    verbose: also print what was found

    Returns the name as str. If no such line is found, a message is printed
    and an empty tuple is returned, so that callers can still use
    `'1172' in result` without a type check.
    """
    machine = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Scanner=' in line:
                machine = line.split('=')[1].strip()
    if machine is None:
        # Handled before any use of `machine`, also when verbose
        print('No "Scanner=" found in %s' % logfile)
        return ()
    if verbose:
        print('In %s we found that the scan was done on the %s' % (logfile, machine))
    return machine

In [22]:
# It seems that for (most of the) scans with the 1172 the NumProj in the log file is one more than the files on disk.
# Let's see if that is the case by extracting the machine from the logfile and checking if NumProjDir+1 is the same as NumProjLog
Data['Machine'] = [get_machine(f) for f in Data['LogFile']]

In [23]:
def get_scandate(logfile, verbose=False):
    """
    Get the time and date of the scan from the logfile.

    Parses the text after the '=' of the last line containing
    'Study Date and Time' with pandas.to_datetime().

    logfile: path of the log file to read
    verbose: also print what was found

    Returns what pandas.to_datetime() made of the text. If no such line is
    found, a message is printed and an empty tuple is returned.
    """
    scandate = None
    with open(logfile, 'r') as f:
        for line in f:
            if 'Study Date and Time' in line:
                # Everything after the first '=', without surrounding whitespace
                # (this also gets rid of a '\r' left over from Windows line endings)
                stamp = line.partition('=')[2].strip()
                try:
                    scandate = pandas.to_datetime(stamp)
                except Exception:
                    # Some time the time is written with HHh:MMm:SSs.
                    # We get rid of the denominators with some string stripping and replacing
                    scandate = pandas.to_datetime(stamp.rstrip('s').replace('m:', ':').replace('h:', ':'))
    if scandate is None:
        # Handled before any use of `scandate`, also when verbose
        print('No "Study Date and Time=" found in %s' % logfile)
        return ()
    if verbose:
        print('In %s we found that the scan was done on %s' % (logfile, scandate))
    return scandate

In [24]:
# Get the date and time of each scan from its log file
Data['ScanDate'] = [get_scandate(f) for f in Data['LogFile']]

In [25]:
# Flag the rows where log file and directory agree on the number of projections
Data['SameSame'] = [fromlog == fromdir
                    for fromlog, fromdir in zip(Data['NumProjFromLog'], Data['NumProjFromDirectory'])]

In [26]:
# Which distinct values occur in the comparison column?
set(Data.SameSame)

{False, True}

In [27]:
# Absolute difference between the number of projections on disk and in the log file
# NOTE(review): this raises a TypeError if one of the two values is not a number - confirm
# that every log file yielded a number in both columns
Data['Difference'] = [abs(ondisk - inlog)
                      for ondisk, inlog in zip(Data['NumProjFromDirectory'], Data['NumProjFromLog'])]

In [28]:
# Difference-check, the old way
# if Data['SameSame'].all():
#     print('All %s found logfiles say that we have as many TIF files as found in their respective folder' % len(Data))
# else:
#     Data['SameSamePlusMinusOne'] = ''
#     for c, row in Data.iterrows():
#         if row.SameSame:
#             Data.at[c,'SameSamePlusMinusOne'] = 'Sa'
#         else:
#             if row.NumProjFromDirectory + 1 == row.NumProjFromLog:
#                 # For the 1172, Bruker sometimes states in the logfile that there's a file more on disk than what we find
#                 # print(row.Machine)
#                 if '1172' in row.Machine:
#                     Data.at[c,'SameSamePlusMinusOne'] = 'Plus'
#                     pass
#                 else:
#                     print('Check %s. The Logfile says we should have %s files, we found %s files' % (row.LogFile,
#                                                                                                  row.NumProjFromLog,
#                                                                                                  row.NumProjFromDirectory))
#             elif row.NumProjFromDirectory - 1 == row.NumProjFromLog:
#                 # For the 1272, Bruker sometimes states in the logfile that there's a file less on disk than what we find
#                 # print(row.Machine)
#                 if '1272' in row.Machine:
#                     Data.at[c,'SameSamePlusMinusOne'] = 'Minus'
#                     pass
#                 else:
#                     print('Check %s. The Logfile says we should have %s files, we found %s files' % (row.LogFile,
#                                                                                                  row.NumProjFromLog,
#                                                                                                  row.NumProjFromDirectory))
#             elif row.NumProjFromDirectory == 0:
#                 pass
#                 #print('Check %s. We found %s files' % (row.LogFile, row.NumProjFromDirectory))
#             else:
#                 pass
#                 #print('Check %s. The Logfile says we should have %s files, we found %s files' % (row.LogFile,
#                 #                                                                                 row.NumProjFromLog,
#                 #                                                                                 row.NumProjFromDirectory))

Check whether the TIFF files are valid images, based on http://www.blog.pythonlibrary.org/2020/02/09/how-to-check-if-a-file-is-a-valid-image-with-python/

In [29]:
def _has_tiff_signature(path):
    """Tell if the file starts with the magic number of a (Big)TIFF file."""
    try:
        with open(path, 'rb') as fh:
            header = fh.read(4)
    except OSError:
        return False
    return header in (b'II*\x00', b'MM\x00*', b'II+\x00', b'MM\x00+')


def checkTIFfile(listoftiffs, subset=True, part=50, verbose=False):
    """
    Check if (a random subset of) the given files are valid, non-empty TIFF images.

    A file passes if it starts with a TIFF signature, can be read with imageio,
    has a first dimension larger than zero and a mean gray value larger than zero.

    listoftiffs: file names to check
    subset: if True, only check a random sample of 1 + ceil(len(listoftiffs) / part)
            files (but never more than there are), drawn without replacement, so
            that no file is checked and counted twice
    part: controls the size of the random sample, see `subset`
    verbose: print type, size and mean of every checked file

    Returns False if `listoftiffs` is empty (kept as is, callers rely on it).
    Otherwise returns a tuple of two lists of the same length: the names of the
    checked files and True/False for each of them.
    """
    import numpy
    howmany = len(listoftiffs)
    if not howmany:
        # There are no files
        return False
    # Imported here, so the empty case above works without it
    import imageio
    if subset:
        # Get a random subset (part chooses it size) of the original, but at least one file (e.g. 1 + numpy.ceil)
        samplesize = min(howmany, int(1 + numpy.ceil(howmany / part)))
        listoftiffs = numpy.random.choice(listoftiffs, size=samplesize, replace=False)
    # Plain strings instead of the numpy strings that numpy.random.choice hands back
    checked = [str(name) for name in listoftiffs]
    allgood = []
    for name in checked:
        # The signature is tested by hand; the imghdr module was removed in Python 3.13
        filetype = 'tiff' if _has_tiff_signature(name) else None
        try:
            img = imageio.imread(name)
            size = numpy.shape(img)
            mean = numpy.mean(img)
        except Exception as problem:
            # An unreadable file is exactly what we are looking for: mark it as
            # bad instead of giving up on all the other files
            if verbose:
                print('%s cannot be read: %s' % (os.path.basename(name), problem))
            allgood.append(False)
            continue
        if verbose:
            print('%s is a %s file with size of %s and a mean of %0.2f' % (os.path.basename(name),
                                                                           filetype,
                                                                           size,
                                                                           mean))
        # Consistencycheck: Is it a TIFF? AND Does it have a dimension? AND Is there some information in the image?
        allgood.append(bool(filetype == 'tiff' and len(size) > 0 and size[0] > 0 and mean > 0))
    return (checked, allgood)

In [30]:
#for c,row in Data.iterrows():
#    print('Checking %s' % row.Folder)
#    checkTIFfile(row.Projections,part=500)

In [31]:
# checkTIFfile(get_listofprojections(Data['LogFile'][3]), part=1111)

In [32]:
# Not as a list comprehension, but so that we get a progress bar
Data['CheckedTIFFs'] = None
Data['CheckedGood'] = None
for c, row in notebook.tqdm(Data.iterrows(),
                            total=len(Data),
                            desc='Checking TIFFs',):
    try:
        projections = get_listofprojections(row['LogFile'])
        if not projections:
            # Nothing to check. Said explicitly, instead of running into the
            # unpacking error caused by the `False` that checkTIFfile returns
            # for an empty list.
            print('Cannot check %s, no projections found' % row.Folder)
            continue
        checkedlist, status = checkTIFfile(projections, part=100)
        Data.at[c,'CheckedTIFFs'] = checkedlist
        Data.at[c,'CheckedGood'] = status
    except Exception as problem:
        # Keep going with the next folder, but say what went wrong
        print('Cannot check %s, something is wrong (%s: %s)' % (row.Folder, type(problem).__name__, problem))

HBox(children=(FloatProgress(value=0.0, description='Checking TIFFs', max=3051.0, style=ProgressStyle(descript…

Cannot check R://Archiv_Tape\Neda\control01_lumbar_1_32um_50kv_nof\proj, something is wrong
Cannot check R://Archiv_Tape\Neda\Control01\lumbar\proj, something is wrong
Cannot check R://Archiv_Tape\Neda\Control01\cervical\proj, something is wrong
Cannot check R://Archiv_Tape\Neda\Control01\cervical\proj, something is wrong
Cannot check R://Archiv_Tape\Rat_Wistar_5wk\Head_Whole_10um\proj, something is wrong
Cannot check R://Archiv_Tape\Lung Smoke\Lung-Smoke_4mth_003-354_4um\proj, something is wrong
Cannot check R://Archiv_Tape\Dental-Implant\Dental-Implant\Dental-Implant_15-007_TO-LP_10um\proj\test, something is wrong
Cannot check R://Archiv_Tape\Helsinki\Helsinki\RM54_2_7um_cu\proj, something is wrong
Cannot check R://Archiv_Tape\Helsinki\Helsinki\overview\MR3267_11_6um_cu\proj, something is wrong
Cannot check R://Archiv_Tape\Helsinki\Helsinki\RM125_2_7um_cu\proj, something is wrong
Cannot check R://Archiv_Tape\Helsinki\Helsinki\RM91_2_7um_cu\proj, something is wrong
Cannot check R://Ar

Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B67-Overview-4-18um\Proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B61-MRT_D24-T24_4-4um\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B51-BB_D16-T6\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B23_MRT_D24-T14_Overview_4-6um\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B07-CTRL_D16-T6\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B68-BB_D24-T14_4-3um\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B54_mrt_D16T06\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B54_mrt_D16T06\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B54_mrt_D16T06\proj, something is wrong
Cannot check R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\B54_mrt_D16T06\p

Cannot check R://Archiv_Tape\Validation-Hindlimb\M07\M7-Lump-0.99um\Proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M07\M7_Plant_1.72um\Proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M07\M7-L-M-TA\M7-left-TA-overview_2-2um\Proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M07\M7-Hindlimb-L_2-8um_\proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M07\Plantaris_Fixed_5um\Proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M13\M13_hindlimb_Overview_Right_2_58um\Proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M13\M13-R-LVEGF_Overview_2-7um\proj, something is wrong
Cannot check R://Archiv_Tape\Validation-Hindlimb\M13\M13-Hindlimb-L-Overview-3-45um\Proj, something is wrong
Cannot check R://Archiv_Tape\SCIP-Flap\SCIP-Flap-Lappen05.1_Scan1\proj, something is wrong
Cannot check R://Archiv_Tape\SCIP-Flap\SCIP-Flap-Lappen05.1_Scan1\proj, something is

Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N120\N120_RightKidney_Erlangen_2-2um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N118\N118_LeftKidney_Erlangen_2-4um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N118\N118_RightKidney_Erlangen_2-19um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N119\N119_LeftKidney_Erlangen_2-6um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N119\N119_RightKidney_Erlangen_2-06um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N114\N114_LeftKidney_Erlangen_2-6um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N114\N114_RigthKidney_Erlangen_2-06um\proj, something is wrong
Cannot check R://Archiv_Tape\Kidney\Kidney\Kidney-Erlangen-2015\N115\N115_LeftKidney_Erlangen_1-9um\proj, something is

Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-190_211\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-190_211\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-185_186\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-185_186\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-239_TG-MI\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-221_TG-MI\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-185_WT-MI\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-191_WT-CTRL\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-167_WT-CTRL\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\Heart_Finland\RKB-219_TG-MI\proj, something is wrong
Cannot check R://Archiv_Tape\Heart_Finland\H

Cannot check R://Archiv_Tape\Straumann\Rosli_Jaw_Jul26_Al1mm_Cu02mm_100kV_15um\proj\Corrected, something is wrong
Cannot check R://Archiv_Tape\Straumann\228531R_Jaw_Cu1mm_18um_360_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\228531R_Jaw_Cu1mm_18um_360_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\229919L\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9um_OFFSET_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9um_OFFSET_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9um_OFFSET_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9um_OFFSET_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9um_OFFSET_\proj, something is wrong
Cannot check R://Archiv_Tape\Straumann\Vreni_Jaw_Jul19_AL1mm-Cu02mm_100kV_9

In [33]:
# Look at five randomly chosen rows
Data.sample(n=5)

Unnamed: 0,LogFile,Folder,LastFolder,NumProjFromLog,NumProjFromDirectory,Machine,ScanDate,SameSame,Difference,CheckedTIFFs,CheckedGood
2464,R://Archiv_Tape\Lung Fibrosis\M242_14_RUL\proj...,R://Archiv_Tape\Lung Fibrosis\M242_14_RUL\proj,proj,1895,1895,SkyScan1272,2017-03-01 16:43:13,True,0,[R://Archiv_Tape\Lung Fibrosis\M242_14_RUL\pro...,"[True, True, True, True, True, True, True, Tru..."
2465,R://Archiv_Tape\Lung Fibrosis\M242_42_RUL\proj...,R://Archiv_Tape\Lung Fibrosis\M242_42_RUL\proj,proj,1895,1895,SkyScan1272,2017-03-03 16:04:17,True,0,[R://Archiv_Tape\Lung Fibrosis\M242_42_RUL\pro...,"[True, True, True, True, True, True, True, Tru..."
2784,R://Archiv_Tape\Rabbit-Grenoble\Rabbit-4-Greno...,R://Archiv_Tape\Rabbit-Grenoble\Rabbit-4-Greno...,proj,583,582,Skyscan1172,2015-04-22 11:55:35,False,1,[R://Archiv_Tape\Rabbit-Grenoble\Rabbit-4-Gren...,"[True, True, True, True, True, True, True]"
2907,R://Archiv_Tape\Liver-Semela\Liver-Semela\ds17...,R://Archiv_Tape\Liver-Semela\Liver-Semela\ds17...,proj,974,974,SkyScan1272,2018-07-31 05:11:31,True,0,[R://Archiv_Tape\Liver-Semela\Liver-Semela\ds1...,"[True, True, True, True, True, True, True, Tru..."
1954,R://Archiv_Tape\Roche\Mouse_Head\5_laser\proj\...,R://Archiv_Tape\Roche\Mouse_Head\5_laser\proj,proj,1991,1990,Skyscan1172,2016-10-12 18:56:08,False,1,[R://Archiv_Tape\Roche\Mouse_Head\5_laser\proj...,"[True, True, True, True, True, True, True, Tru..."


In [36]:
# The dataframe has one row per log file
print('In total we checked %s log files' % len(Data))

In total we checked 3051 log files


In [37]:
# Count the rows per combination of match flag, difference and machine
Data.groupby(['SameSame','Difference','Machine']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LogFile,Folder,LastFolder,NumProjFromLog,NumProjFromDirectory,ScanDate,CheckedTIFFs,CheckedGood
SameSame,Difference,Machine,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,1,SkyScan1272,5,5,5,5,5,5,5,5
False,1,Skyscan1172,1077,1077,1077,1077,1077,1077,1074,1074
False,2,Skyscan1172,1,1,1,1,1,1,1,1
False,14,SkyScan1272,3,3,3,3,3,3,3,3
False,62,SkyScan1273,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
True,0,SkyScan1272,1451,1451,1451,1451,1451,1451,1447,1447
True,0,SkyScan1273,35,35,35,35,35,35,34,34
True,0,SkyScan1275,9,9,9,9,9,9,9,9
True,0,SkyScan2211,1,1,1,1,1,1,1,1


In [38]:
# Count the rows per combination of match flag and difference, over all machines
Data.groupby(['SameSame','Difference']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,LogFile,Folder,LastFolder,NumProjFromLog,NumProjFromDirectory,Machine,ScanDate,CheckedTIFFs,CheckedGood
SameSame,Difference,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,1,1082,1082,1082,1082,1082,1082,1082,1079,1079
False,2,1,1,1,1,1,1,1,1,1
False,14,3,3,3,3,3,3,3,3,3
False,62,1,1,1,1,1,1,1,1,1
False,78,1,1,1,1,1,1,1,1,1
False,...,...,...,...,...,...,...,...,...,...
False,3601,6,6,6,6,6,6,6,0,0
False,3602,2,2,2,2,2,2,2,0,0
False,3979,4,4,4,4,4,4,4,0,0
False,4331,3,3,3,3,3,3,3,0,0


In [90]:
# List every folder in which at least one of the checked TIFFs failed the check
# If there's no output from this cell, then there are no FALSE checked TIFFs :)
for _, entry in Data.iterrows():
    verdicts = entry['CheckedGood']
    if verdicts and False in verdicts:
        print(entry.Folder, verdicts)

In [40]:
# Output checks: list the log files that are off by exactly one projection,
# but come from neither the 1172 nor the 1272
for _, entry in Data.iterrows():
    if entry.Difference == 1:
        if not ('1172' in entry.Machine or '1272' in entry.Machine):
            print(entry.LogFile)
    elif entry.Difference > 1:
        # Larger differences (including folders without any projections)
        # are not reported in this cell
        pass

In [41]:
def doubleckeckdir(logfile, verbose=False):
    """
    See if there's a name mixup, and that's the reason we don't find anything.
    Or the logfile is the first of a series of connected scans...

    logfile: path of the log file; its directory is searched
    verbose: also print what is being looked for and what was found

    Returns
    - an empty tuple (after printing a message) if the logfile contains no prefix
    - the number of files named 'prefix' + NUMERALS + '.tif', if there are any
    - 'Connected' if there are none, but the number of files named
      'prefix~0N' + ZEROS + '.tif' equals the 'Number of connected scans'
      stated in the logfile
    - 'B0rked' if there are none and the logfile states no number of connected scans
    - 0 if there are none and the number of connected scans does not fit
    """
    import re  # for regex searching
    folder = os.path.dirname(logfile) or os.curdir
    if verbose:
        print(80*'-')
        print('Based on %s' % logfile)
        print('Checking number of TIF files in %s' % folder)
    prefix = ''
    connectedscans = None
    # Read both values in one pass over the file
    with open(logfile, 'r') as f:
        for line in f:
            # Get the prefix of the image files, which is handy for oversize scans
            if 'Filen' in line and 'fix' in line:
                prefix = line.partition('=')[2].strip()
            if 'Number of connected scans' in line:
                try:
                    connectedscans = int(line.partition('=')[2].strip())
                except ValueError:
                    # No usable number on this line, keep what we had
                    pass
    if not prefix:
        print('No "prefix" found in %s' % logfile)
        return ()
    if verbose:
        if connectedscans is not None:
            print('This scan is part of a %s-connected scan' % connectedscans)
        print('We are only looking for files with %sNUMERALS.tif' % prefix)
    names = os.listdir(folder)
    # The prefix is escaped, since it is plain text and not a regular expression,
    # and the whole file name has to match, hence excluding prefix_pp*.tif and prefix_ar*.tif
    projection = re.compile(re.escape(prefix) + r'[0-9]*\.tif')
    tiffiles = [os.path.join(folder, name) for name in names if projection.fullmatch(name)]
    if verbose:
        print('We found %s such files' % len(tiffiles))
    if not tiffiles:
        if connectedscans is None:
            # Nothing on disk and nothing to compare with
            if verbose:
                print('Something is wrong with %s' % logfile)
            return 'B0rked'
        # Only the files 'prefix~0N' followed by nothing but zeros
        firstofpart = re.compile(re.escape(prefix) + r'~0[0-9]0*\.tif')
        connectedfiles = [name for name in names if firstofpart.fullmatch(name)]
        if len(connectedfiles) == connectedscans:
            return 'Connected'
    return len(tiffiles)

In [42]:
# Double-check every log file for a connected scan (see doubleckeckdir for the possible results)
Data['CheckConnected'] = [doubleckeckdir(l) for l in Data['LogFile']]

In [43]:
# Look at ten randomly chosen rows
Data.sample(n=10)

Unnamed: 0,LogFile,Folder,LastFolder,NumProjFromLog,NumProjFromDirectory,Machine,ScanDate,SameSame,Difference,CheckedTIFFs,CheckedGood,CheckConnected
449,R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\...,R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble\...,proj,798,797,Skyscan1172,2015-02-19 14:54:38,False,1,[R://Archiv_Tape\Brain-Grenoble\Brain-Grenoble...,"[True, True, True, True, True, True, True, Tru...",797
2327,R://Archiv_Tape\Heart_Finland\Heart_Finland\RK...,R://Archiv_Tape\Heart_Finland\Heart_Finland\RK...,proj,798,0,Skyscan1172,2015-01-30 15:56:29,False,798,,,Connected
1872,R://Archiv_Tape\Zahnmedizin\ZM-Sample4\Test\ZM...,R://Archiv_Tape\Zahnmedizin\ZM-Sample4\Test,Test,358,0,Skyscan1172,2015-09-16 09:28:12,False,358,,,B0rked
792,R://Archiv_Tape\Israel Heads\Israel Heads\B1_h...,R://Archiv_Tape\Israel Heads\Israel Heads\B1_h...,proj,2167,2166,Skyscan1172,2019-05-07 01:00:07,False,1,[R://Archiv_Tape\Israel Heads\Israel Heads\B1_...,"[True, True, True, True, True, True, True, Tru...",2166
1614,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_D9-...,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_D9-...,proj,1328,1327,Skyscan1172,2014-04-01 21:12:45,False,1,[R://Archiv_Tape\Kidney\Kidney\Kidney_Right_D9...,"[True, True, True, True, True, True, True, Tru...",1327
2648,R://Archiv_Tape\Felsenbein Halm\Felsenbein Hal...,R://Archiv_Tape\Felsenbein Halm\Felsenbein Hal...,proj,974,974,SkyScan1272,2017-11-09 22:08:51,True,0,[R://Archiv_Tape\Felsenbein Halm\Felsenbein Ha...,"[True, True, True, True, True, True, True, Tru...",974
1663,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_DS8...,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_DS8...,proj,1328,0,Skyscan1172,2014-03-21 19:15:42,False,1328,,,Connected
1803,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_DS9...,R://Archiv_Tape\Kidney\Kidney\Kidney_Right_DS9...,proj,1328,1327,Skyscan1172,2014-02-27 18:46:53,False,1,[R://Archiv_Tape\Kidney\Kidney\Kidney_Right_DS...,"[True, True, True, True, True, True, True, Tru...",1327
2800,R://Archiv_Tape\Rabbit-Grenoble\Rabbit-1-Greno...,R://Archiv_Tape\Rabbit-Grenoble\Rabbit-1-Greno...,proj,1018,1017,Skyscan1172,2015-04-20 23:17:04,False,1,[R://Archiv_Tape\Rabbit-Grenoble\Rabbit-1-Gren...,"[True, True, True, True, True, True, True, Tru...",1017
998,R://Archiv_Tape\Immunology\Immunology\BumannA_...,R://Archiv_Tape\Immunology\Immunology\BumannA_...,proj2,739,85,Skyscan1172,2014-06-24 18:50:53,False,654,[R://Archiv_Tape\Immunology\Immunology\BumannA...,"[True, True]",85


In [45]:
# Write the dataframe to 'ConsistencyCheck.pkl', so that another notebook can pick it up later
Data.to_pickle('ConsistencyCheck.pkl')

In [48]:
# Save out the complete dataframe as an Excel file, so we can send it to Ruslan and Oleksiy
Data.to_excel('ResearchStorageAll.xlsx')

In [49]:
# Show the first rows of the dataframe
Data.head()

Unnamed: 0,LogFile,Folder,LastFolder,NumProjFromLog,NumProjFromDirectory,Machine,ScanDate,SameSame,Difference,CheckedTIFFs,CheckedGood,CheckConnected
0,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,proj,1947,0,Skyscan1172,2019-04-25 01:26:36,False,1947,,,Connected
1,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,proj,1947,1946,Skyscan1172,2019-04-24 04:36:27,False,1,[R://Archiv_Tape\Neda\control01_lumbar_1_32um_...,"[True, True, True, True, True, True, True, Tru...",1946
2,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,proj,1947,1946,Skyscan1172,2019-04-24 20:14:05,False,1,[R://Archiv_Tape\Neda\control01_lumbar_1_32um_...,"[True, True, True, True, True, True, True, Tru...",1946
3,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,proj,1947,1946,Skyscan1172,2019-04-24 15:01:31,False,1,[R://Archiv_Tape\Neda\control01_lumbar_1_32um_...,"[True, True, True, True, True, True, True, Tru...",1946
4,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,R://Archiv_Tape\Neda\control01_lumbar_1_32um_5...,proj,1947,1946,Skyscan1172,2019-04-25 01:26:36,False,1,[R://Archiv_Tape\Neda\control01_lumbar_1_32um_...,"[True, True, True, True, True, True, True, Tru...",1946


In [135]:
# Only the rows that differ by more than one projection and are not explained by a connected scan.
# These are saved to disk, so we can send them to Ruslan and Oleksiy
needscheck = Data[(Data.CheckConnected!='Connected') & (Data.Difference > 1)]
needscheck.to_excel('ResearchStorageMissing.xlsx')
print('We need to check %s folders, where we found an issue' % len(needscheck))
print('The folders to check have been saved to "ResearchStorageMissing.xlsx"')

We need to check 170 folders, where we found an issue
The folders to check have been saved to "ResearchStorageMissing.xlsx"


In [136]:
# How many TIFFs are 'none'-checked, how many are good?
print('Out of %s log files' % len(Data))
print('\t- %s have no corresponding TIFFs, e.g. were not checked' % len(Data[(Data.CheckedGood.isna())]))
howmanyfolders = 0  # log files for which *every* checked TIFF passed
passedfiles = 0     # number of checked TIFFs belonging to those log files
failedfolders = 0   # log files with at least one checked TIFF that failed
howmanyfiles = 0    # all checked TIFFs, passed or not (used again further down)
for c, row in Data.iterrows():
    status = row['CheckedGood']
    if not status:
        # Nothing was checked for this log file
        continue
    howmanyfiles += len(status)
    # 'all passed' means that there is no False in the list, not merely that there is one True in it
    if all(status):
        howmanyfolders += 1
        passedfiles += len(status)
    else:
        failedfolders += 1
print('\t- %s have (a total of %s) correponding TIFF files, which all passed our TIFF check' % (howmanyfolders, passedfiles))
if failedfolders:
    print('\t- %s have at least one checked TIFF file which failed our TIFF check' % failedfolders)

Out of 3051 log files
	- 446 have no corresponding TIFFs, e.g. were not checked
	- 2605 have (a total of 36821) correponding TIFF files, which all passed our TIFF check


In [124]:
# Check if there are any wrong TIFFs
# The message has to appear as soon as there is one False in the list. Testing for
# 'no True in the list' would only report log files for which *every* file failed.
for c, row in Data.iterrows():
    status = row['CheckedGood']
    if status and False in status:
        print('At least one file failed the check')

In [114]:
# Put the number of checked TIFFs in relation to all the TIFFs we counted in the directories
totaltiffs = Data.NumProjFromDirectory.sum()
checkedpercent = howmanyfiles / totaltiffs * 100
print('Of *all* the %s TIFF files on disk, we checked %0.3f%%, e.g. %s files' % (totaltiffs,
                                                                                 checkedpercent,
                                                                                 howmanyfiles))

Of *all* the 3349934 TIFF files on disk, we checked 1.099%, e.g. 36821 files
