In [1]:
import multiprocessing as mp
import re
import traceback

from datetime import datetime
from glob import glob
from zipfile import BadZipFile, BadZipfile

import numpy as np

In [2]:
# Every .npz archive in the working directory, in lexicographic order.
filenames = sorted(glob('*.npz'))

In [3]:
# Number of simulation result files expected in the working directory.
number_simulations = 500


def prettify(integer):
    """Return ``integer`` as a string zero-padded to the width of ``number_simulations``."""
    return str(integer).zfill(len(str(number_simulations)))


simulation_numbers = list(range(1, number_simulations+1))

# File names that should be present, one per simulation number.
intended_filenames = ['CECNBDM_results.{}.npz'.format(prettify(simulation_number))
                      for simulation_number in simulation_numbers]

# Compare explicitly instead of using ``assert`` for control flow: assert
# statements are removed when Python runs with -O, which would silently
# disable this check.
if filenames != intended_filenames:
    with open('../error_logs/file_status.log', 'a') as file_pointer:
        timestamp = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S')
        file_pointer.write('Some simulations are missing. timestamp={}\n'.format(timestamp))

In [4]:
def check_file_status(filename):
    """Validate one simulation result archive and append the outcome to its log.

    The file number is extracted from ``filename``, which must contain
    ``CECNBDM_results.<digits>.npz``. The outcome is appended to
    ``../notebook_logs/file_status.<digits>.log``. The archive passes when its
    only variable is ``droplets`` and that array has shape ``(15000000, 91)``.

    Parameters
    ----------
    filename : str
        Name (or path) of the ``.npz`` archive to check.

    Returns
    -------
    None
        Results are reported only through the log file.

    Raises
    ------
    IndexError
        If ``filename`` does not match the expected naming pattern.

    Notes
    -----
    ``FileNotFoundError``, ``BadZipFile`` and ``AssertionError`` raised while
    checking are written to the log (with traceback) instead of propagating.
    """
    filenumber = re.findall(r'CECNBDM_results\.([0-9]+)\.npz', filename)[0]

    with open('../notebook_logs/file_status.{}.log'.format(filenumber), 'a') as file_pointer:

        def write_timestamp():
            # Every logged outcome ends with the time at which it was recorded.
            timestamp = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S')
            file_pointer.write('timestamp={}\n'.format(timestamp))

        def write_failure(headline, error):
            # Must be called from inside an ``except`` block so that
            # traceback.format_exc() sees the exception being handled.
            file_pointer.write(headline)
            file_pointer.write('{}\n'.format(error))
            file_pointer.write(traceback.format_exc())
            write_timestamp()

        try:
            # The context manager closes the archive's file handle as soon as
            # the array has been read, even if a check fails.
            with np.load(filename) as npzfile:
                assert npzfile.files == ['droplets'], 'File does not have expected variables.'
                file_pointer.write('File #{} has expected variables.\n'.format(filenumber))
                droplets = npzfile['droplets']
            assert droplets.shape == (15000000, 91), 'Contents of file do not have expected shape.'
            file_pointer.write('Contents of file #{} have expected shape.\n'.format(filenumber))
            file_pointer.write('All checks for file #{} passed.\n'.format(filenumber))
            write_timestamp()

        except (FileNotFoundError, BadZipFile) as error_message:  # file does not exist or is corrupted
            write_failure('File #{} does not exist or is corrupted.\n'.format(filenumber),
                          error_message)

        except AssertionError as error_message:
            write_failure('File #{} has failed some checks.\n'.format(filenumber),
                          error_message)

In [5]:
%%time
# Run check_file_status over every file using 40 worker processes.
try:
    # The ``with`` block terminates the workers if map() raises, so a failure
    # does not leave 40 processes running; on success the pool is still shut
    # down gracefully with close() + join() before the block exits.
    with mp.Pool(40) as pool:
        pool.map(check_file_status, filenames)
        pool.close()
        pool.join()
except RuntimeError as error_message:
    # Record the failure in its own timestamped file under ../error_logs.
    timestamp_filename = datetime.now().strftime('%m-%d-%Y.at.%H_%M_%S.log')
    with open('../error_logs/{}'.format(timestamp_filename), 'a') as file_pointer:
        file_pointer.write(str(error_message))
        file_pointer.write(traceback.format_exc())

CPU times: user 246 ms, sys: 623 ms, total: 869 ms
Wall time: 15min 19s
