In [1]:
import numpy as np
import os
import re

class AmesTypeError(Exception):
    """Signals that a file's format index (second token of its first line) is not 1001."""

class HeaderError(Exception):
    """Signals that the line right after the reported header does not start with a number."""

class IncompleteRowError(Exception):
    """Signals that at least one data row does not have the expected number of columns."""

def is_number(s):
    """Return True when ``s`` can be converted with ``float()``, else False.

    Anything ``float()`` accepts counts as a number, including strings such
    as ``'nan'`` and ``'inf'``. Values ``float()`` cannot handle at all (for
    example ``None``) yield False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: malformed string; TypeError: unsupported type such as None
        return False
    return True

def _parse_float(token):
    """Return ``token`` converted to float, or None when it is not a valid number."""
    try:
        return float(token)
    except ValueError:
        return None


def parse_ames_1001_file(filename):
    """Parse an Ames 1001-type text file into a dictionary.

    The header is read by fixed position (0-based line indices): line 0 holds
    the header line count followed by the file type, line 6 starts with three
    date tokens, line 9 holds the number of dependent variables, and the last
    header line holds the column labels. Per-column error values come from
    line 11 when line 10 holds several tokens ('condensed' notation), and
    otherwise from the one-value-per-line block starting at line ``9 + n_vars``.

    Data rows are every non-blank line after the header. A non-numeric entry
    in column ``j >= 1`` is replaced by ``err_vals[j - 1]``; ``err_vals`` has
    no entry for the first (independent) column, so a non-numeric value there
    becomes NaN.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    dict
        On success: ``'filename'``, ``'infile_date'`` (three strings, the
        second and third zero-padded to two characters), ``'var_names'``,
        ``'err_vals'`` (1-D float array) and ``'data'`` (2-D float array with
        one row per data line).

        On failure: ``'filename'`` and ``'error_type'``, plus ``'error_data'``
        for every type except ``'unknown'``:

        * ``'ames'`` - file type is not 1001; ``error_data`` is the type found.
        * ``'header'`` - the line at the reported end of the header does not
          start with a number; ``error_data`` holds the reported header
          length and that line.
        * ``'incomplete'`` - some rows do not have ``n_vars`` columns;
          ``error_data`` is a list of ``(first column value, 0-based line
          index)`` tuples.
        * ``'unknown'`` - any other problem (unreadable file, malformed
          header, ...).

    No exception derived from ``Exception`` escapes this function.
    """
    try:
        # read everything up front; the context manager closes the handle even on error
        with open(filename, "r") as f:
            lines = f.readlines()

        # check file type
        first_line = lines[0].split()
        file_type = int(first_line[1])
        if file_type != 1001:
            return {'filename': filename, 'error_type': 'ames', 'error_data': file_type}

        # get list with variable labels (last line of the header)
        num_header_lines = int(first_line[0])
        var_names = lines[num_header_lines - 1].split()

        # get file date, zero-padding single-character second/third tokens
        infile_date = lines[6].split()[0:3]
        for k in (1, 2):
            if len(infile_date[k]) == 1:
                infile_date[k] = '0' + infile_date[k]

        # get the value for each dependent column which indicates an error has occurred
        n_vars = int(lines[9]) + 1  # counting independent variable
        if len(lines[10].split()) != 1:  # ames 'condensed' notation
            # builtin float: the np.float alias no longer exists in current NumPy
            err_vals = np.array(lines[11].split()).astype(float)
        else:
            err_vals = np.array([float(lines[9 + n_vars + i]) for i in range(n_vars - 1)])

        # make sure number of header lines indicated is correct
        if _parse_float(lines[num_header_lines].split()[0]) is None:
            return {'filename': filename, 'error_type': 'header',
                    'error_data': {'num_header_lines_reported': num_header_lines,
                                   'line_at_reported_start': lines[num_header_lines]}}

        # extract numerical data row by row; blank lines are skipped wherever
        # they occur, so the final line is kept when the file has no trailing blank
        rows = []
        incomplete_rows = []
        for line_index, line in enumerate(lines[num_header_lines:], start=num_header_lines):
            tokens = line.split()
            if not tokens:
                continue
            values = [_parse_float(token) for token in tokens]

            if len(values) != n_vars:
                # records tuple of the form: (time var value, line number)
                time_value = np.nan if values[0] is None else values[0]
                incomplete_rows.append((time_value, line_index))
                continue

            # replace string-valued entries with the numerical code from the header;
            # err_vals[0] belongs to data column 1, hence the j - 1
            for j, value in enumerate(values):
                if value is None:
                    values[j] = err_vals[j - 1] if j > 0 else np.nan
            rows.append(values)

        if incomplete_rows:
            return {'filename': filename, 'error_type': 'incomplete', 'error_data': incomplete_rows}

        data = np.array(rows, dtype=float).reshape(len(rows), n_vars)
        return {'filename': filename, 'infile_date': infile_date,
                'var_names': var_names, 'err_vals': err_vals, 'data': data}

    except Exception:
        # anything unexpected is reported, not raised; unlike a bare except,
        # KeyboardInterrupt and SystemExit still propagate
        return {'filename': filename, 'error_type': 'unknown'}


In [2]:
# Parse every .ames file in the working directory and sort the results into
# one list per outcome.
all_good = []
ames_error = []
header_error = []
incomplete_error = []
other_error = []
wierd_ones = []  # not filled by this cell

# route each 'error_type' reported by the parser to the list that collects it
error_bins = {
    'ames': ames_error,
    'header': header_error,
    'incomplete': incomplete_error,
    'unknown': other_error,
}

# sorted() makes the processing order (and so the list positions) reproducible;
# matching on the '.ames' suffix skips other files whose names merely contain
# 'ames', such as the good_ames_files.txt written by this notebook
for filename in sorted(os.listdir('.')):
    if not filename.endswith('.ames'):
        continue
    print(filename) #for progress tracking
    file_data_dict = parse_ames_1001_file(filename)
    if 'error_type' in file_data_dict:
        error_bin = error_bins.get(file_data_dict['error_type'])
        if error_bin is not None:
            error_bin.append(file_data_dict)
    else:
        all_good.append(file_data_dict)
    

HALO-DB_dataset1388_release1_adlr_20140901a_v02.ames
HALO-DB_dataset1389_release1_adlr_20140902a_v02.ames
HALO-DB_dataset1390_release1_adlr_20140906a_v02.ames
HALO-DB_dataset1391_release1_adlr_20140909a_v02.ames
HALO-DB_dataset1392_release1_adlr_20140911a_v02.ames
HALO-DB_dataset1393_release2_adlr_20140916a_v03.ames
HALO-DB_dataset1394_release2_adlr_20140918a_v03.ames
HALO-DB_dataset1395_release1_adlr_20140919a_v02.ames
HALO-DB_dataset1396_release1_adlr_20140921a.ames
HALO-DB_dataset1397_release1_adlr_20140923a.ames
HALO-DB_dataset1398_release1_adlr_20140925a.ames
HALO-DB_dataset1436_release1_adlr_20140927a.ames
HALO-DB_dataset1469_release1_sharc_20140906a.ames
HALO-DB_dataset1470_release1_sharc_20140909a.ames
HALO-DB_dataset1471_release1_sharc_20140911a.ames
HALO-DB_dataset1472_release1_sharc_20140916a.ames
HALO-DB_dataset1473_release1_sharc_20140918a.ames
HALO-DB_dataset1474_release1_sharc_20140919a.ames
HALO-DB_dataset1475_release1_sharc_20140921a.ames
HALO-DB_dataset1476_release1_s

HALO-DB_dataset3350_release3_UHSASA_AC11_20140916_R12.ames
HALO-DB_dataset3351_release3_UHSASA_AC12_20140918_R12.ames
HALO-DB_dataset3352_release3_UHSASA_AC13_20140919_R12.ames
HALO-DB_dataset3353_release3_UHSASA_AC14_20140921_R12.ames
HALO-DB_dataset3354_release3_UHSASA_AC15_20140923_R12.ames
HALO-DB_dataset3355_release3_UHSASA_AC16_20140925_R12.ames
HALO-DB_dataset3356_release3_UHSASA_AC18_20140928_R12.ames
HALO-DB_dataset3357_release3_UHSASA_AC19_20140930_R12.ames
HALO-DB_dataset3358_release3_UHSASA_AC20_20141001_R12.ames
HALO-DB_dataset3359_release3_UHSASA_AC21_20141003_R12.ames
HALO-DB_dataset3360_release3_UHSASA_AC22_20141004_R12.ames
HALO-DB_dataset3361_release1_AC20_20141001_pcasp.ames
HALO-DB_dataset3362_release1_AC08_20140909_pcasp.ames
HALO-DB_dataset3363_release1_AC18_20140928_pcasp.ames
HALO-DB_dataset3364_release1_AC17_20140927_pcasp.ames
HALO-DB_dataset3365_release1_AC16_20140925_pcasp.ames
HALO-DB_dataset3366_release1_AC15_20140923_pcasp.ames
HALO-DB_dataset3367_release

HALO-DB_dataset3623_release3_CAS_DPOL_AC13__19_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3624_release3_CAS_DPOL_AC14__21_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3625_release3_CAS_DPOL_AC15__23_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3626_release4_CAS_DPOL_AC16__25_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3627_release3_CAS_DPOL_AC17__27_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3628_release3_CAS_DPOL_AC18__28_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3629_release3_CAS_DPOL_AC19__30_09_2014_ACRIDICON_v3.ames
HALO-DB_dataset3630_release3_CAS_DPOL_AC20__01_10_2014_ACRIDICON_v3.ames
HALO-DB_dataset3631_release3_CAS_DPOL_AC21__03_10_2014_ACRIDICON_v3.ames
HALO-DB_dataset3632_release3_CAS_DPOL_AC22__04_10_2014_ACRIDICON_v3.ames
HALO-DB_dataset3697_release1_ACRIDICONCHUVA_SMART_AC05_20140901a.ames
HALO-DB_dataset3698_release1_ACRIDICONCHUVA_SMART_20140902a.ames
HALO-DB_dataset3699_release1_ACRIDICONCHUVA_SMART_20140906a.ames
HALO-DB_dataset3700_release1_ACRIDICONCHUVA_SMART_20140909a.ames
HALO-

In [4]:
# FIRST RUN #
# Print the success count, then each error category followed by its filenames.

report_sections = (
    (ames_error, ' files not in ames 1001-type format:'),
    (header_error, ' files with header error:'),
    (incomplete_error, ' files with incomplete data rows:'),
    (other_error, ' files with unknown error:'),
)

print(len(all_good), ' files processed with no error.')
for failed_files, caption in report_sections:
    print()
    print(len(failed_files), caption)
    for record in failed_files:
        print(record['filename'])

321  files processed with no error.

18  files not in ames 1001-type format:
HALO-DB_dataset3649_release1_LPDMFLEXPARTAC5.ames
HALO-DB_dataset3650_release1_LPDMFLEXPARTAC6.ames
HALO-DB_dataset3651_release1_LPDMFLEXPARTAC7.ames
HALO-DB_dataset3652_release1_LPDMFLEXPARTAC8.ames
HALO-DB_dataset3653_release1_LPDMFLEXPARTAC9.ames
HALO-DB_dataset3654_release1_LPDMFLEXPARTAC10.ames
HALO-DB_dataset3655_release1_LPDMFLEXPARTAC11.ames
HALO-DB_dataset3656_release1_LPDMFLEXPARTAC12.ames
HALO-DB_dataset3657_release1_LPDMFLEXPARTAC13.ames
HALO-DB_dataset3658_release1_LPDMFLEXPARTAC14.ames
HALO-DB_dataset3659_release1_LPDMFLEXPARTAC15.ames
HALO-DB_dataset3661_release1_LPDMFLEXPARTAC16.ames
HALO-DB_dataset3662_release1_LPDMFLEXPARTAC17.ames
HALO-DB_dataset3663_release1_LPDMFLEXPARTAC18.ames
HALO-DB_dataset3664_release1_LPDMFLEXPARTAC19.ames
HALO-DB_dataset3665_release1_LPDMFLEXPARTAC20.ames
HALO-DB_dataset3666_release1_LPDMFLEXPARTAC21.ames
HALO-DB_dataset3667_release1_LPDMFLEXPARTAC22.ames

37  files

In [None]:
# NOTES

# In addition to edits to headers, the following in-file dates were corrected:
# dataset 2946: corrected to 09/16
# dataset 3025: corrected to 09/23
# dataset 3468: corrected to 10/01
# dataset 3651: corrected to 09/06

# dataset 3016: data are missing

# datasets 3019-3033: actually have too many columns...

# dataset 3052: timestamp missing line 342; I interpolated to 60740

# dataset 3917: appears to be missing a column

# dataset 3929: second two copies of data were deleted

In [3]:
# SECOND RUN #
# Print the success count, then each error category followed by its filenames.

report_sections = (
    (ames_error, ' files not in ames 1001-type format:'),
    (header_error, ' files with header error:'),
    (incomplete_error, ' files with incomplete data rows:'),
    (other_error, ' files with unknown error:'),
)

print(len(all_good), ' files processed with no error.')
for failed_files, caption in report_sections:
    print()
    print(len(failed_files), caption)
    for record in failed_files:
        print(record['filename'])

351  files processed with no error.

18  files not in ames 1001-type format:
HALO-DB_dataset3649_release1_LPDMFLEXPARTAC5.ames
HALO-DB_dataset3650_release1_LPDMFLEXPARTAC6.ames
HALO-DB_dataset3651_release1_LPDMFLEXPARTAC7.ames
HALO-DB_dataset3652_release1_LPDMFLEXPARTAC8.ames
HALO-DB_dataset3653_release1_LPDMFLEXPARTAC9.ames
HALO-DB_dataset3654_release1_LPDMFLEXPARTAC10.ames
HALO-DB_dataset3655_release1_LPDMFLEXPARTAC11.ames
HALO-DB_dataset3656_release1_LPDMFLEXPARTAC12.ames
HALO-DB_dataset3657_release1_LPDMFLEXPARTAC13.ames
HALO-DB_dataset3658_release1_LPDMFLEXPARTAC14.ames
HALO-DB_dataset3659_release1_LPDMFLEXPARTAC15.ames
HALO-DB_dataset3661_release1_LPDMFLEXPARTAC16.ames
HALO-DB_dataset3662_release1_LPDMFLEXPARTAC17.ames
HALO-DB_dataset3663_release1_LPDMFLEXPARTAC18.ames
HALO-DB_dataset3664_release1_LPDMFLEXPARTAC19.ames
HALO-DB_dataset3665_release1_LPDMFLEXPARTAC20.ames
HALO-DB_dataset3666_release1_LPDMFLEXPARTAC21.ames
HALO-DB_dataset3667_release1_LPDMFLEXPARTAC22.ames

0  files 

In [None]:
# MORE NOTES

# dataset 3421: corrected n_vars to 18

# dataset 3915: lines 6012-6016 missing LWC; I changed to error flag value

# dataset 3916: lines 13966-13983 missing LWC; I changed to error flag value

# dataset 3917: missing a column

# dataset 3922: lines [13690, 13693, 13694, 13782, 13784, 13788, 13789, 13790, 
# 13791, 13792, 13793, 13794, 18929, 18930, 18931, 18932, 18934, 18935, 19527, 
# 19528, 19529, 19531, 19532, 19533, 19535, 19536, 19537] + 1 missing LWC; I changed to error flag value

# dataset 3923: lines [3644, 3645, 3646, 3647, 3741, 3742, 3743, 3744, 3745, 
# 3746, 3747, 3772, 3773, 3826, 3827, 3828, 3829, 3830, 3831, 3832, 3833, 3938, 
# 3939, 3940, 3941, 3942, 3943, 3944, 4887, 4888, 4889, 4890, 4891, 7262, 7263, 
# 7264, 7265] + 1 missing LWC; I changed to error flag value

# dataset 3924: lines [5152, 12132, 12133, 12134, 12135, 13244, 13245, 13246, 
# 13247, 13248, 13249, 13452, 13453, 13454, 13455, 13456, 13457, 13458] + 1 
# missing LWC; I changed to error flag value

# dataset 3925: lines [21269, 21277, 21355, 21471, 21505, 21588, 21701, 21776, 
# 21777, 21778, 21779, 21780, 21781, 21782, 21783, 21784, 21785, 21786, 21787, 
# 21788, 21789, 21790, 21791, 21792, 21793, 21794, 21795, 21796, 21797, 21798, 
# 21800, 21801, 21802, 21803, 21804, 21805, 21806, 21807, 21808, 21809, 21810, 
# 21811, 21812, 21813, 21814, 21815, 21816, 21817, 21818, 21819, 21820, 21821, 
# 21822, 21823, 21824, 21825, 21826, 21827, 21828, 21829, 21830, 21831, 21832, 
# 21833, 21834, 21835, 21836, 21837, 21838, 21880, 21881, 21882, 21962, 21963, 
# 21972, 21973, 21974, 21975, 21984, 21985, 21986, 21987, 21988, 22033, 22034, 
# 22090, 22095, 22114, 22115, 22116, 22117, 22118, 22119, 22120, 22121, 22122, 
# 22123, 22124, 22125, 22126, 22232, 22234, 22253, 22257, 22258, 22260, 22424, 
# 23217, 23237, 23238, 23240, 23241, 23242, 23244, 23245, 23247, 23253, 23256, 
# 23260, 23261, 23275, 23276, 23278, 23279, 23512, 23513, 23515, 23516, 23520, 
# 23521, 23522, 23523, 23524, 23525, 23526, 23528, 23530, 23531, 23532, 23533, 
# 23534, 23535, 23536, 23537, 23539, 23540, 23541, 23544, 23546, 23547, 23548, 
# 23551, 23560, 24258, 24259, 24260, 24264] + 1 missing LWC; I changed to error flag value

# dataset 3927: lines [21853, 21854, 21855, 21856, 21857, 21858, 21859, 21860, 
# 21861, 21862, 21863, 21864, 21865, 21866, 21867, 21868, 21869, 21870, 21871, 
# 21872, 21873, 21874, 21875, 21876, 21877, 21878, 21879, 21880, 21881, 21882, 
# 21883, 21884, 21885, 21886, 21887, 21888, 21889, 21890, 21891, 21892, 21893, 
# 21894, 21895, 21896, 21897, 21898, 21899, 21900, 21901, 21902, 21903, 21904, 
# 21905, 21906, 21907, 21908, 21909, 21910, 21911, 21912, 21913, 21914, 21915, 
# 21916, 21917, 21918, 21919, 21920, 21921, 21922, 21923, 21924, 21925, 21926, 
# 21927, 21928, 21929, 21930, 21931, 21932, 21933, 21934, 21935, 21936] missing LWC; I changed to error flag value

# datasets labeled 'image_analysis' were excluded for third run

# datasets 3649-3667 (non-1001 type) were excluded for third run

In [26]:
# automated correction of incomplete rows, following second run (other ones I did manually but these were too long)
# 'COPY' files subsequently replaced original files (manually).
filenames = ['HALO-DB_dataset3925_release1_Hotwire_AC20__01_10_2014_ACRIDICON.ames', 
             'HALO-DB_dataset3927_release1_Hotwire_AC22__04_10_2014_ACRIDICON.ames']
new_filenames = ['HALO-DB_dataset3925_release1_Hotwire_AC20__01_10_2014_ACRIDICON_COPY.ames', 
             'HALO-DB_dataset3927_release1_Hotwire_AC22__04_10_2014_ACRIDICON_COPY.ames']

# error flag value inserted as the second column of every flagged row
ERROR_FLAG = '-9999.0000'

# look the error records up by filename rather than by position in
# incomplete_error, so the cell does not depend on the order files were parsed in
incomplete_by_filename = {record['filename']: record for record in incomplete_error}

for filename, new_filename in zip(filenames, new_filenames):
    with open(filename, "r") as f:
        lines = f.readlines()

    # 0-based indices of the flagged lines (second element of each error tuple);
    # a KeyError here means the file was not reported as incomplete
    err_line_inds = {tup[1] for tup in incomplete_by_filename[filename]['error_data']}

    # build the corrected content first so a failure cannot leave a half-written copy
    new_lines = []
    for j, line in enumerate(lines):
        if j in err_line_inds:
            split_line = line.split()
            new_lines.append(split_line[0] + ' ' + ERROR_FLAG + ' ' + split_line[1] + '\n')
        else:
            new_lines.append(line)

    with open(new_filename, "w") as new_f:
        new_f.writelines(new_lines)

In [3]:
# THIRD RUN #
# Print the success count, then each error category followed by its filenames.

report_sections = (
    (ames_error, ' files not in ames 1001-type format:'),
    (header_error, ' files with header error:'),
    (incomplete_error, ' files with incomplete data rows:'),
    (other_error, ' files with unknown error:'),
)

print(len(all_good), ' files processed with no error.')
for failed_files, caption in report_sections:
    print()
    print(len(failed_files), caption)
    for record in failed_files:
        print(record['filename'])

360  files processed with no error.

0  files not in ames 1001-type format:

0  files with header error:

15  files with incomplete data rows:
HALO-DB_dataset3019_release1_AC09_2014_09_11_CAS_V1.ames
HALO-DB_dataset3020_release1_AC10_2014_09_12_CAS_V1.ames
HALO-DB_dataset3021_release1_AC11_2014_09_16_CAS_V1.ames
HALO-DB_dataset3022_release1_AC12_2014_09_18_CAS_V1.ames
HALO-DB_dataset3023_release1_AC13_2014_09_19_CAS_V1.ames
HALO-DB_dataset3024_release1_AC14_2014_09_21_CAS_V1.ames
HALO-DB_dataset3025_release1_AC15_2014_09_23_CAS_V1.ames
HALO-DB_dataset3026_release1_AC16_2014_09_25_CAS_V1.ames
HALO-DB_dataset3027_release1_AC17_2014_09_27_CAS_V1.ames
HALO-DB_dataset3029_release1_AC18_2014_09_28_CAS_V1.ames
HALO-DB_dataset3030_release1_AC19_2014_09_30_CAS_V1.ames
HALO-DB_dataset3031_release1_AC20_2014_10_01_CAS_V1.ames
HALO-DB_dataset3032_release1_AC21_2014_10_03_CAS_V1.ames
HALO-DB_dataset3033_release1_AC22_2014_10_04_CAS_V1.ames
HALO-DB_dataset3917_release1_CAS_DPOL_AC12__18_09_2014_ACRI

In [6]:
# Persist all_good with IPython's %store magic so it can be used in later sessions;
# run `%store -r` in a new session to load it back.
%store all_good

Stored 'all_good' (list)


In [12]:
# Convert to csv files
# Each CSV holds three header rows (file date, variable names, error values)
# followed by the data rows.
import csv
import os

for parsed in all_good:
    # swap the extension instead of assuming the name ends in exactly five characters
    csv_filename = os.path.splitext(parsed['filename'])[0] + '.csv'
    header_rows = [parsed['infile_date'], parsed['var_names'], parsed['err_vals']]
    # newline='' lets the csv module control line endings; the with-block closes the file
    with open(csv_filename, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(header_rows)
        writer.writerows(parsed['data'])

In [1]:
# Reload the variables previously saved with %store (all_good is used by the next cell).
%store -r

In [6]:
# Write the name of every successfully parsed file, one per line.
with open('good_ames_files.txt', 'w') as out_file:
    out_file.writelines(entry['filename'] + '\n' for entry in all_good)