In [1]:
## JAC 31/07/2016
## This will read trip ends in EMME format and convert to csv
## The specified files for each matrix will be read,
## and a single csv with all the Trip-Ends will be produced.

In [2]:
import re
import glob
import pandas as pd
import os

In [4]:
## Prepare sorting functions
def sort_by_list_key_func(lst):
    '''Returns a function to be used as a sorting key.

    The returned function takes a string and returns an integer:
    the position, within the input list, of the element of the list
    that is found in the string (if several elements are present,
    the one occurring leftmost in the string is used).
    Strings containing no element of the list, and values that are not
    strings, get -1: this puts elements not found at the beginning.

    Elements of the list are matched literally, so characters with a
    special meaning in regular expressions ('.', '+', '(' ...) are safe.

    >>> sorted('a b c A B C'.split(),
    ...         key=sort_by_list_key_func('b c a'.split()))
    ['A', 'B', 'C', 'b', 'c', 'a']
    >>> sort_by_list_key_func(['(x)', 'y'])('a(x)')
    0
    >>> sort_by_list_key_func([])('anything')
    -1

    '''
    items = list(lst)

    # Position of the first occurrence of each element (same result as
    # list.index when the list holds duplicates).
    positions = {}
    for pos, item in enumerate(items):
        positions.setdefault(item, pos)

    # Escape every element so it is searched for as plain text.
    # An empty list gets no pattern at all: nothing can ever be found.
    if items:
        lst_re = re.compile('|'.join(re.escape(item) for item in items))
    else:
        lst_re = None

    def sort_func(s):
        'Sorting function'
        # Non-string values (e.g. tuple or numeric column labels) cannot
        # be searched: treat them as "not found".
        if lst_re is None or not isinstance(s, str):
            return -1
        found = lst_re.search(s)
        if found is None:
            return -1
        return positions[found.group()]
    return sort_func

In [5]:
def sort_df_by_lists(df, lists):
    '''Returns df with its columns re-ordered according to the input lists.

    Each list in lists is applied in turn as a sorting criterion on the
    column names: a column is ranked by the position, within the list,
    of the list element found in its name.
    Sorting is stable, so the last list is the main criterion and the
    earlier lists only break ties.

    >>> list(sort_df_by_lists(
    ...         pd.DataFrame(columns=['A1', 'A2', 'A3', 'B1', 'B2', 'B3']),
    ...         [['B', 'A'],['3', '1', '2']]
    ...         ).columns)
    ['B3', 'A3', 'B1', 'A1', 'B2', 'A2']

    '''
    ordered = list(df.columns)
    for criterion in lists:
        # one stable pass per list, each refining the previous order
        ordered = sorted(ordered, key=sort_by_list_key_func(criterion))
    return df[ordered]

In [6]:
def ReadEMME_matrices(sFiles):
    '''Reads all EMME matrix files specified in sFiles list
    and returns a DataFrame, with the files combined:
    one column for each EMME matrix read.
    Note: EMME files can contain several matrices per file.

    Assumes one single value per row in the EMME files.

    Parameters
    ----------
    sFiles : iterable of str
        Paths of the EMME text files to read.

    Returns
    -------
    pandas.DataFrame
        One column per matrix, named with the file name (no directory,
        no extension) plus a suffix given by the matrix type:
        '_TO' for mo, '_TD' for md, nothing for mf.
        The rows are the union of the zones (or O/D pairs) found in all
        the matrices; entries a matrix does not have are NaN.
        Index labels and values are kept as the text read from the
        files (no numeric conversion is done).
        An empty DataFrame is returned if no matrix is read.

    Notes
    -----
    * Scalar matrices (ms) are not implemented: they are skipped,
      with a warning.
    * Matrices of the same type inside the same file map to the same
      column name, so only the last one of them is kept.
    '''
    import warnings  # only needed to report unsupported matrix types

    # Column layout per matrix type: index column(s) first, values last
    EMMErecord_cols = {
        'md': ['zone', '_TD'],
        'mo': ['zone', '_TO'],
        'mf': ['O', 'D', '']
        } #TODO: remove difference by TO /TD / T ??

    ## RegEx to read EMME format:
    # The data group is non-greedy and stops right before the next
    # "a matrix" / "d matrix" line (or the end of the file), so every
    # matrix in a file is captured on its own.
    mat_re = re.compile(
        r'a matrix\s*=\s*(mo|md|mf|ms)(\d+)\s+(\w+?)\s+(-?\d+)\s+(.+?)\n'
        r'(.*?)(?=^\s*[ad] matrix|\Z)',
        re.DOTALL | re.MULTILINE)
    # re groups:
    # mat_type, mat_num, mat_name, mat_default, mat_desc, mat_data

    EMMErecord_re = {
        'md': re.compile(r'\s*all\s+(\d+)\s*:\s*(-?\.?\d+\.?\d*)\n'),
        'mo': re.compile(r'\s*(\d+)\s+all\s*:\s*(-?\.?\d+\.?\d*)\n'),
        'mf': re.compile(r'\s*(\d+)\s+(\d+)\s*:\s*(-?\.?\d+\.?\d*)\n')
        } #TODO: implement ms

    ## Read Data
    data = {}
    for filep in sFiles:
        fn = os.path.splitext(os.path.basename(filep))[0]
        data[fn] = {}
        with open(filep, 'r') as f:
            fcontent = f.read()

        #each source file might contain several matrices
        for mat in mat_re.findall(fcontent):
            mat_type, mat_num, mat_name, mat_default, mat_desc, mat_data = mat
            if mat_type not in EMMErecord_re:
                warnings.warn(
                    'Skipping matrix {!r} in {!r}: matrix type {!r} '
                    'is not implemented'.format(mat_name, filep, mat_type),
                    stacklevel=2)
                continue
            # The record patterns need a newline after every row:
            # make sure the last row of the block has one too.
            if not mat_data.endswith('\n'):
                mat_data += '\n'
            mat_rows = EMMErecord_re[mat_type].findall(mat_data)
            data[fn][mat_name] = dict(zip(
               'mat_type, mat_num, mat_default, mat_desc, mat_rows'.split(', '),
               [mat_type, mat_num, mat_default, mat_desc, mat_rows]))

    ## Convert to DataFrame
    columns = {}
    for fn, mats in data.items():
        for mat_data in mats.values():
            #convert rows into df, setting column names and index
            df_cols = EMMErecord_cols[mat_data['mat_type']]
            df_idx_cols = df_cols[:-1]
            df_data_cols = df_cols[-1]
            df = pd.DataFrame.from_records(mat_data['mat_rows'],
                                           columns=df_cols,
                                           index=df_idx_cols)

            #this avoids repeating names:
            mat_id = '{}{}'.format(fn, df_data_cols)
            columns[mat_id] = df[df_data_cols]

    if not columns:
        return pd.DataFrame()

    # Single outer join of all the columns: zones missing from the first
    # matrix are kept instead of being silently dropped.
    return pd.concat(columns, axis=1)

In [7]:
## Export as csv
if __name__ == '__main__':
    # Run the examples embedded in the docstrings before doing any work
    import doctest
    doctest.testmod()

    ## HARDCODED
    ## The order inside these lists matters: it drives the column sorting
    TPs = 'AM IP PM'.split()
    DSegs = [str(seg) for seg in range(1, 8)] + 'LGV HGV'.split()
    TEs = 'Or Ds'.split()
    sorting_lists = [DSegs, TPs, TEs]

    # File name pattern(s) of the files that form each output matrix:
    sFilesPatterns = {'Adj_Syn_TE': [r'..\Adj_Syn_TE\TE_*_Adj_*_??.txt']}

    print('\nReading data ...\n')
    # Expand every pattern into the list of files it matches
    sFiles_dict = {}
    for matrix, PatternList in sFilesPatterns.items():
        sFiles_dict[matrix] = [path
                               for pattern in PatternList
                               for path in glob.glob(pattern)]

    # One DataFrame per output matrix, columns already in their final order
    data_df = {k: sort_df_by_lists(ReadEMME_matrices(files), sorting_lists)
               for k, files in sFiles_dict.items()}

    print('\nWritting data ...\n')
    for matrix, frame in data_df.items():
        print(matrix)
        frame.to_csv('{}.csv'.format(matrix))


Reading data ...


Writting data ...

Adj_Syn_TE
