In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os

In [2]:
import glob

In [3]:
from Matrix import Matrix
from AuxFunctions import *

In [22]:
class TLD(pd.DataFrame):
    '''A Trip-length distribution DataFrame. Distance is index.
    Columns for time periods, segments, vehicles, etc.

    The constructors of this class (from_dist_col, from_mat_single,
    from_mat) label every band with its top end and add a row of
    zeros at distance 0.'''

    @property
    def _constructor(self):
        '''Makes pandas operations on a TLD (groupby, apply, copy,
        column selection...) return a TLD rather than a plain DataFrame.'''
        return TLD

    @staticmethod
    def nband(n):
        '''
        Returns a function that returns the corresponding n_band for x
        nband is a wrapper that parametrises n for a banding function
        Useful for pandas.groupby(nband(n))

        >>> [TLD.nband(2)(x) for x in range(7)]
        [0, 0, 2, 2, 4, 4, 6]
        '''
        return lambda x: int(x/n)*n

    def _band_width(self, level=-1):
        '''Estimates the band length as the smallest positive step
        between the distinct index values of level.
        Raises ValueError if there are fewer than two distinct values.'''
        values = np.unique(np.asarray(self.index.get_level_values(level),
                                      dtype=float))
        steps = np.diff(values)
        steps = steps[steps > 0]
        if not len(steps):
            raise ValueError('Cannot estimate the band length: the index '
                             'needs at least two distinct values.')
        return steps.min()

    def band_agg(self, n, current_bands=0):
        '''Aggregates to bands of n. Returns a new TLD.
        n             - length of the new bands. Should be a multiple of
                        the current band length, otherwise current bands
                        that straddle a new boundary are assigned whole
                        to the new band that contains their top end.
        current_bands - length of the interval. 0 to estimate it.

        Index values are read as the top end of each band (the labels
        produced by from_dist_col), so aggregating a 5-band TLD to 10
        gives the same result as building the TLD with bands of 10.
        Raises ValueError if n is smaller than the current band length.'''

        if not current_bands:
            current_bands = self._band_width(level=-1)

        if n < current_bands:
            ErrMsg = ('input n ({}) < TLD band aggregation ({}). '
                      'This function cannot be used to disaggregate a TLD'
                      ).format(n, current_bands)
            raise ValueError(ErrMsg)

        def band_top(dist):
            # ceil(dist / n) * n written with floor division: a band whose
            # top end is an exact multiple of n belongs to that same band
            # of n, not to the next one.
            return int(-(-dist // n)) * n

        TLDn = self.groupby(band_top).sum() #TLD by bands, labelled by top end
        if 0 not in TLDn.index:
            TLDn.loc[0] = 0 #TLD with initial zero value
        return TLDn.sort_index()

    def normalize(self):
        '''Normalizes TLD so TLD will contain proportion of trips
        for each distance band rather than absolute number of trips.
        Returns a new TLD; self is not modified.
        A column whose total is 0 cannot be normalized and becomes NaN.'''
        return self.div(self.sum(), axis='columns')

    @property
    def norm(self):
        '''Shortcut for normalize().'''
        return self.normalize()

    def mid_band(self, level=0, factor=0.5, current_bands=0):
        '''Re-index to the medium point of the interval.
        Works in place: the index of self is replaced, nothing is returned.
        level         - index level used to estimate the interval length
        factor        - factor to apply to the interval
        current_bands - length of the interval. 0 to estimate it.'''

        if not current_bands:
            current_bands = self._band_width(level=level)

        self.index = self.index + current_bands * factor

    def trim_index(self, index_names_to_keep='from', inplace=False):
        '''Wrapper for trim_index_df, adapted for TLD.'''
        return trim_index_df(self, index_names_to_keep, inplace)

    def to_numeric(self):
        '''Converts strings into numbers, both in the columns and in the
        index (every level, if it is a MultiIndex). Index names are kept.
        Returns a new TLD; self is not modified.'''
        converted = self.apply(pd.to_numeric)

        idx = self.index
        if isinstance(idx, pd.MultiIndex):
            levels = [pd.to_numeric(idx.get_level_values(i))
                      for i in range(idx.nlevels)]
            converted.index = pd.MultiIndex.from_arrays(levels, names=idx.names)
        else:
            # the name is set explicitly so it survives the conversion
            converted.index = pd.Index(pd.to_numeric(idx), name=idx.name)
        return converted

    def truncate(self, dist):
        '''Truncates based on the index values: keeps the rows whose
        index is strictly lower than dist.
        Note that this replaces DataFrame.truncate for TLD objects.'''
        return self.loc[self.index < dist]

    @property
    def avgdist(self, level=-1):
        '''Returns the average distance of each TLD column: the index
        distances weighted by the column values (SUMPRODUCT / SUM).
        Works for totals and for proportions. A column whose total is 0
        has no defined average.
        As this is a property, level always takes its default value
        (the last index level).'''
        dist = np.asarray(self.index.get_level_values(level), dtype=float)
        weighted = self.mul(dist, axis='index').sum()
        return weighted / self.sum()

    @staticmethod
    def from_dist_col(mat, dist_col=-1, dist_band=1, normalized=False):
        '''Returns the Trip-Length Distribution of mat,
        based on dist_col, aggregated by dist_band.
        mat        - DataFrame with trips columns and one distance column
        dist_col   - distance column: position (int) or column name
        dist_band  - length of the distance bands
        normalized - True to return proportions instead of totals
        Every band is labelled with its top end and a row of zeros is
        added at distance 0. Distances must not contain NaN.'''

        if isinstance(dist_col, int):
            dist_col = mat.columns[dist_col]

        tld = TLD(mat.copy())
        tld[dist_col] = tld[dist_col].apply(TLD.nband(dist_band))

        tld = tld.groupby(by=dist_col).sum()
        tld.index = tld.index + dist_band #top end of each band
        tld.loc[0] = 0 #fill initial zero value

        tld = TLD(tld.sort_index())

        if normalized:
            tld = tld.norm

        return tld

    @staticmethod
    def from_mat_single(mat, dist, dist_col=-1, dist_band=1, normalized=False):
        '''Returns the Trip-Length Distribution of mat,
        based on distance (dist_col) from dist, aggregated by dist_band.
        dist can have any number of columns, but only dist_col will be
        used for the TLD. dist_col admits integer and column name.
        Rows of mat without a distance in dist are given distance 0.'''

        if isinstance(dist_col, int):
            dist_col = dist.columns[dist_col]

        df = mat.join(dist[[dist_col]]).fillna(0)
        return TLD.from_dist_col(df, dist_col, dist_band, normalized=normalized)

    @staticmethod
    def from_mat(mat, dist, dist_band=1, normalized=False):
        '''Returns the Trip-Length Distribution of mat.
        TLD for each mat column will be based on the corresponding
        column from dist (in order). mat and dist must have the same
        number of columns, or just the first distance column will be
        used.'''

        if len(mat.columns) != len(dist.columns):
            return TLD.from_mat_single(mat, dist, dist_col=0,
                                       dist_band=dist_band,
                                       normalized=normalized)

        TLDs = [TLD.from_dist_col(df, dist_col=1, dist_band=dist_band)
                for df in zip_df_cols([mat, dist])]
        if not TLDs:
            return TLD()

        # bands missing from one of the columns are left as NaN
        tld = TLD(pd.concat(TLDs, axis=1)).sort_index()

        if normalized:
            tld = tld.norm

        return tld

    @staticmethod
    def read_EMME_TLD(file):
        '''Returns TLD df from an EMME TLD report file, with columns:
        ['from','to','density_abs','density_norm','cumulative_abs','cumulative_norm']
        'from' and 'to' are set as the index. Values are returned as the
        strings found in the file (see to_numeric to convert them).
        Lines are only recognised after a line break, so the first line
        of the file is never read as data.
        '''

        # EMME_TLD_cols - in order, position matters
        EMME_TLD_cols = ['from','to','density_abs','density_norm','cumulative_abs','cumulative_norm']

        idx_cols = EMME_TLD_cols[:2]

        # RegEx to read EMME format:
        NumberPat = r'-?\.?\d*\.?\d+'
        TLDRowPat = r'(?<=\n)\s*({0})\s+({0})\s+({0})\s+({0})\s+({0})\s+({0})'
        EMMErecord_re = re.compile(TLDRowPat.format(NumberPat))

        # Read data
        with open(file, 'r') as f:
            data = EMMErecord_re.findall(f.read())

        # Convert data to DataFrame
        df = pd.DataFrame.from_records(data,
                                       columns=EMME_TLD_cols,
                                       index=idx_cols)
        return TLD(df)

    @staticmethod
    def read_EMME_TLDs(files):
        '''Reads all TLD reports specified in files
        and returns four DataFrames, with the TLDs combined.
        Recommended: use glob to get the list of files from a pattern.
        Returns one DataFrame for each of the TLD EMME columns:
        ['density_abs','density_norm','cumulative_abs','cumulative_norm']
        The columns of each DataFrame are named after the files.
        '''
        files = list(files) #files is read twice, so it can't be a one-off iterator
        TLDs = [TLD.read_EMME_TLD(file) for file in files]
        combinedTLDs = list(PairWiseColumnGroups(TLDs))

        filenames = [os.path.basename(file) for file in files]
        for tld in combinedTLDs:
            tld.columns = filenames
        density_abs, density_norm, cumulative_abs, cumulative_norm = combinedTLDs

        return density_abs, density_norm, cumulative_abs, cumulative_norm

    @staticmethod
    def _decorated_columns(columns, markers, arg_name, prepend):
        '''Returns the column names with one marker per column, prepended
        or appended. Raises ValueError if the lengths are different.'''
        markers = list(markers)
        if len(markers) != len(columns):
            raise ValueError("{} must have the same length as df.columns.".format(arg_name))
        if prepend:
            return [str(marker) + str(col) for col, marker in zip(columns, markers)]
        return [str(col) + str(marker) for col, marker in zip(columns, markers)]

    #TODO: Set xmax, ymax for x and y axes
    def to_JPG(self, OutputName='TLD.png', title='Trip-Length Distribution',
                   ylabel='Trips', units='',
                   legend=False, table_font_colors=True,
                   prefixes='', suffixes='',
                   *args, **kwargs):
        '''Produces a graph from TLD, all columns together, and saves it
        as OutputName. Includes a table with the average distances.
            OutputName        - path of the image to write
            title, ylabel     - title of the graph and label of the y axis
            units             - distance units, shown in the table header
            legend            - True to add a legend below the graph
            table_font_colors - True to write each table row label in the
                                colour of its line
            prefixes          - to prepend to each column. Use as a marker.
            suffixes          - to append to each column. Use as a marker.
        prefixes and suffixes need one item per column and are only used
        in the graph: the columns of self are not renamed.
        Additional args and kwargs are accepted but not used.
        Raises ValueError if prefixes or suffixes have the wrong length
        or if the resulting column names are not unique.
        '''

        data = TLD(self.copy()) #work on a copy: plotting must not rename self

        if prefixes:
            data.columns = TLD._decorated_columns(data.columns, prefixes,
                                                  'prefixes', prepend=True)

        if suffixes:
            data.columns = TLD._decorated_columns(data.columns, suffixes,
                                                  'suffixes', prepend=False)

        if duplicates_in_list(data.columns):
            raise ValueError("Duplicate names in DataFrame's columns.")

        # A dedicated figure, always closed at the end, so that neither the
        # caller's current figure is cleared nor figures are left open.
        fig, axs_subplot = plt.subplots()
        try:
            data.plot(ax=axs_subplot, title=title, legend=legend)
            line_colors = [line.get_color() for line in axs_subplot.lines]

            if legend:
                axs_subplot.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
                                   fancybox=True, ncol=len(data.columns))
            axs_subplot.set_xlabel('Dist')
            axs_subplot.set_ylabel(ylabel)

            if units:
                col_label = 'Avg Dist ({})'.format(units)
            else:
                col_label = 'Avg Dist'

            avg_dists = data.avgdist
            table = axs_subplot.table(
                cellText=[['{:,.2f}'.format(avg_dists[col])] for col in data.columns],
                colWidths=[0.1],
                rowLabels=[' {} '.format(col) for col in data.columns],
                colLabels=[col_label],
                loc='upper right')
            table.scale(2, 2)

            if table_font_colors:
                cells = table.get_celld()
                # row 0 is the header; row labels are in column -1
                for row, color in enumerate(line_colors, start=1):
                    cells[(row, -1)].set_text_props(color=color)

            fig.savefig(OutputName, bbox_inches='tight')
        finally:
            plt.close(fig)

    def TLD_cols_to_JPGs(self, oFileNamePattern='TLD_{}.png', *args, **kwargs):
        '''Produces a graph for each column of TLD.
        Names based on oFileNamePattern and column names.
        Includes average distance.
        Additional args and kwargs are passed on to to_JPG.'''
        for col in self.columns:
            oFname = oFileNamePattern.format(col)
            TLD(self[[col]]).to_JPG(oFname, *args, **kwargs)

    #TODO: output average distances as DataFrame (and export as csv?)
    @staticmethod
    def comparison_to_JPGs(TLDs, oFileNamePattern='TLD_{}.png', *args, **kwargs):
        '''Produces comparison graphs of the columns in each TLD in TLDs list.
        Columns are taken pairwise, in positional order.
        Names based on column names.
        Additional args and kwargs are passed on to to_JPG.'''
        for comparison in zip_df_cols(TLDs):
            TLDname = '-'.join(str(col) for col in comparison.columns)
            OutputName = oFileNamePattern.format(TLDname)
            TLD(comparison).to_JPG(OutputName, *args, **kwargs)

In [10]:
## TODO: Re-test the cells below using the new TLD class

In [6]:
ex_matrixf = os.path.join('example_data', 'ex_matrix_1.csv')
# pd.read_csv replaces DataFrame.from_csv, which is no longer available in
# current pandas. The first two columns (zone_O, zone_D) form the index.
ex_matrix = Matrix(pd.read_csv(ex_matrixf, index_col=[0, 1]))
ex_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,T
zone_O,zone_D,Unnamed: 2_level_1
1001,1001,16.0
1001,1002,8.0
1001,1003,16.0
1001,1004,6.0
1001,1005,2.0
1001,1006,6.0
1001,1007,3.0
1001,1008,6.0
1001,1009,6.0
1001,1010,10.0


In [7]:
ex_skimdistf = os.path.join('example_data', 'ex_skimdist_1.csv')
# pd.read_csv replaces DataFrame.from_csv, which is no longer available in
# current pandas. The first two columns (zone_O, zone_D) form the index.
ex_skimdist = Matrix(pd.read_csv(ex_skimdistf, index_col=[0, 1]))
ex_skimdist

Unnamed: 0_level_0,Unnamed: 1_level_0,meters
zone_O,zone_D,Unnamed: 2_level_1
1001,1002,395.00003
1001,1003,625.00000
1001,1004,1045.00012
1001,1005,1225.00012
1001,1006,2404.99951
1001,1007,784.99988
1001,1008,1345.00000
1001,1009,1699.99976
1001,1010,1610.00012
1001,1011,1965.00000


In [27]:
#infill intrazonal:
# NOTE(review): when this cell was last run the call below failed with
# "TypeError: fill_intrazonals() missing 1 required positional argument: 'func'",
# so fill_intrazonals seems to need a second argument - check its signature
# in Matrix before re-running.
ex_skimdist.fill_intrazonals(0)

TypeError: fill_intrazonals() missing 1 required positional argument: 'func'

In [26]:
ex_matrix.join(ex_skimdist)

Unnamed: 0_level_0,Unnamed: 1_level_0,T,meters
zone_O,zone_D,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,1001,16.0,
1001,1002,8.0,395.00003
1001,1003,16.0,625.00000
1001,1004,6.0,1045.00012
1001,1005,2.0,1225.00012
1001,1006,6.0,2404.99951
1001,1007,3.0,784.99988
1001,1008,6.0,1345.00000
1001,1009,6.0,1699.99976
1001,1010,10.0,1610.00012


In [23]:
ex_TLD = TLD.from_mat(ex_matrix, ex_skimdist, 5)
ex_TLD

AttributeError: 'float' object has no attribute 'fillna'

In [None]:
ex_TLD.sum()

In [25]:
from MatrixExamples import mat



In [26]:
mat

Unnamed: 0_level_0,Unnamed: 1_level_0,T1,T2,T3
O,D,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,1,0
1,2,1,2,6
1,3,1,3,10
1,4,0,4,12
1,5,1,5,12
1,6,1,6,10
1,7,0,7,6
2,1,1,8,0
2,2,1,9,0
2,3,0,10,10


In [27]:
# Synthetic distances derived from the O and D zone numbers of mat's index.
dst = mat.copy()
origins = dst.index.get_level_values(0)
destinations = dst.index.get_level_values(1)
dst['T1'] = (origins**2 - destinations**2)**2
# The cell used to assign dst['T2'] twice in a row (T1 / origin, then
# T1 / destination), so only the second value was ever kept. The dead first
# assignment has been removed; the third column keeps the values of mat.
# NOTE(review): the first assignment may have been meant for 'T3' - confirm.
dst['T2'] = dst['T1'] / destinations
dst.columns = 'D1 D2 D3'.split()
dst

Unnamed: 0_level_0,Unnamed: 1_level_0,D1,D2,D3
O,D,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0.0,0
1,2,9,4.5,6
1,3,64,21.333333,10
1,4,225,56.25,12
1,5,576,115.2,12
1,6,1225,204.166667,10
1,7,2304,329.142857,6
2,1,9,9.0,0
2,2,0,0.0,0
2,3,25,8.333333,10


In [28]:
# TLD_SingleDist is now TLD.from_mat_single. dist_col=0 selects the first
# distance column (D1), as in the output recorded below.
TLD_single = TLD.from_mat_single(mat, dst, dist_col=0, dist_band=5)
TLD_single

Unnamed: 0_level_0,T1,T2,T3
D1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,0.0
5,4.0,175.0,0.0
10,2.0,10.0,6.0
30,0.0,26.0,16.0
50,2.0,42.0,22.0
65,2.0,18.0,10.0
85,2.0,58.0,24.0
125,0.0,74.0,22.0
145,2.0,34.0,18.0
170,2.0,90.0,16.0


In [29]:
# TLD_MultiDist is now TLD.from_mat: one distance column per trips column.
TLD_multi = TLD.from_mat(mat, dst, dist_band=5)
TLD_multi

Unnamed: 0,T1,T2,T3
0,0.0,0.0,0.0
5,4.0,177.0,0.0
10,2.0,18.0,72.0
15,,34.0,264.0
20,,50.0,
25,,151.0,
30,0.0,48.0,
40,,11.0,
50,2.0,,
55,,19.0,


In [30]:
mat.sum()

T1      32
T2    1225
T3     336
dtype: int64

In [31]:
TLD_single.sum()

T1      32.0
T2    1225.0
T3     336.0
dtype: float64

In [32]:
TLD_multi.sum()

T1      32.0
T2    1225.0
T3     336.0
dtype: float64

In [33]:
# normalize_TLD is now the TLD.normalize method; every column should add up to 1.
TLD_multi.normalize().sum()

T1    1.0
T2    1.0
T3    1.0
dtype: float64

In [34]:
# band_agg_TLD is now the TLD.band_agg method; the totals must not change.
TLD_multi.band_agg(10).sum()

T1      32.0
T2    1225.0
T3     336.0
dtype: float64

In [35]:
OutputName = os.path.join('example_outputs', 'TLD.png')
# TLD_to_JPG is now the TLD.to_JPG method.
TLD_multi.to_JPG(OutputName=OutputName)

In [36]:
oFileNamePattern = os.path.join('example_outputs', 'TLD_{}.png')
# TLD_cols_to_JPGs is now a method of TLD: one image per column.
TLD_multi.TLD_cols_to_JPGs(oFileNamePattern=oFileNamePattern)

In [37]:
# Three variations of TLD_multi to be compared column by column.
TLD1 = TLD_multi.copy()
TLD2 = TLD_multi.copy() + 3
TLD3 = TLD_multi.copy()
TLD3 = TLD3.apply(lambda x: x + TLD3.index.get_level_values(0))
TLDs = [TLD1, TLD2, TLD3]
# The loop variable must not be called TLD: that would rebind the name of the
# TLD class to a DataFrame for every cell run afterwards.
for i, tld_variant in enumerate(TLDs, start=1):
    tld_variant.columns = ['mat{}_{}'.format(i, col) for col in tld_variant]

In [38]:
oFileNamePattern = os.path.join('example_outputs', 'TLD_{}.png')
# TLD_comparison_to_JPGs is now the static method TLD.comparison_to_JPGs.
TLD.comparison_to_JPGs(TLDs, oFileNamePattern=oFileNamePattern)

In [39]:
TLD_multi

Unnamed: 0,T1,T2,T3
0,0.0,0.0,0.0
5,4.0,177.0,0.0
10,2.0,18.0,72.0
15,,34.0,264.0
20,,50.0,
25,,151.0,
30,0.0,48.0,
40,,11.0,
50,2.0,,
55,,19.0,


In [65]:
# mid_interval_TLD is now the TLD.mid_band method, which re-indexes in place,
# so it is applied to a copy to leave TLD_multi untouched.
TLD_mid = TLD(TLD_multi.copy())
TLD_mid.mid_band()
TLD_mid

Unnamed: 0,T1,T2,T3
2.5,0.0,0.0,0.0
7.5,4.0,177.0,0.0
12.5,2.0,18.0,72.0
17.5,,34.0,264.0
22.5,,50.0,
27.5,,151.0,
32.5,0.0,48.0,
42.5,,11.0,
52.5,2.0,,
57.5,,19.0,
