# More analysis of initial system-models

Includes:
1. Analysis of with-turnstile-percentage threshold evolution
2. Analysis of models with lagged `ratio_tm` terms

In [1]:
import os
import sys
# Put the parent directory on sys.path (once) -- presumably so that the
# project-local `Utils` package imported further down can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
from IPython.core.display import display, HTML
# Inject CSS so that elements with the `.container` class use 100% of the width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import plotly.plotly
import plotly.tools as tls
import plotly.graph_objs as go

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import numpy as np
import datetime as dt
import time
import math

In [3]:
import statsmodels.api as sm


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



In [4]:
from Utils import TransantiagoConstants

In [5]:
# Base data directories, read from the project's TransantiagoConstants.
DTPMDir, DTPM_TRXDir = (
    TransantiagoConstants.DTPMDir,
    TransantiagoConstants.DTPM_TRXDir,
)

In [6]:
# Daily summary table: ';'-separated, latin-1 encoded, first column as index.
daily_input_path = os.path.join(DTPM_TRXDir, '3_DAILY/daily_summary.csv')
daily_trx = pd.read_csv(
    daily_input_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
)

### DAILY AT SYSTEM LEVEL. CREATING NEW DEPENDENT AND INDEPENDENT VARIABLES

In [7]:
# Dependent variable: element-wise sum of the four transaction-count columns.
# A NaN in any of the four components propagates to the total.
daily_trx['TOTAL_trx'] = (
    daily_trx['pn_SUM_TRX_no_t']
    + daily_trx['pn_SUM_TRX_3t']
    + daily_trx['pn_SUM_TRX_tm']
    + daily_trx['zp_SUM_TRX']
)

In [8]:
# Independent variables table: ';'-separated, latin-1 encoded, first column as
# index, with column 1 parsed as dates.
independent_variables_path = os.path.join(DTPM_TRXDir, '0_INDEPENDENTS/independents_variables.csv')
independent_variables = pd.read_csv(
    independent_variables_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
    parse_dates=[1],
)

In [9]:
# 'Verano' combines the 'Enero' and 'Febrero' columns; 'Nov_Dic_2017' combines
# the 'Nov_2017' and 'Dic_2017' columns.
independent_variables['Verano'] = (
    independent_variables['Enero'] + independent_variables['Febrero']
)
independent_variables['Nov_Dic_2017'] = (
    independent_variables['Nov_2017'] + independent_variables['Dic_2017']
)

# Week number of each DATE, then expanded into one WEEK_OF_YEAR_<n> dummy
# column per distinct week present in the data.
independent_variables['WEEK_OF_YEAR'] = independent_variables['DATE'].map(lambda ts: ts.week)
independent_variables = pd.get_dummies(independent_variables, columns=['WEEK_OF_YEAR'])

In [10]:
# Left join: every daily_trx row is kept and matched to the independent
# variables on the (YEAR, MONTH, YEAR_DAY) key.
complete_db = pd.merge(
    daily_trx,
    independent_variables,
    how='left',
    on=['YEAR', 'MONTH', 'YEAR_DAY'],
)

In [11]:
# Order the rows in place by YEAR, MONTH, YEAR_DAY (all ascending, the default).
complete_db.sort_values(by=['YEAR', 'MONTH', 'YEAR_DAY'], inplace=True)

## Dataframe to summarize results for dummy analyses

In [12]:
# Empty accumulator; summarizingResults appends three rows per fitted model.
dummies_summary = pd.DataFrame()

In [13]:
def summarizingResults(x, df, i):
    """Append the coefficients, t-values and p-values of one fit to ``df``.

    Parameters
    ----------
    x : object exposing ``params``, ``tvalues`` and ``pvalues`` as pandas
        Series (e.g. a fitted statsmodels result).
    df : pandas.DataFrame
        Summary accumulated so far; it is not modified.
    i : any
        Model label; ``str(i)`` is used as the suffix of the new row names.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by three new rows named ``params_<i>``, ``t_<i>`` and
        ``p_<i>``, with one column per entry of the Series.
    """
    suffix = str(i)
    # Renaming each Series gives it the label that becomes its row name.
    labelled = [
        getattr(x, attribute).rename(prefix + suffix)
        for prefix, attribute in (('params_', 'params'),
                                  ('t_', 'tvalues'),
                                  ('p_', 'pvalues'))
    ]

    # Side by side the Series form columns; transpose to get one row each.
    new_rows = pd.concat(labelled, axis=1).transpose()
    return pd.concat([df, new_rows], axis=0)

## Begin analyses for dummy variables...

* The model selected to analyse the evolution of the 'r_tm_x_rtm_i' variable is M8.

In [14]:
# Upper bound of the threshold scan: the maximum of 'ratio_tm', rounded up to
# an integer. Series.max() is used directly: selecting the rows that hold the
# maximum and calling .item() raises ValueError whenever the maximum occurs on
# more than one row.
max_ratio = math.ceil(complete_db['ratio_tm'].max())

In [15]:
# Regressors of the threshold model (called M8 in the notebook text). The list
# is the same for every threshold, so it is written once outside the loop.
m8_columns = [
    'SATURDAY',
    'SUNDAY',
    'ratio_tm',
    'r_tm_x_rtm_i',
    'kms_ofertados',
    'WEEK_OF_YEAR_52',
    'WEEK_OF_YEAR_53',
    'WEEK_OF_YEAR_1',
    'WEEK_OF_YEAR_2',
    'WEEK_OF_YEAR_3',
    'WEEK_OF_YEAR_4',
    'WEEK_OF_YEAR_5',
    'WEEK_OF_YEAR_6',
    'WEEK_OF_YEAR_7',
    'WEEK_OF_YEAR_8',
    'WEEK_OF_YEAR_9',
    'Julio',
    't',
    'Feriado_laboral',
    'Feriado_no_laboral',
    'FDS_Largo',
    'Disturbios',
    'Clima',
    'Partido',
    'Censo',
    'visperas_laborales',
    'N_ZPs',
]

# The dependent variable is not touched inside the loop.
Y = complete_db.loc[:, 'TOTAL_trx']

for i in range(max_ratio + 1):
    # rtm_i is 1 on rows whose ratio_tm is at or above the threshold i, else 0;
    # r_tm_x_rtm_i is ratio_tm on those rows and 0 elsewhere.
    at_or_above = complete_db.loc[:, 'ratio_tm'] >= i
    complete_db.loc[:, 'rtm_i'] = np.where(at_or_above, 1, 0)
    complete_db.loc[:, 'r_tm_x_rtm_i'] = complete_db.loc[:, 'ratio_tm'] * complete_db.loc[:, 'rtm_i']

    # The design matrix must be rebuilt each pass: r_tm_x_rtm_i just changed.
    X = sm.add_constant(complete_db.loc[:, m8_columns])
    m = sm.OLS(Y, X)
    results = m.fit()

    # One (params, t, p) triple of rows per threshold, labelled by i.
    dummies_summary = summarizingResults(results, dummies_summary, i)

In [16]:
# Persist the per-threshold summary as a ';'-separated, latin-1 encoded CSV.
dummies_summary_path = os.path.join(
    DTPM_TRXDir,
    '5_RESULTS/1_SYSTEM/0_original/contemporary_dummies_summary.csv',
)
dummies_summary.to_csv(
    dummies_summary_path,
    sep=';',
    encoding='latin-1',
)

## Dataframe to summarize results for lagged analyses

In [None]:
# Empty accumulator for the summaries of the lagged models fitted below.
lagged_summary = pd.DataFrame()

In [None]:
def summarizingLaggedResults(x, df, model_name):
    """Add one fitted model to ``df`` as three columns plus fit statistics.

    Parameters
    ----------
    x : fitted regression result exposing ``params``, ``tvalues``, ``pvalues``,
        ``rsquared_adj``, ``aic``, ``fvalue``, ``f_pvalue`` and ``resid``.
    df : pandas.DataFrame
        Summary accumulated so far.
    model_name : str
        Suffix of the new column names.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``params_<model_name>``, ``t_<model_name>`` and
        ``p_<model_name>`` added. The rows ``R2_ADJ``, ``AIC``, ``F``, ``P_F``
        and ``D_W`` of the ``params_<model_name>`` column hold the adjusted R2,
        the AIC, the F statistic, its p-value and the Durbin-Watson statistic
        of the residuals.
    """
    params_col = 'params_' + model_name
    new_columns = [
        x.params.rename(params_col),
        x.tvalues.rename('t_' + model_name),
        x.pvalues.rename('p_' + model_name),
    ]
    df = pd.concat([df] + new_columns, axis=1)

    # Fit statistics are stored as extra rows of the params column only.
    fit_stats = (
        ('R2_ADJ', x.rsquared_adj),
        ('AIC', x.aic),
        ('F', x.fvalue),
        ('P_F', x.f_pvalue),
        ('D_W', sm.stats.stattools.durbin_watson(x.resid)),
    )
    for row_label, value in fit_stats:
        df.loc[row_label, params_col] = value

    return df

## Begin analyses for lagged variables...

* The model selected to analyse lagged variables is M5.

Model 11

In [None]:
# Previous row's ratio_tm. shift(1) works on row order, so this relies on
# complete_db having been sorted by YEAR, MONTH, YEAR_DAY earlier; the first
# row gets NaN.
complete_db['lag_ratio_tm'] = complete_db['ratio_tm'].shift(1)

In [None]:
Y = complete_db.loc[:,'TOTAL_trx']
X = complete_db.loc[:,['SATURDAY',
                        'SUNDAY',
                        'lag_ratio_tm',
                        'kms_ofertados',
                        'WEEK_OF_YEAR_52',
                        'WEEK_OF_YEAR_53',
                        'WEEK_OF_YEAR_1',
                        'WEEK_OF_YEAR_2',
                        'WEEK_OF_YEAR_3',
                        'WEEK_OF_YEAR_4',
                        'WEEK_OF_YEAR_5',
                        'WEEK_OF_YEAR_6',
                        'WEEK_OF_YEAR_7',
                        'WEEK_OF_YEAR_8',
                        'WEEK_OF_YEAR_9',
                        'Julio',
                        't',
                        'Feriado_laboral',
                        'Feriado_no_laboral',
                        'FDS_Largo',
                        'Disturbios',
                        'Clima',
                        'Partido',
                        'Censo',
                        'visperas_laborales',
                        'N_ZPs']]

X = sm.add_constant(X)
m_11 = sm.OLS(Y, X, missing='drop')
results_11 = m_11.fit()
#print(results_11.summary())
lagged_summary = summarizingResults(results_11, lagged_summary, 'm11')

Model 12

In [None]:
complete_db['lag_ratio_tm_2'] = complete_db['lag_ratio_tm'] * complete_db['lag_ratio_tm']

In [None]:
Y = complete_db.loc[:,'TOTAL_trx']
X = complete_db.loc[:,['SATURDAY',
                       'SUNDAY',
                       'lag_ratio_tm',
                       'lag_ratio_tm_2',
                       'kms_ofertados',
                       'WEEK_OF_YEAR_52',
                       'WEEK_OF_YEAR_53',
                       'WEEK_OF_YEAR_1',
                       'WEEK_OF_YEAR_2',
                       'WEEK_OF_YEAR_3',
                       'WEEK_OF_YEAR_4',
                       'WEEK_OF_YEAR_5',
                       'WEEK_OF_YEAR_6',
                       'WEEK_OF_YEAR_7',
                       'WEEK_OF_YEAR_8',
                       'WEEK_OF_YEAR_9',
                       'Julio',
                       't',
                       'Feriado_laboral',
                       'Feriado_no_laboral',
                       'FDS_Largo',
                       'Disturbios',
                       'Clima',
                       'Partido',
                       'Censo',
                       'visperas_laborales',
                       'N_ZPs']]

X = sm.add_constant(X)
m_12 = sm.OLS(Y, X, missing='drop')
results_12 = m_12.fit()
#print(results_12.summary())
lagged_summary = summarizingResults(results_12, lagged_summary, 'm12')

* Model 13

In [None]:
Y = complete_db.loc[:,'TOTAL_trx']
X = complete_db.loc[:,['SATURDAY',
                       'SUNDAY',
                       'lag_ratio_tm_2',
                       'kms_ofertados',
                       'WEEK_OF_YEAR_52',
                       'WEEK_OF_YEAR_53',
                       'WEEK_OF_YEAR_1',
                       'WEEK_OF_YEAR_2',
                       'WEEK_OF_YEAR_3',
                       'WEEK_OF_YEAR_4',
                       'WEEK_OF_YEAR_5',
                       'WEEK_OF_YEAR_6',
                       'WEEK_OF_YEAR_7',
                       'WEEK_OF_YEAR_8',
                       'WEEK_OF_YEAR_9',
                       'Julio',
                       't',
                       'Feriado_laboral',
                       'Feriado_no_laboral',
                       'FDS_Largo',
                       'Disturbios',
                       'Clima',
                       'Partido',
                       'Censo',
                       'visperas_laborales',
                       'N_ZPs']]

X = sm.add_constant(X)
m_13 = sm.OLS(Y, X, missing='drop')
results_13 = m_13.fit()
print(results_13.summary())
lagged_summary = summarizingResults(results_13, lagged_summary, 'm13')