### Time series estimation

In [1]:
import os
import sys
# Put the parent directory on the import path (presumably so the project-local
# `Utils` package imported further down can be found).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
# `display`/`HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that widens the notebook's `.container` element to 80% of the page.
display(HTML("<style>.container { width:80% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

In [3]:
from Utils import TransantiagoConstants

In [4]:
# Directory constants taken from the project-level TransantiagoConstants module.
DTPMDir = TransantiagoConstants.DTPMDir
# Base directory under which the '1_DAILY_SUMMARY' CSV files used by this notebook are located.
DTPM_TRXDir = TransantiagoConstants.DTPM_TRXDir

In [5]:
def _pad_to_reference_days(frame, reference_days, day_keys):
    """Outer-merge `frame` with `reference_days` on `day_keys` and fill every missing value with 0."""
    padded = frame.merge(reference_days, how='outer', on=day_keys)
    return padded.fillna(0)


def reOrderingDataFrames():
    """Combine the daily 'pn' and 'zp' summaries into one wide table.

    Reads ``daily_pn_summary.csv`` and ``daily_zp_summary.csv`` from the
    ``1_DAILY_SUMMARY`` folder under ``DTPM_TRXDir``. The 'pn' rows are split
    into three groups by the ``torniquete_mariposa`` / ``no_torniquete`` flags.
    The rows with ``no_torniquete == 1`` provide the base table (covariates and
    the set of days); the other two groups and the 'zp' table are left-joined
    onto it by (YEAR, MONTH, YEAR_DAY). A group with no row for a base day
    contributes 0 instead of NaN.

    Returns
    -------
    pandas.DataFrame
        The base columns plus ``pn_SUM_TRX``/``pn_SUM_EXP``/``ratio`` suffixed
        ``_no_t`` (no_torniquete == 1), ``_3t`` (both flags 0) and ``_tm``
        (torniquete_mariposa == 1), plus ``zp_SUM_TRX`` and
        ``zp_SUM_TRX_NO_VALIDAS``, sorted by YEAR, MONTH, YEAR_DAY.
    """
    day_keys = ['YEAR', 'MONTH', 'YEAR_DAY']
    pn_value_columns = ['pn_SUM_TRX', 'pn_SUM_EXP', 'ratio']
    zp_value_columns = ['zp_SUM_TRX', 'zp_SUM_TRX_NO_VALIDAS']
    base_columns = ['YEAR_DAY', 'UN', 'YEAR', 'MONTH', 'SATURDAY', 'SUNDAY', 'DATE', 'pn_SUM_TRX', 'pn_SUM_EXP',
                    'ratio', 'Buses', 'Estudiantes Ed. Media/Superior', 'm_ofertados', 'kms_ofertados',
                    'Enero', 'Febrero', 'Julio', 'Nov_2017', 'Dic_2017', 't', 'Feriado_laboral', 'Feriado_no_laboral',
                    'Censo_Elecciones', 'Partido', 'FDS_Largo', 'Disturbios', 'Corte_Metro', 'Retraso_Metro',
                    'Incidente_Metro', 'Bucle', 'Clima', 'visperas_laborales']

    pn_input_path = os.path.join(DTPM_TRXDir, '1_DAILY_SUMMARY/daily_pn_summary.csv')
    zp_input_path = os.path.join(DTPM_TRXDir, '1_DAILY_SUMMARY/daily_zp_summary.csv')

    pn_summary = pd.read_csv(pn_input_path, sep=';', encoding='latin-1', index_col=0)
    zp_summary = pd.read_csv(zp_input_path, sep=';', encoding='latin-1', index_col=0)

    # Parse DATE element by element into datetime.date objects.
    # The zp DATE column does not reach the output; it is parsed as in the original code.
    pn_summary['DATE'] = pn_summary['DATE'].apply(lambda x: pd.to_datetime(x).date())
    zp_summary['DATE'] = zp_summary['DATE'].apply(lambda x: pd.to_datetime(x).date())

    # Split the pn rows by the two flag columns.
    no_turnstile = pn_summary.loc[(pn_summary['torniquete_mariposa'] == 0)
                                  & (pn_summary['no_torniquete'] == 1), :]
    three_turnstile = pn_summary.loc[(pn_summary['torniquete_mariposa'] == 0)
                                     & (pn_summary['no_torniquete'] == 0), :]
    butterfly_turnstile = pn_summary.loc[(pn_summary['torniquete_mariposa'] == 1)
                                         & (pn_summary['no_torniquete'] == 0), :]

    # The no_torniquete == 1 rows define both the base columns and the reference set of days.
    reference_days = no_turnstile.loc[:, day_keys]
    summary = no_turnstile.loc[:, base_columns]

    # First merge: both sides carry the pn value columns, so they are suffixed here.
    three_turnstile = _pad_to_reference_days(three_turnstile, reference_days, day_keys)
    summary = summary.merge(three_turnstile.loc[:, day_keys + pn_value_columns],
                            how='left', on=day_keys, suffixes=('_no_t', '_3t'))

    # Second merge: the value columns arrive unsuffixed and are renamed to *_tm below.
    butterfly_turnstile = _pad_to_reference_days(butterfly_turnstile, reference_days, day_keys)
    summary = summary.merge(butterfly_turnstile.loc[:, day_keys + pn_value_columns],
                            how='left', on=day_keys)

    # Third merge: zp transaction counts.
    zp_summary = _pad_to_reference_days(zp_summary, reference_days, day_keys)
    summary = summary.merge(zp_summary.loc[:, day_keys + zp_value_columns],
                            how='left', on=day_keys)

    # Renaming and final sorting.
    summary = summary.rename(columns={"pn_SUM_TRX": "pn_SUM_TRX_tm", "pn_SUM_EXP": "pn_SUM_EXP_tm", "ratio": "ratio_tm"})
    summary = summary.sort_values(by=day_keys, ascending=[True, True, True])

    return summary

### Estimation

In [6]:
import statsmodels.api as sm

  from pandas.core import datetools


In [None]:
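# One-off regeneration of daily_summary.csv (left commented out; this cell was not executed).
# Uncomment to rebuild the file that the next cell reads.
#summary = reOrderingDataFrames()

#summary_output_path = os.path.join(DTPM_TRXDir, '1_DAILY_SUMMARY/daily_summary.csv')

#summary.to_csv(summary_output_path, sep=";", encoding="latin-1")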

In [7]:
# Load the previously saved merged daily summary instead of rebuilding it.
summary_input_path = os.path.join(DTPM_TRXDir, '1_DAILY_SUMMARY/daily_summary.csv')
# No index_col is given, so a saved index (if the file has one) is read as an ordinary
# column; the cells below select columns by name only.
summary = pd.read_csv(summary_input_path, sep=";", encoding="latin-1")

In [8]:
# Pull each transaction-count column out of `summary` as a plain array.
no_turnstile_trx = summary['pn_SUM_TRX_no_t'].values
three_turnstile_trx = summary['pn_SUM_TRX_3t'].values
butterfly_turnstile_trx = summary['pn_SUM_TRX_tm'].values
zp_trx = summary['zp_SUM_TRX'].values

In [9]:
# Row-wise total of the four transaction series. The arrays all come from the
# same `summary` frame, so they have equal length and can be added element-wise
# instead of summing tuple by tuple in Python. Wrapped in `list` so `total_trx`
# remains a list of numpy scalars.
total_trx = list(no_turnstile_trx + three_turnstile_trx + butterfly_turnstile_trx + zp_trx)

In [10]:
# Candidate explanatory variables, selected by name from the merged summary.
candidate_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'Buses', 'Estudiantes Ed. Media/Superior',
    'kms_ofertados', 'Enero', 'Febrero', 'Julio', 'Nov_2017', 'Dic_2017',
    't', 'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones', 'Partido',
    'FDS_Largo', 'Disturbios', 'Corte_Metro', 'Retraso_Metro', 'Incidente_Metro',
    'Bucle', 'Clima', 'visperas_laborales',
]
variables_independientes = summary.loc[:, candidate_regressors]

In [11]:
# 'Verano' is the sum of the 'Enero' and 'Febrero' columns.
variables_independientes['Verano'] = variables_independientes['Enero'].add(variables_independientes['Febrero'])

In [12]:
# 'Nov_Dic_2017' is the sum of the 'Nov_2017' and 'Dic_2017' columns.
variables_independientes['Nov_Dic_2017'] = variables_independientes['Nov_2017'].add(variables_independientes['Dic_2017'])

### FIRST MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'kms_ofertados',<br>
 'Verano',<br>
 'Julio',<br>
 'Nov_Dic_2017',<br>
 't',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Censo_Elecciones',<br>
 'Partido',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Bucle',<br>
 'Clima',<br>
 'visperas_laborales'

In [13]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the first model.
first_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados', 'Verano', 'Julio', 'Nov_Dic_2017',
    't', 'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones', 'Partido',
    'FDS_Largo', 'Disturbios', 'Bucle', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, first_model_regressors]

In [14]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [15]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [17]:
# Estimate the first model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.951
Method:                 Least Squares   F-statistic:                     1255.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:25:03   Log-Likelihood:                -14842.
No. Observations:                1096   AIC:                         2.972e+04
Df Residuals:                    1078   BIC:                         2.981e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               3.049e+06   3.33

### SECOND MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'kms_ofertados',<br>
 'Verano',<br>
 'Julio',<br>
 't',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Censo_Elecciones',<br>
 'Partido',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Clima',<br>
 'visperas_laborales'<br> 
<strong>Omitidas</strong>: <br>
 'Nov_Dic_2017',<br>
 'Bucle'

In [18]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the second model ('Nov_Dic_2017' and 'Bucle' left out).
second_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados', 'Verano', 'Julio',
    't', 'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones', 'Partido',
    'FDS_Largo', 'Disturbios', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, second_model_regressors]

In [19]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [20]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [21]:
# Estimate the second model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.951
Method:                 Least Squares   F-statistic:                     1423.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:25:12   Log-Likelihood:                -14843.
No. Observations:                1096   AIC:                         2.972e+04
Df Residuals:                    1080   BIC:                         2.980e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               3.053e+06   3.31

### THIRD MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'kms_ofertados',<br>
 'Verano',<br>
 'Julio',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Censo_Elecciones',<br>
 'Partido',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Clima',<br>
 'visperas_laborales'<br> 
<strong>Omitidas</strong>: <br>
 't',<br>
 'Nov_Dic_2017',<br>
 'Bucle'

In [23]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the third model ('t', 'Nov_Dic_2017' and 'Bucle' left out).
third_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados', 'Verano', 'Julio',
    'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones', 'Partido',
    'FDS_Largo', 'Disturbios', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, third_model_regressors]

In [24]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [25]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [26]:
# Estimate the third model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.946
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     1345.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:27:00   Log-Likelihood:                -14909.
No. Observations:                1096   AIC:                         2.985e+04
Df Residuals:                    1081   BIC:                         2.992e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               2.832e+06   2.88

### FOURTH MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'kms_ofertados',<br>
 'Verano',<br>
 'Julio',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Censo_Elecciones',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Clima',<br>
 'visperas_laborales'<br> 
<strong>Omitidas</strong>: <br>
 't',<br>
 'Nov_Dic_2017',<br>
 'Partido',<br>
 'Bucle'

In [27]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the fourth model ('t', 'Nov_Dic_2017', 'Partido' and 'Bucle' left out).
fourth_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados', 'Verano', 'Julio',
    'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones',
    'FDS_Largo', 'Disturbios', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, fourth_model_regressors]

In [28]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [29]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [30]:
# Estimate the fourth model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.946
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     1446.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:27:01   Log-Likelihood:                -14910.
No. Observations:                1096   AIC:                         2.985e+04
Df Residuals:                    1082   BIC:                         2.992e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                2.83e+06   2.88

### FIFTH MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'Verano',<br>
 'Julio',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Censo_Elecciones',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Clima',<br>
 'visperas_laborales'<br> 
<strong>Omitidas</strong>: <br>
 't',<br>
 'kms_ofertados',<br>
 'Nov_Dic_2017',<br>
 'Partido',<br>
 'Bucle'

In [31]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the fifth model ('t', 'kms_ofertados', 'Nov_Dic_2017', 'Partido' and 'Bucle' left out).
fifth_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'Verano', 'Julio',
    'Feriado_laboral', 'Feriado_no_laboral', 'Censo_Elecciones',
    'FDS_Largo', 'Disturbios', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, fifth_model_regressors]

In [32]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [33]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [34]:
# Estimate the fifth model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.942
Model:                            OLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     1460.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:27:03   Log-Likelihood:                -14947.
No. Observations:                1096   AIC:                         2.992e+04
Df Residuals:                    1083   BIC:                         2.998e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               3.067e+06   9721

### SIXTH MODEL

Estimado por OLS.
<br>
<strong>Y</strong>:<br>
 'total de transacciones'
<br>
<strong>X</strong>: <br>
'SATURDAY',<br>
 'SUNDAY',<br>
 'ratio_tm',<br>
 'kms_ofertados',<br>
 'Verano',<br>
 'Julio',<br>
 't',<br>
 'Feriado_laboral',<br>
 'Feriado_no_laboral',<br>
 'Partido',<br>
 'FDS_Largo',<br>
 'Disturbios',<br>
 'Clima',<br>
 'visperas_laborales'<br> 
<strong>Omitidas</strong>: <br>
 'Nov_Dic_2017',<br>
 'Bucle',<br>
 'Censo_Elecciones'

In [35]:
# Dependent variable: the summed transaction counts.
Y = total_trx
# Regressors of the sixth model ('Nov_Dic_2017', 'Bucle' and 'Censo_Elecciones' left out).
sixth_model_regressors = [
    'SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados', 'Verano', 'Julio',
    't', 'Feriado_laboral', 'Feriado_no_laboral', 'Partido',
    'FDS_Largo', 'Disturbios', 'Clima', 'visperas_laborales',
]
X = variables_independientes.loc[:, sixth_model_regressors]

In [36]:
# Add the intercept column (reported as 'const' in the summary) to the design matrix.
X = sm.add_constant(X)

In [37]:
# Specify the OLS regression of Y on X; nothing is estimated until .fit() is called.
model = sm.OLS(Y, X)

In [38]:
# Estimate the sixth model and print its regression report.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.951
Method:                 Least Squares   F-statistic:                     1521.
Date:                Thu, 01 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:28:53   Log-Likelihood:                -14845.
No. Observations:                1096   AIC:                         2.972e+04
Df Residuals:                    1081   BIC:                         2.979e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               3.053e+06   3.31

In [40]:
# Print the summary of the most recently fitted model (whatever `results` currently holds) as LaTeX source.
print(results.summary().as_latex())

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}       &        y         & \textbf{  R-squared:         } &     0.952   \\
\textbf{Model:}               &       OLS        & \textbf{  Adj. R-squared:    } &     0.951   \\
\textbf{Method:}              &  Least Squares   & \textbf{  F-statistic:       } &     1521.   \\
\textbf{Date:}                & Thu, 01 Mar 2018 & \textbf{  Prob (F-statistic):} &     0.00    \\
\textbf{Time:}                &     15:31:55     & \textbf{  Log-Likelihood:    } &   -14845.   \\
\textbf{No. Observations:}    &        1096      & \textbf{  AIC:               } & 2.972e+04   \\
\textbf{Df Residuals:}        &        1081      & \textbf{  BIC:               } & 2.979e+04   \\
\textbf{Df Model:}            &          14      & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                              & \textbf{coef} & \textbf{std err} & \textbf{t} & \textbf{P$>$$|$t$|$} & \textbf{