## Time series estimation - Daily at UN level

**TODO:** Check whether the R-squared values are computed correctly.

In [1]:
import os
import sys
# Put the parent directory of the working directory on the import path
# (presumably where the project-local `Utils` package lives -- confirm).
module_path = os.path.abspath(os.path.join('..'))
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
# Import from the public IPython.display module; IPython.core.display is a
# deprecated location for these names.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

In [3]:
import statsmodels.api as sm

  from pandas.core import datetools


In [4]:
from Utils import TransantiagoConstants

In [5]:
# Base directories taken from the project constants module; DTPM_TRXDir is
# the root used by the data-loading cells below.
DTPMDir, DTPM_TRXDir = (
    TransantiagoConstants.DTPMDir,
    TransantiagoConstants.DTPM_TRXDir,
)

#### Loading independent variables

In [6]:
# Read the explanatory variables from a ';'-separated, latin-1 encoded CSV.
# The first file column becomes the index and the second is parsed as dates
# (presumably the 'DATE' column used below -- confirm against the file).
independent_variables_path = os.path.join(DTPM_TRXDir, '0_INDEPENDENTS/independents_variables.csv')
independent_variables = pd.read_csv(
    independent_variables_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
    parse_dates=[1],
)

# 'Verano' is the sum of the 'Enero' and 'Febrero' columns.
verano = independent_variables.loc[:, 'Enero'] + independent_variables.loc[:, 'Febrero']
independent_variables.loc[:, 'Verano'] = verano

# 'Nov_Dic_2017' is the sum of the 'Nov_2017' and 'Dic_2017' columns.
nov_dic_2017 = independent_variables.loc[:, 'Nov_2017'] + independent_variables.loc[:, 'Dic_2017']
independent_variables.loc[:, 'Nov_Dic_2017'] = nov_dic_2017

# Week number of every date, then expanded into one WEEK_OF_YEAR_<n> dummy
# column per distinct week (the WEEK_OF_YEAR column itself is replaced).
week_of_year = independent_variables.loc[:, 'DATE'].apply(lambda date: date.week)
independent_variables.loc[:, 'WEEK_OF_YEAR'] = week_of_year
independent_variables = pd.get_dummies(independent_variables, columns=['WEEK_OF_YEAR'])

#### Loading dependent variable for U6

In [7]:
# Business unit analysed in this notebook; it selects both the folder and the file name.
UN = 'U6'
daily_input_path = os.path.join(DTPM_TRXDir, '4_DAILY_UN/{0}/{0}_summary.csv'.format(UN))
daily_trx = pd.read_csv(daily_input_path, sep=';', encoding='latin-1', index_col=0)

# TOTAL_trx is the left-to-right sum of the four *_SUM_TRX* columns; a missing
# value in any of them propagates to the total.
trx_components = ['pn_SUM_TRX_no_t', 'pn_SUM_TRX_3t', 'pn_SUM_TRX_tm', 'zp_SUM_TRX']
total_trx = daily_trx.loc[:, trx_components[0]]
for component in trx_components[1:]:
    total_trx = total_trx + daily_trx.loc[:, component]
daily_trx.loc[:, 'TOTAL_trx'] = total_trx

#### Merging and sorting for U6

In [8]:
# Left join: every row of daily_trx is kept and the independent variables
# are attached where YEAR, MONTH and YEAR_DAY all match.
complete_db = pd.merge(
    daily_trx,
    independent_variables,
    how='left',
    on=['YEAR', 'MONTH', 'YEAR_DAY'],
)

In [9]:
# Sort rows ascending by YEAR, MONTH and YEAR_DAY; the index labels are kept as they are.
complete_db = complete_db.sort_values(by=['YEAR', 'MONTH', 'YEAR_DAY'], ascending=True)

#### Estimation for U6

* Summarizing everything

In [10]:
def summarizingResults(x, df, model_name):
    """Add one fitted model's estimates and fit statistics to a summary table.

    Three columns are appended to ``df``: ``params_<model_name>``,
    ``t_<model_name>`` and ``p_<model_name>``, indexed by regressor name.
    Model-level statistics are stored in the ``params_<model_name>`` column
    under the row labels ``R2_ADJ``, ``AIC``, ``F``, ``P_F`` and ``D_W``.

    If ``df`` already holds columns for ``model_name`` (for example because
    the calling cell was re-run), they are replaced instead of duplicated.

    Parameters
    ----------
    x : fitted regression results
        Object exposing ``params``, ``tvalues``, ``pvalues`` (pandas Series),
        ``rsquared_adj``, ``aic``, ``fvalue``, ``f_pvalue`` and ``resid``;
        in this notebook, the value returned by ``sm.OLS(...).fit()``.
    df : pandas.DataFrame
        Summary table built so far. It is not modified in place.
    model_name : str
        Suffix used to name the three new columns.

    Returns
    -------
    pandas.DataFrame
        New table containing the previous columns plus this model's columns.
    """
    params_col = 'params_' + model_name
    t_col = 't_' + model_name
    p_col = 'p_' + model_name

    # Remove columns left by an earlier call for the same model so that
    # re-running a notebook cell does not create duplicate column labels.
    stale = [col for col in (params_col, t_col, p_col) if col in df.columns]
    if stale:
        df = df.drop(stale, axis=1)

    params = x.params.rename(params_col)
    t = x.tvalues.rename(t_col)
    p = x.pvalues.rename(p_col)

    # Outer-aligned on the index: regressors absent from a model end up as NaN.
    df = pd.concat([df, params, t, p], axis=1)

    # Model-level statistics live in the params column only; the matching
    # t_/p_ cells of these rows stay NaN.
    df.loc['R2_ADJ', params_col] = x.rsquared_adj
    df.loc['AIC', params_col] = x.aic
    df.loc['F', params_col] = x.fvalue
    df.loc['P_F', params_col] = x.f_pvalue
    df.loc['D_W', params_col] = sm.stats.stattools.durbin_watson(x.resid)

    return df

In [11]:
# Empty table that the cells below fill through summarizingResults, one
# params/t/p column triple per fitted model.
OLS_Summary_U6 = pd.DataFrame()

* Trying models based on U6

Model 1

In [12]:
# Model 1: full set of 19 regressors. The list order is the order in which
# the coefficients appear in the fit report.
m1_regressors = [
    'SATURDAY', 'SUNDAY',
    'ratio_tm', 'kms_ofertados_U6',
    'Verano', 'Julio', 'Nov_Dic_2017',
    't',
    'Feriado_laboral', 'Feriado_no_laboral',
    'Censo', 'Elecciones', 'Partido',
    'FDS_Largo', 'Disturbios', 'Bucle', 'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
Y = complete_db.loc[:, 'TOTAL_trx']
X = complete_db.loc[:, m1_regressors]

In [13]:
# Add the intercept column, fit ordinary least squares of Y on X and print
# the fit report. X is rebound to the augmented design matrix, which the
# prediction cell for this model reuses.
X = sm.add_constant(X)
m1 = sm.OLS(Y, X)
results_m1 = m1.fit()
print(results_m1.summary())

                            OLS Regression Results                            
Dep. Variable:              TOTAL_trx   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     1094.
Date:                Thu, 22 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:01:03   Log-Likelihood:                -12335.
No. Observations:                1096   AIC:                         2.471e+04
Df Residuals:                    1076   BIC:                         2.481e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               2.731e+05   5386

In [14]:
# Add Model 1's params/t/p columns and fit statistics to the running summary table.
OLS_Summary_U6 = summarizingResults(results_m1, OLS_Summary_U6, 'U6_m1')

In [15]:
# Model 1 predictions on the same design matrix it was fitted on, stored
# as a new column of complete_db (assignment aligns on the index).
ypred_m1 = results_m1.predict(X)
complete_db.loc[:,'ypred_m1'] = ypred_m1

--------------------

Model 2

In [16]:
# Model 2: Model 1's regressors without 'Nov_Dic_2017', 'Elecciones' and
# 'Bucle' (16 regressors).
m2_regressors = [
    'SATURDAY', 'SUNDAY',
    'ratio_tm', 'kms_ofertados_U6',
    'Verano', 'Julio',
    't',
    'Feriado_laboral', 'Feriado_no_laboral',
    'Censo', 'Partido',
    'FDS_Largo', 'Disturbios', 'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
Y = complete_db.loc[:, 'TOTAL_trx']
X = complete_db.loc[:, m2_regressors]

In [17]:
# Add the intercept column, fit ordinary least squares of Y on X and print
# the fit report. X is rebound to the augmented design matrix, which the
# prediction cell for this model reuses.
X = sm.add_constant(X)
m2 = sm.OLS(Y, X)
results_m2 = m2.fit()
print(results_m2.summary())

                            OLS Regression Results                            
Dep. Variable:              TOTAL_trx   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     1301.
Date:                Thu, 22 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:03:25   Log-Likelihood:                -12336.
No. Observations:                1096   AIC:                         2.471e+04
Df Residuals:                    1079   BIC:                         2.479e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               2.722e+05   5193

In [18]:
# Add Model 2's params/t/p columns and fit statistics to the running summary table.
OLS_Summary_U6 = summarizingResults(results_m2, OLS_Summary_U6, 'U6_m2')

In [19]:
# Model 2 predictions on the same design matrix it was fitted on, stored
# as a new column of complete_db (assignment aligns on the index).
ypred_m2 = results_m2.predict(X)
complete_db.loc[:,'ypred_m2'] = ypred_m2

------------------

Model 3

In [20]:
# Model 3: Model 2's regressors without 'Partido' (15 regressors).
m3_regressors = [
    'SATURDAY', 'SUNDAY',
    'ratio_tm', 'kms_ofertados_U6',
    'Verano', 'Julio',
    't',
    'Feriado_laboral', 'Feriado_no_laboral',
    'Censo',
    'FDS_Largo', 'Disturbios', 'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
Y = complete_db.loc[:, 'TOTAL_trx']
X = complete_db.loc[:, m3_regressors]

In [21]:
# Add the intercept column, fit ordinary least squares of Y on X and print
# the fit report. X is rebound to the augmented design matrix, which the
# prediction cell for this model reuses.
X = sm.add_constant(X)
m3 = sm.OLS(Y, X)
results_m3 = m3.fit()
print(results_m3.summary())

                            OLS Regression Results                            
Dep. Variable:              TOTAL_trx   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     1388.
Date:                Thu, 22 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:06:23   Log-Likelihood:                -12336.
No. Observations:                1096   AIC:                         2.470e+04
Df Residuals:                    1080   BIC:                         2.478e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                2.72e+05   5192

In [22]:
# Add Model 3's params/t/p columns and fit statistics to the running summary table.
OLS_Summary_U6 = summarizingResults(results_m3, OLS_Summary_U6, 'U6_m3')

In [23]:
# Model 3 predictions on the same design matrix it was fitted on, stored
# as a new column of complete_db (assignment aligns on the index).
ypred_m3 = results_m3.predict(X)
complete_db.loc[:,'ypred_m3'] = ypred_m3

Model 4

In [24]:
# Model 4: Model 3 with 'Verano' replaced by week-of-year dummies for
# weeks 52, 53 and 1 through 9 (25 regressors).
m4_week_dummies = ['WEEK_OF_YEAR_' + str(week) for week in [52, 53] + list(range(1, 10))]
m4_regressors = (
    ['SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados_U6']
    + m4_week_dummies
    + [
        'Julio',
        't',
        'Feriado_laboral', 'Feriado_no_laboral',
        'Censo',
        'FDS_Largo', 'Disturbios', 'Clima',
        'visperas_laborales',
        'N_ZPs_UN6',
    ]
)
Y = complete_db.loc[:, 'TOTAL_trx']
X = complete_db.loc[:, m4_regressors]

In [25]:
# Add the intercept column, fit ordinary least squares of Y on X and print
# the fit report. X is rebound to the augmented design matrix, which the
# prediction cell for this model reuses.
X = sm.add_constant(X)
m4 = sm.OLS(Y, X)
results_m4 = m4.fit()
print(results_m4.summary())

                            OLS Regression Results                            
Dep. Variable:              TOTAL_trx   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     981.3
Date:                Thu, 22 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:12:47   Log-Likelihood:                -12245.
No. Observations:                1096   AIC:                         2.454e+04
Df Residuals:                    1070   BIC:                         2.467e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                2.72e+05   4825

In [26]:
# Add Model 4's params/t/p columns and fit statistics to the running summary table.
OLS_Summary_U6 = summarizingResults(results_m4, OLS_Summary_U6, 'U6_m4')

In [27]:
# Model 4 predictions on the same design matrix it was fitted on, stored
# as a new column of complete_db (assignment aligns on the index).
ypred_m4 = results_m4.predict(X)
complete_db.loc[:,'ypred_m4'] = ypred_m4

Model 5

In [28]:
# Squared ratio_tm, used by Model 5 as a quadratic term.
ratio_tm = complete_db.loc[:, 'ratio_tm']
complete_db.loc[:, 'ratio_tm_2'] = ratio_tm * ratio_tm

In [29]:
# Model 5: Model 4 plus the quadratic term 'ratio_tm_2', placed right
# after 'ratio_tm' (26 regressors).
m5_week_dummies = ['WEEK_OF_YEAR_' + str(week) for week in [52, 53] + list(range(1, 10))]
m5_regressors = (
    ['SATURDAY', 'SUNDAY', 'ratio_tm', 'ratio_tm_2', 'kms_ofertados_U6']
    + m5_week_dummies
    + [
        'Julio',
        't',
        'Feriado_laboral', 'Feriado_no_laboral',
        'Censo',
        'FDS_Largo', 'Disturbios', 'Clima',
        'visperas_laborales',
        'N_ZPs_UN6',
    ]
)
Y = complete_db.loc[:, 'TOTAL_trx']
X = complete_db.loc[:, m5_regressors]

In [30]:
# Add the intercept column, fit ordinary least squares of Y on X and print
# the fit report. X is rebound to the augmented design matrix.
X = sm.add_constant(X)
m5 = sm.OLS(Y, X)
results_m5 = m5.fit()
print(results_m5.summary())

                            OLS Regression Results                            
Dep. Variable:              TOTAL_trx   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     944.0
Date:                Thu, 22 Mar 2018   Prob (F-statistic):               0.00
Time:                        15:21:57   Log-Likelihood:                -12245.
No. Observations:                1096   AIC:                         2.454e+04
Df Residuals:                    1069   BIC:                         2.468e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 2.7e+05   5085

In [31]:
# Display the largest ratio_tm value in the merged data.
complete_db.loc[:,'ratio_tm'].max()

75.483692647871749