## Time series estimation - Daily at UN level

**TODO:** Check that the R-squared values are computed correctly.

In [1]:
import os
import sys
# Add the notebook's parent directory to the import search path, once.
# Presumably this is what lets the later `from Utils import ...` resolve -- confirm.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

In [3]:
import statsmodels.api as sm

  from pandas.core import datetools


In [4]:
from Utils import TransantiagoConstants

In [5]:
# Base directories read from the project's TransantiagoConstants.
# DTPM_TRXDir is the root that the CSV paths in the loading cells below are joined onto.
DTPMDir = TransantiagoConstants.DTPMDir
DTPM_TRXDir = TransantiagoConstants.DTPM_TRXDir

#### Loading independent variables

In [6]:
# Read the explanatory-variable table: ';'-separated, latin-1 encoded, first
# file column used as the index and column position 1 parsed as dates
# (presumably the DATE column used below -- confirm against the CSV).
independent_variables_path = os.path.join(DTPM_TRXDir,'0_INDEPENDENTS/independents_variables.csv')
independent_variables = pd.read_csv(
    independent_variables_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
    parse_dates=[1],
)

# Combined columns built as the sum of two existing columns each.
independent_variables['Verano'] = independent_variables['Enero'] + independent_variables['Febrero']
independent_variables['Nov_Dic_2017'] = independent_variables['Nov_2017'] + independent_variables['Dic_2017']

# Week number of every DATE value, then one dummy column per distinct week
# (named 'WEEK_OF_YEAR_<n>' by get_dummies).
independent_variables['WEEK_OF_YEAR'] = independent_variables['DATE'].apply(lambda date: date.week)
independent_variables = pd.get_dummies(independent_variables, columns=['WEEK_OF_YEAR'])

#### Loading dependent variable for U6

In [7]:
# Business unit analysed in this notebook; it selects the input folder and file name.
UN = 'U6'
daily_input_path = os.path.join(DTPM_TRXDir, '4_DAILY_UN/' + UN + '/' + UN + '_summary.csv')
daily_trx = pd.read_csv(daily_input_path, sep=';', encoding='latin-1', index_col=0)

# Dependent variable: row-wise sum of the four transaction-count columns.
# Plain `+` is kept so that a missing value in any component propagates to the total.
daily_trx['TOTAL_trx'] = (
    daily_trx['pn_SUM_TRX_no_t']
    + daily_trx['pn_SUM_TRX_3t']
    + daily_trx['pn_SUM_TRX_tm']
    + daily_trx['zp_SUM_TRX']
)

#### Merging and sorting for U6

In [8]:
# Left join: every row of daily_trx is kept and the independent variables are
# attached by matching on the three calendar key columns.
merge_keys = ['YEAR','MONTH','YEAR_DAY']
complete_db = daily_trx.merge(independent_variables, how='left', on=merge_keys)

In [9]:
# Order the rows chronologically (all keys ascending), modifying complete_db in place.
# The row order matters later: the Durbin-Watson statistic is computed on residuals in this order.
sort_keys = ['YEAR','MONTH','YEAR_DAY']
complete_db.sort_values(by=sort_keys, ascending=[True] * len(sort_keys), inplace=True)

#### Estimation for U6

##### Trying models with U6

In [15]:
def summarizingResults(x, df, model_name):
    """Append one fitted model's coefficients and fit statistics to a summary table.

    NOTE(review): a later cell of this notebook defines another
    ``summarizingResults`` with the signature ``(x, df, inferior, superior)``;
    once that cell has run, it replaces this function.

    Parameters
    ----------
    x : fitted regression results
        Must expose ``params``, ``tvalues`` and ``pvalues`` as pandas Series,
        plus ``rsquared_adj``, ``aic``, ``fvalue``, ``f_pvalue`` and ``resid``.
    df : pandas.DataFrame
        Summary accumulated so far (may be empty). It is not modified.
    model_name : str
        Suffix used to name the three added columns
        ``params_<model_name>``, ``t_<model_name>`` and ``p_<model_name>``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the three new columns added, and with the rows ``R2_ADJ``,
        ``AIC``, ``F``, ``P_F`` and ``D_W`` filled in the ``params_`` column.
    """
    params_col = 'params_' + model_name
    new_columns = [
        x.params.rename(params_col),
        x.tvalues.rename('t_' + model_name),
        x.pvalues.rename('p_' + model_name),
    ]
    df = pd.concat([df] + new_columns, axis=1)

    # Goodness-of-fit figures are stored as extra rows of the coefficient column.
    fit_statistics = [
        ('R2_ADJ', x.rsquared_adj),
        ('AIC', x.aic),
        ('F', x.fvalue),
        ('P_F', x.f_pvalue),
        ('D_W', sm.stats.stattools.durbin_watson(x.resid)),
    ]
    for row_label, value in fit_statistics:
        df.loc[row_label, params_col] = value

    return df

In [16]:
# Empty accumulator; each model cell below passes it to summarizingResults and
# stores the returned, extended table back under this name.
OLS_Summary_U6 = pd.DataFrame()

* U6_M1

In [None]:
# Model U6_m1: OLS of TOTAL_trx on the regressor list below plus a constant.
# The list is assembled from three pieces; their concatenation order is the column order.
m1_leading_terms = ['SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados_U6']
m1_week_dummies = [
    'WEEK_OF_YEAR_52',
    'WEEK_OF_YEAR_53',
    'WEEK_OF_YEAR_1',
    'WEEK_OF_YEAR_2',
    'WEEK_OF_YEAR_3',
    'WEEK_OF_YEAR_4',
    'WEEK_OF_YEAR_5',
    'WEEK_OF_YEAR_6',
    'WEEK_OF_YEAR_7',
    'WEEK_OF_YEAR_8',
    'WEEK_OF_YEAR_9',
]
m1_trailing_terms = [
    'Julio',
    'Nov_Dic_2017',
    't',
    'Feriado_laboral',
    'Feriado_no_laboral',
    'Censo',
    'Elecciones',
    'Partido',
    'FDS_Largo',
    'Disturbios',
    'Bucle',
    'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
m1_regressors = m1_leading_terms + m1_week_dummies + m1_trailing_terms

Y = complete_db.loc[:, 'TOTAL_trx']
X = sm.add_constant(complete_db.loc[:, m1_regressors])

m1 = sm.OLS(Y, X)
results_m1 = m1.fit()
print(results_m1.summary())

# Record coefficients, t/p values and fit statistics under the 'U6_m1' label.
OLS_Summary_U6 = summarizingResults(results_m1, OLS_Summary_U6, 'U6_m1')

# In-sample predictions (same design matrix as the fit), stored next to the data.
ypred_m1 = results_m1.predict(X)
complete_db.loc[:, 'ypred_m1'] = ypred_m1

* U6_M2

In [None]:
# Model U6_m2: OLS of TOTAL_trx on the regressor list below plus a constant.
# Compared with the U6_m1 list, 'Nov_Dic_2017', 'Elecciones' and 'Bucle' are left out.
m2_leading_terms = ['SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados_U6']
m2_week_dummies = [
    'WEEK_OF_YEAR_52',
    'WEEK_OF_YEAR_53',
    'WEEK_OF_YEAR_1',
    'WEEK_OF_YEAR_2',
    'WEEK_OF_YEAR_3',
    'WEEK_OF_YEAR_4',
    'WEEK_OF_YEAR_5',
    'WEEK_OF_YEAR_6',
    'WEEK_OF_YEAR_7',
    'WEEK_OF_YEAR_8',
    'WEEK_OF_YEAR_9',
]
m2_trailing_terms = [
    'Julio',
    't',
    'Feriado_laboral',
    'Feriado_no_laboral',
    'Censo',
    'Partido',
    'FDS_Largo',
    'Disturbios',
    'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
m2_regressors = m2_leading_terms + m2_week_dummies + m2_trailing_terms

Y = complete_db.loc[:, 'TOTAL_trx']
X = sm.add_constant(complete_db.loc[:, m2_regressors])

m2 = sm.OLS(Y, X)
results_m2 = m2.fit()
print(results_m2.summary())

# Record coefficients, t/p values and fit statistics under the 'U6_m2' label.
OLS_Summary_U6 = summarizingResults(results_m2, OLS_Summary_U6, 'U6_m2')

# In-sample predictions (same design matrix as the fit), stored next to the data.
ypred_m2 = results_m2.predict(X)
complete_db.loc[:, 'ypred_m2'] = ypred_m2

* U6_M3

In [None]:
# Model U6_m3: OLS of TOTAL_trx on the regressor list below plus a constant.
# Compared with the U6_m2 list, 'Partido' is additionally left out.
m3_leading_terms = ['SATURDAY', 'SUNDAY', 'ratio_tm', 'kms_ofertados_U6']
m3_week_dummies = [
    'WEEK_OF_YEAR_52',
    'WEEK_OF_YEAR_53',
    'WEEK_OF_YEAR_1',
    'WEEK_OF_YEAR_2',
    'WEEK_OF_YEAR_3',
    'WEEK_OF_YEAR_4',
    'WEEK_OF_YEAR_5',
    'WEEK_OF_YEAR_6',
    'WEEK_OF_YEAR_7',
    'WEEK_OF_YEAR_8',
    'WEEK_OF_YEAR_9',
]
m3_trailing_terms = [
    'Julio',
    't',
    'Feriado_laboral',
    'Feriado_no_laboral',
    'Censo',
    'FDS_Largo',
    'Disturbios',
    'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
m3_regressors = m3_leading_terms + m3_week_dummies + m3_trailing_terms

Y = complete_db.loc[:, 'TOTAL_trx']
X = sm.add_constant(complete_db.loc[:, m3_regressors])

m3 = sm.OLS(Y, X)
results_m3 = m3.fit()
print(results_m3.summary())

# Record coefficients, t/p values and fit statistics under the 'U6_m3' label.
OLS_Summary_U6 = summarizingResults(results_m3, OLS_Summary_U6, 'U6_m3')

# In-sample predictions (same design matrix as the fit), stored next to the data.
ypred_m3 = results_m3.predict(X)
complete_db.loc[:, 'ypred_m3'] = ypred_m3

* U6_M4

In [None]:
# Squared ratio_tm, used as the quadratic regressor of model U6_m4.
ratio_tm_values = complete_db.loc[:, 'ratio_tm']
complete_db.loc[:, 'ratio_tm_2'] = ratio_tm_values * ratio_tm_values

In [None]:
# Model U6_m4: OLS of TOTAL_trx on the regressor list below plus a constant.
# Same list as U6_m3 with 'ratio_tm_2' inserted right after 'ratio_tm'.
m4_leading_terms = ['SATURDAY', 'SUNDAY', 'ratio_tm', 'ratio_tm_2', 'kms_ofertados_U6']
m4_week_dummies = [
    'WEEK_OF_YEAR_52',
    'WEEK_OF_YEAR_53',
    'WEEK_OF_YEAR_1',
    'WEEK_OF_YEAR_2',
    'WEEK_OF_YEAR_3',
    'WEEK_OF_YEAR_4',
    'WEEK_OF_YEAR_5',
    'WEEK_OF_YEAR_6',
    'WEEK_OF_YEAR_7',
    'WEEK_OF_YEAR_8',
    'WEEK_OF_YEAR_9',
]
m4_trailing_terms = [
    'Julio',
    't',
    'Feriado_laboral',
    'Feriado_no_laboral',
    'Censo',
    'FDS_Largo',
    'Disturbios',
    'Clima',
    'visperas_laborales',
    'N_ZPs_UN6',
]
m4_regressors = m4_leading_terms + m4_week_dummies + m4_trailing_terms

Y = complete_db.loc[:, 'TOTAL_trx']
X = sm.add_constant(complete_db.loc[:, m4_regressors])

m4 = sm.OLS(Y, X)
results_m4 = m4.fit()
print(results_m4.summary())

# Record coefficients, t/p values and fit statistics under the 'U6_m4' label.
OLS_Summary_U6 = summarizingResults(results_m4, OLS_Summary_U6, 'U6_m4')

# In-sample predictions (same design matrix as the fit), stored next to the data.
ypred_m4 = results_m4.predict(X)
complete_db.loc[:, 'ypred_m4'] = ypred_m4

##### Trying models with `ratio_tm` threshold dummies for U6

* Summarizing thresholds

In [10]:
def summarizingResults(x, df, inferior, superior):
    """Append one threshold combination's coefficients, t-values and p-values as rows.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name but the signature ``(x, df, model_name)``; running this cell
    replaces it.

    Parameters
    ----------
    x : fitted regression results
        Must expose ``params``, ``tvalues`` and ``pvalues`` as pandas Series.
    df : pandas.DataFrame
        Summary accumulated so far (may be empty). It is not modified.
    inferior, superior
        Lower and upper thresholds; their ``str()`` forms, joined by a comma,
        label the three new rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by three rows named ``params_<inf>,<sup>``,
        ``t_<inf>,<sup>`` and ``p_<inf>,<sup>``, one column per regressor.
    """
    threshold_tag = str(inferior) + ',' + str(superior)
    labelled_series = [
        x.params.rename('params_' + threshold_tag),
        x.tvalues.rename('t_' + threshold_tag),
        x.pvalues.rename('p_' + threshold_tag),
    ]

    # Side by side the Series form columns; transposing turns each into one row.
    new_rows = pd.concat(labelled_series, axis=1).T

    return pd.concat([df, new_rows], axis=0)

In [11]:
# Empty accumulator; the threshold loop below extends it by three rows
# (params, t, p) for every (inferior, superior) combination.
umbrales_Summary_U6 = pd.DataFrame()

In [12]:
# Candidate thresholds compared against ratio_tm in the grid loop below:
# lower bounds 10..20 and upper bounds 40..60 (range() excludes its end value).
inferiores = range(10,21)
superiores = range(40,61)

In [13]:
# Regressor list shared by every threshold combination. It never changes inside
# the loop, so it is built once here.
umbral_regressors = (
    ['SATURDAY', 'SUNDAY', 'ratio_tm', 'r_tm_x_rtm_medio', 'r_tm_x_rtm_superior', 'kms_ofertados_U6']
    + [
        'WEEK_OF_YEAR_52',
        'WEEK_OF_YEAR_53',
        'WEEK_OF_YEAR_1',
        'WEEK_OF_YEAR_2',
        'WEEK_OF_YEAR_3',
        'WEEK_OF_YEAR_4',
        'WEEK_OF_YEAR_5',
        'WEEK_OF_YEAR_6',
        'WEEK_OF_YEAR_7',
        'WEEK_OF_YEAR_8',
        'WEEK_OF_YEAR_9',
    ]
    + [
        'Julio',
        't',
        'Feriado_laboral',
        'Feriado_no_laboral',
        'Censo',
        'FDS_Largo',
        'Disturbios',
        'Clima',
        'visperas_laborales',
        'N_ZPs_UN6',
    ]
)

# Grid search: one OLS fit per (inferior, superior) pair of ratio_tm thresholds.
for inferior in inferiores:
    for superior in superiores:

        # 0/1 indicators: middle band is inferior <= ratio_tm < superior,
        # upper band is ratio_tm >= superior.
        in_middle_band = (inferior <= complete_db.loc[:, 'ratio_tm']) & (complete_db.loc[:, 'ratio_tm'] < superior)
        in_upper_band = superior <= complete_db.loc[:, 'ratio_tm']
        complete_db.loc[:, 'rtm_medio'] = np.where(in_middle_band, 1, 0)
        complete_db.loc[:, 'rtm_superior'] = np.where(in_upper_band, 1, 0)

        # Interactions of ratio_tm with each band indicator; these columns are
        # overwritten on every iteration.
        complete_db.loc[:, 'r_tm_x_rtm_medio'] = complete_db.loc[:, 'ratio_tm'] * complete_db.loc[:, 'rtm_medio']
        complete_db.loc[:, 'r_tm_x_rtm_superior'] = complete_db.loc[:, 'ratio_tm'] * complete_db.loc[:, 'rtm_superior']

        Y = complete_db.loc[:, 'TOTAL_trx']
        X = sm.add_constant(complete_db.loc[:, umbral_regressors])

        m = sm.OLS(Y, X)
        results = m.fit()

        # Append this combination's params / t / p rows to the running summary.
        umbrales_Summary_U6 = summarizingResults(results, umbrales_Summary_U6, inferior, superior)

In [14]:
# Export of the threshold summary to CSV, currently disabled (both lines are commented out).
#umbrales_Summary_U6_path = os.path.join(DTPM_TRXDir,'5_RESULTS/2_BY_UN/0_original/umbrales_Summary_U6.csv')
#umbrales_Summary_U6.to_csv(umbrales_Summary_U6_path, sep=';',encoding='latin-1')

--------------------