# Time series estimation at system level

1. Import datasets
2. Unit root testing
3. Estimation and saving results
4. Summarizing and printing results to files

In [1]:
import os
import sys

# Put the parent directory on sys.path (presumably where the project-local
# `Utils` package lives) so the import below resolves when run from this folder.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import statsmodels.api as sm
import math

from Utils import TransantiagoConstants
# Use the public `IPython.display` module: importing `display` from the
# internal `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Stretch the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

  from pandas.core import datetools


In [2]:
# Local aliases for the directories defined in the project constants module.
# DTPM_TRXDir is the base directory the input CSV paths below are joined onto.
DTPMDir = TransantiagoConstants.DTPMDir
DTPM_TRXDir = TransantiagoConstants.DTPM_TRXDir

* Importing datasets

In [3]:
# Read the daily summary CSV (semicolon-separated, latin-1 encoded),
# using the file's first column as the index.
daily_input_path = os.path.join(DTPM_TRXDir, '3_DAILY/daily_summary.csv')
daily_trx = pd.read_csv(
    daily_input_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
)

In [4]:
# TOTAL_trx is the row-wise sum of the four transaction-count columns.
daily_trx.loc[:, 'TOTAL_trx'] = (
    daily_trx['pn_SUM_TRX_no_t']
    + daily_trx['pn_SUM_TRX_3t']
    + daily_trx['pn_SUM_TRX_tm']
    + daily_trx['zp_SUM_TRX']
)

In [5]:
# Read the explanatory-variables CSV (semicolon-separated, latin-1 encoded).
# The first file column becomes the index and the second is parsed as dates.
independent_variables_path = os.path.join(DTPM_TRXDir, '0_INDEPENDENTS/independents_variables.csv')
independent_variables = pd.read_csv(
    independent_variables_path,
    sep=';',
    encoding='latin-1',
    index_col=0,
    parse_dates=[1],
)

In [6]:
# Combined indicators: 'Verano' adds the Enero and Febrero columns,
# 'Nov_Dic_2017' adds the Nov_2017 and Dic_2017 columns.
independent_variables.loc[:, 'Verano'] = (
    independent_variables['Enero'] + independent_variables['Febrero']
)
independent_variables.loc[:, 'Nov_Dic_2017'] = (
    independent_variables['Nov_2017'] + independent_variables['Dic_2017']
)

# Timestamp.week is the ISO week number; it is then expanded into one
# dummy column per week (WEEK_OF_YEAR_<n>).
independent_variables.loc[:, 'WEEK_OF_YEAR'] = (
    independent_variables['DATE'].apply(lambda timestamp: timestamp.week)
)
independent_variables = pd.get_dummies(
    independent_variables,
    columns=['WEEK_OF_YEAR'],
)

In [7]:
# Left join: keep every daily-transactions row and attach the explanatory
# variables that share the same (YEAR, MONTH, YEAR_DAY) key.
complete_db = pd.merge(
    daily_trx,
    independent_variables,
    how='left',
    on=['YEAR', 'MONTH', 'YEAR_DAY'],
)

In [8]:
# Order the rows by YEAR, MONTH, YEAR_DAY (all ascending); the index labels are kept.
complete_db = complete_db.sort_values(
    by=['YEAR', 'MONTH', 'YEAR_DAY'],
    ascending=True,
)

In [9]:
# Preview the first five rows of the merged, sorted frame.
complete_db.head(n=5)

Unnamed: 0,YEAR,MONTH,YEAR_DAY,pn_SUM_TRX_no_t,pn_SUM_EXP_no_t,ratio_no_t,pn_SUM_TRX_3t,pn_SUM_EXP_3t,ratio_3t,pn_SUM_TRX_tm,...,WEEK_OF_YEAR_44,WEEK_OF_YEAR_45,WEEK_OF_YEAR_46,WEEK_OF_YEAR_47,WEEK_OF_YEAR_48,WEEK_OF_YEAR_49,WEEK_OF_YEAR_50,WEEK_OF_YEAR_51,WEEK_OF_YEAR_52,WEEK_OF_YEAR_53
0,2015,1,1,514892,30226,82.109095,116766,6586,17.890905,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,1,2,1604441,55015,80.459518,326616,13361,19.540482,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,1,3,1180428,45466,82.639911,247882,9551,17.360089,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2015,1,4,837263,36882,82.830642,172004,7645,17.169358,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,1,5,2211510,55316,81.14896,450375,12850,18.85104,0.0,...,0,0,0,0,0,0,0,0,0,0


* Unit root testing

The existence of a unit root in a process is checked via the Augmented Dickey-Fuller (ADF) test. <br>
This test makes use of the equation $\Delta y_t = \alpha + \theta y_{t-1} + \gamma_{1}\Delta y_{t-1} +  ... + \gamma_{p}\Delta y_{t-p} + e_{t}$, where $e_{t}$ is assumed to be a _martingale difference sequence_ (which implies that the model should be dynamically complete), and $\theta = \rho-1$. <br>
The test is <br>
H0 : $\theta = 0$ (i.e. the process has a unit root) <br>
H1 : $\theta < 0$ (i.e. the process has no unit root). <br>
Failing to reject H0 only means that "the data do not provide strong evidence against H0" (Wooldridge, 2013). <br>

H0 is rejected against H1 if $t_{\theta}<c$, where $c$ is the critical value.

In [10]:
from statsmodels.tsa.stattools import adfuller

In [11]:
def applyADFTest(column, maxlag, regression, autolag):
    """Run an Augmented Dickey-Fuller test on a column of ``complete_db`` and print the results.

    Parameters
    ----------
    column :
        Label of the ``complete_db`` column whose values are tested.
    maxlag :
        Forwarded to ``adfuller``. When ``autolag`` is None, exactly ``maxlag``
        lagged differences are included; otherwise ``maxlag`` is the upper
        bound of the automatic lag search.
    regression :
        Forwarded to ``adfuller``; selects the deterministic terms of the test
        regression (e.g. ``'ct'`` for constant and trend).
    autolag :
        Forwarded to ``adfuller``; lag-selection criterion, or None to use
        ``maxlag`` as given.

    Returns
    -------
    None
        The statistic, p-value, number of lags, number of observations and
        critical values are printed, not returned.
    """
    x = complete_db.loc[:, column].values
    result = adfuller(x, maxlag=maxlag, regression=regression, autolag=autolag)
    print('ADF Statistic (t of theta): %f' % result[0])
    print('p-value: %f' % result[1])
    # The number of lags and of observations are counts, so print them as integers.
    print('lags used: %d' % result[2])
    print('obs: %d' % result[3])
    # Order the critical values by significance level (1%, 5%, 10%) instead of
    # relying on dict iteration order.
    critical_values = sorted(result[4].items(), key=lambda item: float(item[0].rstrip('%')))
    for key, value in critical_values:
        print('\t%s: %.3f' % (key, value))


In [12]:
# ADF test with constant and trend ('ct'), zero lagged differences and no automatic lag selection.
applyADFTest(
    column='pn_SUM_EXP_no_t',
    maxlag=0,
    regression='ct',
    autolag=None,
)

ADF Statistic (t of theta): -21.310225
p-value: 0.000000
lags used: 0.000000
obs: 1095.000000
	1%: -3.967
	10%: -3.129
	5%: -3.415


In [13]:
# One-row lag of pn_SUM_EXP_no_t (the first row becomes NaN).
complete_db.loc[:, 'pn_SUM_EXP_no_t - 1'] = complete_db['pn_SUM_EXP_no_t'].shift(1)

In [14]:
# Dependent variable: first difference of pn_SUM_EXP_no_t (current value minus its lag).
Y = complete_db['pn_SUM_EXP_no_t'].sub(complete_db['pn_SUM_EXP_no_t - 1'])

In [15]:
# Remove the missing entries (the first difference is undefined where the lag is NaN).
Y = Y.dropna()

In [16]:
# Regress the first difference on a constant, the lagged level and 't';
# rows with missing regressors are dropped before the constant is added.
X = sm.add_constant(complete_db[['pn_SUM_EXP_no_t - 1', 't']].dropna())

m = sm.OLS(endog=Y, exog=X)
results = m.fit()

results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.294
Model:,OLS,Adj. R-squared:,0.292
Method:,Least Squares,F-statistic:,227.1
Date:,"Tue, 19 Jun 2018",Prob (F-statistic):,3.37e-83
Time:,22:55:48,Log-Likelihood:,-11367.0
No. Observations:,1095,AIC:,22740.0
Df Residuals:,1092,BIC:,22760.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.416e+04,1666.008,20.504,0.000,3.09e+04,3.74e+04
pn_SUM_EXP_no_t - 1,-0.5846,0.027,-21.310,0.000,-0.638,-0.531
t,-10.8337,0.897,-12.073,0.000,-12.594,-9.073

0,1,2,3
Omnibus:,71.888,Durbin-Watson:,1.777
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.742
Skew:,-0.217,Prob(JB):,7.76e-08
Kurtosis:,2.273,Cond. No.,348000.0
