## Generating the set of independent variables by year, month and day of the year (YEAR_DAY)

In [1]:
import os
import sys
# Absolute path of the parent of the current working directory; appended to
# sys.path below (presumably so the local `Utils` package imported in a later
# cell can be found — confirm against the repository layout).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:  # skip when already present, e.g. when the cell is re-run
    sys.path.append(module_path)

In [2]:
%matplotlib inline
#%matplotlib qt5
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook `.container` element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

In [3]:
from Utils import TransantiagoConstants

In [4]:
# Base directories taken from the project constants module.
DTPMDir = TransantiagoConstants.DTPMDir  # root used below to build the input file paths
DTPM_TRXDir = TransantiagoConstants.DTPM_TRXDir  # root used below to build the output CSV path

In [5]:
# One row per calendar day of the study period, both bounds included.
start_date = pd.Timestamp('2015-01-01').date()
end_date = pd.Timestamp('2017-12-31').date()
independent_variables = pd.DataFrame(
    {'DATE': pd.date_range(start=start_date, end=end_date, freq='D')}
)

In [6]:
# Calendar components of each day; both columns are used as merge keys further down.
date_parts = independent_variables['DATE'].dt
independent_variables['YEAR'] = date_parts.year
independent_variables['MONTH'] = date_parts.month

In [7]:
def year_day_calc(x):
    """Return the 1-based day of the year of the date stored at position 0 of a row.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence whose element 0 offers ``.date()`` (e.g. a
        ``pandas.Timestamp``). The year is read from that date itself, so the
        year stored at position 1 is not consulted.

    Returns
    -------
    int
        1 for January 1st, up to 365 (366 in leap years) for December 31st.

    Notes
    -----
    Later cells of this notebook redefine ``year_day_calc`` with the date at a
    different position; the definition executed last is the one in effect.
    """
    # Explicit positional access for pandas rows: `row[0]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    day = fields[0].date()
    # Measure from January 1st of the date's own year. The previous hard-coded
    # 2015/2016/2017 ladder treated every other year as 2017.
    return (day - dt.date(year=day.year, month=1, day=1)).days + 1

In [8]:
def saturday(x):
    """Return 1 when ``x.weekday()`` is 5 (Saturday), otherwise 0."""
    return int(x.weekday() == 5)

In [9]:
def sunday(x):
    """Return 1 when ``x.weekday()`` is 6 (Sunday), otherwise 0."""
    return int(x.weekday() == 6)

In [10]:
# Derive the calendar features directly from DATE with the vectorised `.dt`
# accessor. The cell no longer calls `year_day_calc`, a name that later cells
# redefine with the date at another position, so re-running this cell gives
# the same result regardless of which cells were executed before it.
date_column = independent_variables['DATE']
independent_variables['YEAR_DAY'] = date_column.dt.dayofyear.astype('int64')  # 1 = January 1st
independent_variables['SATURDAY'] = (date_column.dt.dayofweek == 5).astype('int64')  # Monday is 0
independent_variables['SUNDAY'] = (date_column.dt.dayofweek == 6).astype('int64')

###### Appending fare. Test based on dataset length is <font color='green'> passed </font>

In [11]:
# Load the fares workbook. The next cell reads its 'Mes' column through `.dt`,
# so that column must come out of read_excel as datetimes.
fares_path = os.path.join(DTPMDir,'08_Tarifas/Tarifas_2007_2017.xlsx')
fares_df = pd.read_excel(fares_path) # no explicit date parsing is done here

In [12]:
# Merge keys for the fares table, read off the 'Mes' datetime column.
mes_parts = fares_df['Mes'].dt
fares_df['YEAR'] = mes_parts.year
fares_df['MONTH'] = mes_parts.month

In [13]:
# Row count before the first merge, for comparison with the count printed after it.
print('Length of independent_variables before 1st-merge is: {}'.format(len(independent_variables)))

Length of independent_variables before 1st-merge is: 1096


In [14]:
# Attach the fare row of the matching (YEAR, MONTH) to every day.
# `validate` makes pandas raise a MergeError when the fares table holds more
# than one row for a month, instead of silently duplicating days.
independent_variables = independent_variables.merge(
    fares_df, how='left', on=['YEAR', 'MONTH'], validate='many_to_one'
)

In [15]:
# Row count after the first merge; it should equal the count printed before it.
print('Length of independent_variables after 1st-merge is: {}'.format(len(independent_variables)))

Length of independent_variables after 1st-merge is: 1096


##### Appending kms_comerciales validamente ofertados at system level. Test based on dataset length is <font color='green'> passed </font>

In [16]:
kms_path = os.path.join(DTPMDir,'06_LBS/1_consolidados/kms_recorridos.txt')
# Semicolon-separated text file; the first file column becomes the index.
# 'Fecha' and 'm_ofertados' are parsed in the following cells.
kms_df = pd.read_csv(kms_path, sep=';', encoding='latin-1', index_col=0)

In [17]:
# Parse the day/month/year strings of 'Fecha' into datetimes in one vectorised call.
kms_df['Fecha'] = pd.to_datetime(kms_df['Fecha'], format='%d/%m/%Y')

In [18]:
def year_day_calc(x):
    """Return the 1-based day of the year of the date stored at position 1 of a row.

    This definition replaces the earlier ``year_day_calc`` of this notebook,
    which read the date from position 0.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence whose element 1 offers ``.date()`` (e.g. a
        ``pandas.Timestamp``).

    Returns
    -------
    int
        1 for January 1st, up to 365 (366 in leap years) for December 31st.
    """
    # Explicit positional access for pandas rows: `row[1]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    day = fields[1].date()
    # Measure from January 1st of the date's own year. The previous hard-coded
    # 2015/2016/2017 ladder treated every other year as 2017.
    return (day - dt.date(year=day.year, month=1, day=1)).days + 1

In [19]:
# YEAR_DAY comes from the row-wise helper applied to the frame as it stands;
# MONTH and YEAR are read off 'Fecha'.
year_day_values = kms_df.apply(year_day_calc, axis=1)
fecha_parts = kms_df['Fecha'].dt
kms_df['YEAR_DAY'] = year_day_values
kms_df['MONTH'] = fecha_parts.month
kms_df['YEAR'] = fecha_parts.year

In [20]:
# 'm_ofertados' holds text with a decimal comma: swap it for a dot and convert to float.
kms_df['m_ofertados'] = kms_df['m_ofertados'].map(lambda raw: float(raw.replace(',', '.')))

In [21]:
# Total 'm_ofertados' per day, keeping the three date keys as ordinary columns.
grouped_kms_df = kms_df.groupby(['YEAR', 'MONTH', 'YEAR_DAY'], as_index=False)['m_ofertados'].sum()

In [22]:
# Attach the daily system-level total to each day.
# `validate` makes pandas raise a MergeError if either side holds a duplicated
# (YEAR, MONTH, YEAR_DAY) key, instead of silently duplicating days.
independent_variables = independent_variables.merge(
    grouped_kms_df, how='left', on=['YEAR', 'MONTH', 'YEAR_DAY'], validate='one_to_one'
)

In [23]:
# Row count after the second merge; it should still equal the number of days.
print('Length of independent_variables after 2nd-merge is: {}'.format(len(independent_variables)))

Length of independent_variables after 2nd-merge is: 1096


In [24]:
# Scale 'm_ofertados' by 1/1000 (column names suggest metres to kilometres).
independent_variables['kms_ofertados'] = independent_variables['m_ofertados'] / 1000

##### Appending kms_comerciales validamente ofertados at UN level. Test based on dataset length is <font color='green'> passed </font>

In [25]:
# Total 'm_ofertados' per ('Fecha', 'UN') pair, with both keys kept as ordinary columns.
grouped_kms_UN_df = kms_df.groupby(['Fecha', 'UN'], as_index=False)['m_ofertados'].sum()

In [26]:
# Reshape to one row per 'Fecha' with one column per UN value, then prefix the
# U1..U7 columns with 'm_'.
un_column_names = {'U{}'.format(i): 'm_U{}'.format(i) for i in range(1, 8)}
grouped_kms_UN_df = (
    grouped_kms_UN_df
    .pivot(index='Fecha', columns='UN', values='m_ofertados')
    .reset_index()
    .rename(columns=un_column_names)
)

In [27]:
def year_day_calc_2(x):
    """Return the 1-based day of the year of the date stored at position 0 of a row.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence whose element 0 offers ``.date()`` (e.g. a
        ``pandas.Timestamp``).

    Returns
    -------
    int
        1 for January 1st, up to 365 (366 in leap years) for December 31st.
    """
    # Explicit positional access for pandas rows: `row[0]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    day = fields[0].date()
    # Measure from January 1st of the date's own year. The previous hard-coded
    # 2015/2016/2017 ladder treated every other year as 2017.
    return (day - dt.date(year=day.year, month=1, day=1)).days + 1

In [28]:
# Build the three merge keys: YEAR_DAY through the row-wise helper, MONTH and
# YEAR straight from 'Fecha'.
un_year_day_values = grouped_kms_UN_df.apply(year_day_calc_2, axis=1)
un_fecha_parts = grouped_kms_UN_df['Fecha'].dt
grouped_kms_UN_df['YEAR_DAY'] = un_year_day_values
grouped_kms_UN_df['MONTH'] = un_fecha_parts.month
grouped_kms_UN_df['YEAR'] = un_fecha_parts.year

In [29]:
# Attach the per-UN daily totals to each day; 'Fecha' is dropped because the
# join runs on the three derived date keys.
# `validate` makes pandas raise a MergeError if either side holds a duplicated
# key, instead of silently duplicating days.
independent_variables = independent_variables.merge(
    grouped_kms_UN_df.drop('Fecha', axis=1),
    how='left',
    on=['YEAR', 'MONTH', 'YEAR_DAY'],
    validate='one_to_one',
)

In [30]:
# Row count after the third merge; it should still equal the number of days.
print('Length of independent_variables after 3rd-merge is: {}'.format(len(independent_variables)))

Length of independent_variables after 3rd-merge is: 1096


In [31]:
# Scale every per-UN 'm_U*' column by 1/1000 into its 'kms_ofertados_U*' counterpart.
for un_label in ('U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7'):
    independent_variables['kms_ofertados_' + un_label] = independent_variables['m_' + un_label] / 1000

###### Creating dummy variables for Enero, Febrero and Julio => Estival and Invierno

In [32]:
# 0/1 indicator columns for January, February and July.
for dummy_name, month_number in (('Enero', 1), ('Febrero', 2), ('Julio', 7)):
    independent_variables[dummy_name] = (independent_variables['MONTH'] == month_number).astype('int64')

###### Creating dummy variables for Noviembre and Diciembre 2017, since implementation of L6 was made during these months.

In [33]:
def dummy_noviembre_2017(x):
    """Return 1 when the row belongs to November 2017, otherwise 0.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence holding the year at position 1 and the month at position 2.
    """
    # Explicit positional access for pandas rows: `row[1]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    return 1 if (fields[1] == 2017 and fields[2] == 11) else 0

def dummy_diciembre_2017(x):
    """Return 1 when the row belongs to December 2017, otherwise 0.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence holding the year at position 1 and the month at position 2.
    """
    # Explicit positional access for pandas rows: `row[1]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    return 1 if (fields[1] == 2017 and fields[2] == 12) else 0

# Evaluate each row-wise dummy on the frame and store it under its column name.
for dummy_column, dummy_function in (('Nov_2017', dummy_noviembre_2017),
                                     ('Dic_2017', dummy_diciembre_2017)):
    independent_variables[dummy_column] = independent_variables.apply(dummy_function, axis=1)

###### Creating temporal variable

In [34]:
def temporal_variable(x):
    """Return the running day counter of a row, where 2015-01-01 is day 1.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence holding the year at position 1 and the 1-based day of
        the year at position 3.

    Returns
    -------
    Number of days from 2015-01-01 to January 1st of the row's year, plus the
    row's day of the year. For 2015, 2016 and 2017 this equals the former
    hard-coded offsets 0, 365 and 365+366.
    """
    # Explicit positional access for pandas rows: `row[1]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    year_start = dt.date(year=int(fields[1]), month=1, day=1)
    # Computed from the calendar so years after 2017 keep counting forward
    # instead of reusing the 2017 offset.
    days_before_year = (year_start - dt.date(year=2015, month=1, day=1)).days
    return days_before_year + fields[3]

In [35]:
# Append the running day counter `t`, computed row by row on the current frame.
independent_variables = independent_variables.assign(
    t=lambda frame: frame.apply(temporal_variable, axis=1)
)

###### Creating dummy variables per type of special days. Test based on dataset length is <font color='green'> passed </font>

In [36]:
# Load the special-days summary workbook. Later cells group it by 'Fecha' and
# read 'Fecha' through `.dt`, so that column must load as datetimes.
DES_path = os.path.join(DTPMDir,'07_DES/resumen_des.xlsx')
DES_df = pd.read_excel(DES_path) # no explicit date parsing is done here

In [37]:
def grouping_DES(x):
    """Collapse the values of one group into a single flag capped at 1.

    The values are summed with ``skipna=False``, so a NaN in the group makes
    the sum NaN. A sum greater than 1 is returned as 1; any other sum
    (including NaN) is returned unchanged.
    """
    total = x.sum(skipna=False)
    return 1 if total > 1 else total

In [38]:
# Collapse DES_df to one row per 'Fecha'; every other column is reduced with grouping_DES.
# NOTE(review): the preview recorded below shows only numeric flag columns. If DES_df also
# carries text columns, check how the installed pandas version treats columns on which the
# aggregation function fails (older releases dropped them, newer ones raise).
grouped_DES_df = DES_df.groupby(['Fecha']).agg(grouping_DES).reset_index()

In [39]:
# Preview the first aggregated rows.
grouped_DES_df.head()

Unnamed: 0,Fecha,Feriado_laboral,Feriado_no_laboral,Elecciones,Censo,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
0,2015-01-01,1,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
1,2015-01-02,1,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
2,2015-01-03,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
3,2015-01-04,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
4,2015-01-14,0,0,0,0,0,0,0,1.0,0.0,0.0,1.0,0,0


In [40]:
def year_day_calc(x):
    """Return the 1-based day of the year of the date stored at position 0 of a row.

    This definition replaces the ``year_day_calc`` defined in earlier cells of
    this notebook.

    Parameters
    ----------
    x : row-like
        A DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``) or any
        plain sequence whose element 0 offers ``.date()`` (e.g. a
        ``pandas.Timestamp``).

    Returns
    -------
    int
        1 for January 1st, up to 365 (366 in leap years) for December 31st.
    """
    # Explicit positional access for pandas rows: `row[0]` on a Series with
    # string labels relies on a deprecated positional fallback.
    fields = getattr(x, 'iloc', x)
    day = fields[0].date()
    # Measure from January 1st of the date's own year. The previous hard-coded
    # 2015/2016/2017 ladder treated every other year as 2017.
    return (day - dt.date(year=day.year, month=1, day=1)).days + 1

In [41]:
# Build the three merge keys: YEAR_DAY through the row-wise helper, MONTH and
# YEAR straight from 'Fecha'.
des_year_day_values = grouped_DES_df.apply(year_day_calc, axis=1)
des_fecha_parts = grouped_DES_df['Fecha'].dt
grouped_DES_df['YEAR_DAY'] = des_year_day_values
grouped_DES_df['MONTH'] = des_fecha_parts.month
grouped_DES_df['YEAR'] = des_fecha_parts.year

In [42]:
# Attach the special-day flags to each day; 'Fecha' is dropped because the
# join runs on the three derived date keys.
# `validate` makes pandas raise a MergeError if either side holds a duplicated
# key, instead of silently duplicating days.
independent_variables = independent_variables.merge(
    grouped_DES_df.drop('Fecha', axis=1),
    how='left',
    on=['YEAR', 'MONTH', 'YEAR_DAY'],
    validate='one_to_one',
)

In [43]:
# This is the fourth merge into independent_variables (fares, system kms,
# per-UN kms, special days); the message previously repeated "3rd-merge".
print('Length of independent_variables after 4th-merge is: ' + str(len(independent_variables.index)))

Length of independent_variables after 3rd-merge is: 1096


In [44]:
# Replace every remaining NaN in the frame with 0. The merges above are left joins, so a day
# without a matching row in a merged table carries NaN in that table's columns.
# NOTE(review): this also zero-fills non-dummy columns (e.g. fares and offered kms) — confirm
# that 0 is the intended value there.
independent_variables = independent_variables.fillna(0) # NaNs are assumed to mean 0

##### Creating metro-kms, metro-estaciones and escalon as independent variables

In [45]:
# Cut-off date for the metro step variables built in the next cell.
extension_metro = dt.date(2017, 11, 3)

In [46]:
# Compare against a Timestamp: ordering a datetime64 column against a plain
# `datetime.date` raises TypeError on current pandas. `pd.Timestamp(...)`
# accepts both a date and a Timestamp, so the cut-off may be either.
before_extension = independent_variables['DATE'] < pd.Timestamp(extension_metro)

# Step variables: first value before the cut-off date, second from that date onwards.
independent_variables['kms_metro'] = np.where(before_extension, 103, 118)
independent_variables['estaciones_metro'] = np.where(before_extension, 108, 118)
independent_variables['escalon_metro'] = np.where(before_extension, 0, 1)

##### Computing number of ZPs per day - system

In [47]:
# Load the paid-zones (ZP) summary. Its 'Inicio de Operación' and 'Fin de Operación'
# columns are converted to dates in the following cells.
ZP_path = os.path.join(DTPMDir,'05_ZP/2_Modificado/zonas_pagas_resumen_UN.csv')
ZP_df = pd.read_csv(ZP_path, encoding='latin-1') # date columns are still text at this point

In [48]:
# Convert the day/month/year strings of the start column into `datetime.date` objects.
ZP_df.loc[:, 'Inicio de Operación'] = pd.to_datetime(
    ZP_df['Inicio de Operación'], format='%d/%m/%Y'
).dt.date

In [49]:
# Zones marked 'Activa' have no end date: blank them out so the date parsing in
# the next cell yields a missing value. `np.nan` is the canonical spelling; the
# `np.NaN` alias was removed in NumPy 2.0.
ZP_df.loc[ZP_df['Fin de Operación'] == 'Activa', 'Fin de Operación'] = np.nan

In [50]:
# Convert the day/month/year strings of the end column into `datetime.date`
# objects; missing entries become NaT.
ZP_df.loc[:, 'Fin de Operación'] = pd.to_datetime(
    ZP_df['Fin de Operación'], format='%d/%m/%Y'
).dt.date

In [51]:
def countingZps(x):
    """Count the paid zones (ZPs) in operation on day ``x`` for that day's schedule type.

    Reads the module-level frames ``independent_variables`` (day flags) and
    ``ZP_df`` (one row per paid zone).

    Parameters
    ----------
    x : pandas.Timestamp
        A value of ``independent_variables['DATE']``.

    Returns
    -------
    int
        Zones flagged for the day's schedule whose start date is strictly
        before ``x``, minus those whose end date is strictly before ``x``.

    Raises
    ------
    ValueError
        From ``Series.item()`` when ``x`` does not match exactly one row of
        ``independent_variables``.
    """
    # Look the day up once instead of scanning the frame for every flag.
    day_row = independent_variables.loc[independent_variables['DATE'] == x]
    sabado = day_row['SATURDAY'].item()
    domingo = day_row['SUNDAY'].item()
    feriado_laboral = day_row['Feriado_laboral'].item()

    # Saturdays use the Saturday flag, Sundays and weekday holidays use the
    # Sunday flag, and every other day uses the working-day flag.
    if sabado == 1:
        schedule_flag = 'SI_SABADO'
    elif domingo == 1:
        schedule_flag = 'SI_DOMINGO'
    elif feriado_laboral == 0:
        schedule_flag = 'SI_LABORAL'
    else:
        schedule_flag = 'SI_DOMINGO'

    day = x.date()
    scheduled = ZP_df[schedule_flag] == 1
    # NOTE(review): both comparisons are strict, so a zone is first counted the
    # day after its start date and is still counted on its end date — confirm.
    opened = int((scheduled & (ZP_df['Inicio de Operación'] < day)).sum())
    closed = int((scheduled & (ZP_df['Fin de Operación'] < day)).sum())

    return opened - closed

In [52]:
# Number of paid zones in operation on each day, evaluated date by date.
independent_variables['N_ZPs'] = independent_variables['DATE'].map(countingZps)

In [53]:
def countingZPsByUN(x, UN):
    """Count the paid zones (ZPs) of one UN in operation on day ``x``.

    Reads the module-level frames ``independent_variables`` (day flags) and
    ``ZP_df`` (one row per paid zone).

    Parameters
    ----------
    x : pandas.Timestamp
        A value of ``independent_variables['DATE']``.
    UN : str
        UN code (the notebook passes 'U1' to 'U7'). A zone belongs to the UN
        when the code appears in 'UN Principal' or in any of
        'UN Secundaria 1' to 'UN Secundaria 5'.

    Returns
    -------
    int
        Zones of the UN flagged for the day's schedule whose start date is
        strictly before ``x``, minus those whose end date is strictly before ``x``.

    Raises
    ------
    ValueError
        From ``Series.item()`` when ``x`` does not match exactly one row of
        ``independent_variables``.
    """
    # Look the day up once instead of scanning the frame for every flag.
    day_row = independent_variables.loc[independent_variables['DATE'] == x]
    sabado = day_row['SATURDAY'].item()
    domingo = day_row['SUNDAY'].item()
    feriado_laboral = day_row['Feriado_laboral'].item()

    # Saturdays use the Saturday flag, Sundays and weekday holidays use the
    # Sunday flag, and every other day uses the working-day flag.
    if sabado == 1:
        schedule_flag = 'SI_SABADO'
    elif domingo == 1:
        schedule_flag = 'SI_DOMINGO'
    elif feriado_laboral == 0:
        schedule_flag = 'SI_LABORAL'
    else:
        schedule_flag = 'SI_DOMINGO'

    # One membership mask replaces the six-column OR that was repeated in every branch.
    un_columns = ['UN Principal'] + ['UN Secundaria {}'.format(i) for i in range(1, 6)]
    serves_un = ZP_df[un_columns].eq(UN).any(axis=1)

    day = x.date()
    candidates = (ZP_df[schedule_flag] == 1) & serves_un
    # NOTE(review): both comparisons are strict, so a zone is first counted the
    # day after its start date and is still counted on its end date — confirm.
    opened = int((candidates & (ZP_df['Inicio de Operación'] < day)).sum())
    closed = int((candidates & (ZP_df['Fin de Operación'] < day)).sum())

    return opened - closed

In [54]:
# Per-UN count of paid zones in operation on each day, for U1 to U7.
for un_number in range(1, 8):
    un_code = 'U{}'.format(un_number)
    independent_variables.loc[:, 'N_ZPs_UN{}'.format(un_number)] = (
        independent_variables.loc[:, 'DATE'].apply(lambda day, un=un_code: countingZPsByUN(day, un))
    )

* Plotting number of zps by time. Not always needed

In [55]:
#import plotly.plotly
#import plotly.graph_objs as go

In [56]:
#days_values = independent_variables.loc[:,'DATE']
#zps_values = independent_variables.loc[:,'N_ZPs']

In [57]:
#trace1 = go.Scatter(x=days_values, y=zps_values, name='Total ZPs', marker=dict(color='rgb(76, 181, 245)'),opacity=1)
#data = [trace1]

#layout = go.Layout(title='Evolución total zonas pagas en el tiempo',  yaxis=dict(title='Total ZPs'))

#fig = go.Figure(data=data, layout=layout)
#plotly.offline.plot(fig, filename='ZPs.html')

##### Printing everything to a file

In [58]:
# Destination of the exported table, under the DTPM_TRXDir root.
independent_variables_output_path = os.path.join(DTPM_TRXDir, '0_INDEPENDENTS/independents_variables.csv')

In [59]:
# Write the full table as a semicolon-separated, latin-1 encoded CSV (the row index is written too).
independent_variables.to_csv(independent_variables_output_path,sep=';',encoding='latin-1')

# Closed