### Times series processing - by day by un

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
#%matplotlib qt5
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

In [3]:
from Utils import TransantiagoConstants

In [4]:
DTPMDir = TransantiagoConstants.DTPMDir
DTPM_TRXDir = TransantiagoConstants.DTPM_TRXDir

In [5]:
from Utils import ReadTurnstilesDataBase
[ana_turnstiles_df, mauricio_turnstiles_df] = ReadTurnstilesDataBase.readTurnstileData()
ana_turnstiles_df = ReadTurnstilesDataBase.processAnaTurnstiles(ana_turnstiles_df)

In [6]:
def saturday(x):
    if(x.weekday()==5):
        return 1
    else:
        return 0

In [7]:
def sunday(x):
    if(x.weekday()==6):
        return 1
    else:
        return 0

##### Creating function to analyze normal-stops by years

In [8]:
def year_pn_analyses_trx(year,ana_turnstiles_df,mauricio_turnstiles_df):
    path = os.path.join(DTPM_TRXDir,'un_ppu_sersen_sumtrx_' + str(year) + '_by_date.csv')
    df = pd.read_csv(path, sep=";", header=None, encoding='latin-1', usecols=[0,1,2,3,4,5], parse_dates=[3])
    df.columns = ['UN','PPU','SER_SEN','DATE','SUM_TRX','COUNT']
    print('Number of observations before merge info. of turnstile is: ' + str(len(df.index)))
    
    df = df.merge(ana_turnstiles_df, left_on = 'PPU', right_on = 'sitio_subida', how='left', suffixes=('','_ana'))
    df = df.merge(mauricio_turnstiles_df, left_on = 'PPU', right_on = 'sitio_subida' , suffixes=('_ana', '_mauricio'), how='left')
    print('Number of observations after merge info. of turnstile is: ' + str(len(df.index)))
    
    torniquetes_mariposa_conditions = (df.loc[:,'fecha_instalacion_ana'].dt.date<df.loc[:,'DATE'].dt.date)
    df.loc[:,'min_fecha'] = pd.concat([df['fecha_instalacion_ana'], df['fecha_instalacion_mauricio']], axis=1).min(axis=1)
    no_torniquetes_conditions = (((df.loc[:,'fecha_instalacion_ana'].isnull()) & (df.loc[:,'fecha_instalacion_mauricio'].isnull())) | (df.loc[:,'DATE'].dt.date<=df['min_fecha'].dt.date))
    df.loc[:,'torniquete_mariposa'] = np.where(torniquetes_mariposa_conditions,1,0)
    df.loc[:,'no_torniquete'] = np.where(no_torniquetes_conditions,1,0)

    new_year_day = dt.date(year=year, month=1, day=1)
    df.loc[:,'YEAR_DAY'] = df.loc[:,'DATE'].apply(lambda x: (x.date() - new_year_day).days + 1)
    df.loc[:,'MONTH'] = df.loc[:,'DATE'].dt.month
    df.loc[:,'YEAR'] = df.loc[:,'DATE'].dt.year
    
    df.loc[:,'SATURDAY'] = df.loc[:,'DATE'].apply(saturday)
    df.loc[:,'SUNDAY'] = df.loc[:,'DATE'].apply(sunday)    

    f = {'SUM_TRX':
         {'pn_SUM_TRX':['sum']},
         'COUNT':
         {'pn_SUM_EXP':['sum']}}
    
    grouped_df = df.groupby(['YEAR','MONTH','YEAR_DAY','SATURDAY','SUNDAY','UN','DATE','torniquete_mariposa','no_torniquete']).agg(f)
    grouped_df.columns = grouped_df.columns.droplevel(1)
    grouped_df.reset_index(inplace=True,level=['YEAR','MONTH','SATURDAY','SUNDAY','DATE','torniquete_mariposa','no_torniquete'])
    days = grouped_df.groupby(['YEAR_DAY','UN']).agg({'pn_SUM_EXP': 'sum'})
    grouped_df.loc[:,'ratio'] = grouped_df['pn_SUM_EXP'].div(days['pn_SUM_EXP'],axis='index') * 100
    
    return grouped_df

##### Creating function to analyze zp-stops years

In [9]:
def year_zp_analyses_trx(year):
    path = os.path.join(DTPM_TRXDir,'trxzp_' + str(year))
    df = pd.read_csv(path, sep=";", header=None, encoding='latin-1', parse_dates=[2])
    df.columns = ['UN','RMZP','DATE','TIPODIA','MHORA','PERIODO','TRX_VALIDAS','TARJETAS_NO_VALIDAS','TRX_NO_VALIDAS']
    print('Number of observations is: ' + str(len(df.index)))
    
    new_year_day = dt.date(year=year, month=1, day=1)
    df.loc[:,'YEAR_DAY'] = df.loc[:,'DATE'].apply(lambda x: (x.date() - new_year_day).days + 1)    
    df.loc[:,'MONTH'] = df.loc[:,'DATE'].dt.month
    df.loc[:,'YEAR'] = df.loc[:,'DATE'].dt.year
    
    df.loc[:,'SATURDAY'] = df.loc[:,'DATE'].apply(saturday)
    df.loc[:,'SUNDAY'] = df.loc[:,'DATE'].apply(sunday)   
    
    f = {'TRX_VALIDAS':
         {'zp_SUM_TRX':['sum']},
        'TRX_NO_VALIDAS':
        {'zp_SUM_TRX_NO_VALIDAS':['sum']}}
    
    grouped_df = df.groupby(['YEAR','MONTH','YEAR_DAY','SATURDAY','SUNDAY','UN','DATE']).agg(f)
    grouped_df.columns = grouped_df.columns.droplevel(1)

    return grouped_df

##### Colors and others settings for plotting

In [10]:
colors = [(76, 181, 245),(183, 184, 182),(52, 103, 92),(179, 193, 0)]
for i in range(len(colors)):
    r, g, b = colors[i]  
    colors[i] = (r / 255., g / 255., b / 255.)

In [11]:
def millions(x, pos):
    'The two args are the value and tick position'
    return '%1.1fM' % (x*1e-6)

In [12]:
from matplotlib.ticker import FuncFormatter
formatter = FuncFormatter(millions)

##### Getting trxs in pn and zp in 2015, 2016 and 2017

In [13]:
tic = time.clock()

pn_grouped_2015_df = year_pn_analyses_trx(2015,ana_turnstiles_df,mauricio_turnstiles_df)
zp_grouped_2015_df = year_zp_analyses_trx(2015)

pn_grouped_2015_df.reset_index(inplace=True)
zp_grouped_2015_df.reset_index(inplace=True)

Number of observations before merge info. of turnstile is: 7787251
Number of observations after merge info. of turnstile is: 7787251


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Number of observations is: 592589


In [14]:
pn_grouped_2016_df = year_pn_analyses_trx(2016,ana_turnstiles_df,mauricio_turnstiles_df)
zp_grouped_2016_df = year_zp_analyses_trx(2016)

pn_grouped_2016_df.reset_index(inplace=True)
zp_grouped_2016_df.reset_index(inplace=True)

Number of observations before merge info. of turnstile is: 8034722
Number of observations after merge info. of turnstile is: 8034722


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Number of observations is: 591218


In [15]:
pn_grouped_2017_df = year_pn_analyses_trx(2017,ana_turnstiles_df,mauricio_turnstiles_df)
zp_grouped_2017_df = year_zp_analyses_trx(2017)

pn_grouped_2017_df.reset_index(inplace=True)
zp_grouped_2017_df.reset_index(inplace=True)

toc = time.clock()
print(toc-tic)

Number of observations before merge info. of turnstile is: 7963925
Number of observations after merge info. of turnstile is: 7963925


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Number of observations is: 755171
716.6214885312063


In [16]:
pn_frames = [pn_grouped_2015_df,pn_grouped_2016_df,pn_grouped_2017_df]
zp_frames = [zp_grouped_2015_df,zp_grouped_2016_df,zp_grouped_2017_df]

pn_summary = pd.concat(pn_frames)
zp_summary = pd.concat(zp_frames)

* Changing UN in ZP dataset

In [17]:
zp_summary['UN'] = zp_summary['UN'].apply(lambda x: x.split('-')[0].replace(' ',''))

* Finally, visualizing and counting

In [18]:
pn_summary.head()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,pn_SUM_EXP,ratio
0,1,U1,2015,1,0,0,2015-01-01,0,1,75529,2963,100.0
1,1,U2,2015,1,0,0,2015-01-01,0,0,116766,6586,99.141954
2,1,U2,2015,1,0,0,2015-01-01,0,1,1070,57,0.858046
3,1,U3,2015,1,0,0,2015-01-01,0,1,121938,7360,100.0
4,1,U4,2015,1,0,0,2015-01-01,0,1,108184,6129,100.0


In [19]:
zp_summary.head()

Unnamed: 0,YEAR,MONTH,YEAR_DAY,SATURDAY,SUNDAY,UN,DATE,zp_SUM_TRX,zp_SUM_TRX_NO_VALIDAS
0,2015,1,1,0,0,U2,2015-01-01,301,1.0
1,2015,1,1,0,0,U3,2015-01-01,2901,47.0
2,2015,1,2,0,0,U1,2015-01-02,22015,779.0
3,2015,1,2,0,0,U2,2015-01-02,55909,1670.0
4,2015,1,2,0,0,U3,2015-01-02,32803,763.0


In [20]:
len(pn_summary.index)

11497

In [21]:
len(zp_summary.index)

6389

###### Appending fare. Test based on dataset length is <font color='green'> passed </font>

In [22]:
fares_path = os.path.join(DTPMDir,'08_Tarifas/Tarifas_2007_2017.xlsx')
fares_df = pd.read_excel(fares_path) #dates are already parsed

In [23]:
fares_df.loc[:,'YEAR'] = fares_df.loc[:,'Mes'].dt.year
fares_df.loc[:,'MONTH'] = fares_df.loc[:,'Mes'].dt.month

In [24]:
fares_df.head()

Unnamed: 0,Mes,Buses,Metro Hora Punta,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior,YEAR,MONTH
0,2007-02-01,380,440,380,360,130,2007,2
1,2007-03-01,380,440,380,360,130,2007,3
2,2007-04-01,380,440,380,360,130,2007,4
3,2007-05-01,380,440,380,360,130,2007,5
4,2007-06-01,380,440,380,360,130,2007,6


In [25]:
print('Length of pn_summary before 1st-merge is: ' + str(len(pn_summary.index)))
print('Length of zp_summary before 1st-merge is: ' + str(len(zp_summary.index)))

Length of pn_summary before 1st-merge is: 11497
Length of zp_summary before 1st-merge is: 6389


In [26]:
pn_summary = pn_summary.merge(fares_df,how='left',left_on=['YEAR','MONTH'],right_on=['YEAR','MONTH'])
zp_summary = zp_summary.merge(fares_df,how='left',left_on=['YEAR','MONTH'],right_on=['YEAR','MONTH'])

In [27]:
print('Length of pn_summary after 1st-merge is: ' + str(len(pn_summary.index)))
print('Length of zp_summary after 1st-merge is: ' + str(len(zp_summary.index)))

Length of pn_summary after 1st-merge is: 11497
Length of zp_summary after 1st-merge is: 6389


In [28]:
pn_summary.head()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,pn_SUM_EXP,ratio,Mes,Buses,Metro Hora Punta,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior
0,1,U1,2015,1,0,0,2015-01-01,0,1,75529,2963,100.0,2015-01-01,640,720,660,610,210
1,1,U2,2015,1,0,0,2015-01-01,0,0,116766,6586,99.141954,2015-01-01,640,720,660,610,210
2,1,U2,2015,1,0,0,2015-01-01,0,1,1070,57,0.858046,2015-01-01,640,720,660,610,210
3,1,U3,2015,1,0,0,2015-01-01,0,1,121938,7360,100.0,2015-01-01,640,720,660,610,210
4,1,U4,2015,1,0,0,2015-01-01,0,1,108184,6129,100.0,2015-01-01,640,720,660,610,210


##### Appending kms_comerciales validamente ofertados. Test based on dataset length is <font color='green'> passed </font>

In [29]:
kms_path = os.path.join(DTPMDir,'06_LBS/1_consolidados/kms_recorridos.txt')
kms_df = pd.read_table(kms_path,sep=';', encoding='latin-1',index_col = 0) #Dates are not parsed. m_ofertados are not parsed.

In [30]:
kms_df['Fecha'] = kms_df['Fecha'].apply(lambda x: pd.to_datetime(x, format='%d/%m/%Y'))

In [31]:
kms_df.head()

Unnamed: 0,UN,Fecha,m_ofertados
0,U1,2015-01-01,7083279485
1,U1,2015-01-02,13630690709
2,U1,2015-01-03,12494136755
3,U1,2015-01-04,19859020609
4,U1,2015-01-05,13653204066


In [32]:
def year_day_calc(x):
    if(x[1].year==2015):
        new_year_day = dt.date(year=2015, month=1, day=1)
    elif(x[1].year==2016):
        new_year_day = dt.date(year=2016, month=1, day=1)
    else:
        new_year_day = dt.date(year=2017, month=1, day=1)
        
    return ((x[1].date() - new_year_day).days + 1)

In [33]:
kms_df['YEAR_DAY'] = kms_df.apply(year_day_calc, axis=1)    
kms_df['MONTH'] = kms_df['Fecha'].dt.month
kms_df['YEAR'] = kms_df['Fecha'].dt.year

In [34]:
kms_df['m_ofertados'] = kms_df['m_ofertados'].apply(lambda x: x.replace(',','.'))
kms_df['m_ofertados'] = kms_df['m_ofertados'].apply(lambda x: float(x))

In [35]:
kms_df.head()

Unnamed: 0,UN,Fecha,m_ofertados,YEAR_DAY,MONTH,YEAR
0,U1,2015-01-01,70832790.0,1,1,2015
1,U1,2015-01-02,136306900.0,2,1,2015
2,U1,2015-01-03,124941400.0,3,1,2015
3,U1,2015-01-04,198590200.0,4,1,2015
4,U1,2015-01-05,136532000.0,5,1,2015


In [36]:
grouped_kms_df = kms_df.groupby(['YEAR','MONTH','YEAR_DAY','UN'])['m_ofertados'].sum().to_frame().reset_index()

In [37]:
grouped_kms_df.head()

Unnamed: 0,YEAR,MONTH,YEAR_DAY,UN,m_ofertados
0,2015,1,1,U1,70832790.0
1,2015,1,1,U2,136634800.0
2,2015,1,1,U3,108652700.0
3,2015,1,1,U4,143630300.0
4,2015,1,1,U5,110014500.0


In [38]:
pn_summary = pn_summary.merge(grouped_kms_df,how='left',left_on=['YEAR','MONTH','YEAR_DAY','UN'],right_on=['YEAR','MONTH','YEAR_DAY','UN'])
zp_summary = zp_summary.merge(grouped_kms_df,how='left',left_on=['YEAR','MONTH','YEAR_DAY','UN'],right_on=['YEAR','MONTH','YEAR_DAY','UN'])

In [39]:
print('Length of pn_summary after 2nd-merge is: ' + str(len(pn_summary.index)))
print('Length of zp_summary after 2nd-merge is: ' + str(len(zp_summary.index)))

Length of pn_summary after 2nd-merge is: 11497
Length of zp_summary after 2nd-merge is: 6389


In [40]:
pn_summary.tail()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,pn_SUM_EXP,ratio,Mes,Buses,Metro Hora Punta,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados
11492,365,U5,2017,12,0,1,2017-12-31,1,0,51570,1763,38.560805,2017-12-01,640,740,660,610,210,106362900.0
11493,365,U6,2017,12,0,1,2017-12-31,0,1,28454,2159,29.434219,2017-12-01,640,740,660,610,210,99905070.0
11494,365,U6,2017,12,0,1,2017-12-31,1,0,62228,5176,70.565781,2017-12-01,640,740,660,610,210,99905070.0
11495,365,U7,2017,12,0,1,2017-12-31,0,1,23363,1742,43.129487,2017-12-01,640,740,660,610,210,67099300.0
11496,365,U7,2017,12,0,1,2017-12-31,1,0,43325,2297,56.870513,2017-12-01,640,740,660,610,210,67099300.0


In [41]:
zp_summary.tail()

Unnamed: 0,YEAR,MONTH,YEAR_DAY,SATURDAY,SUNDAY,UN,DATE,zp_SUM_TRX,zp_SUM_TRX_NO_VALIDAS,Mes,Buses,Metro Hora Punta,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados
6384,2017,12,364,1,0,U3,2017-12-30,14274,230.0,2017-12-01,640,740,660,610,210,182193700.0
6385,2017,12,364,1,0,U5,2017-12-30,40347,1149.0,2017-12-01,640,740,660,610,210,145542500.0
6386,2017,12,364,1,0,U7,2017-12-30,3968,43.0,2017-12-01,640,740,660,610,210,74878400.0
6387,2017,12,365,0,1,U2,2017-12-31,6220,90.0,2017-12-01,640,740,660,610,210,152215600.0
6388,2017,12,365,0,1,U3,2017-12-31,7938,129.0,2017-12-01,640,740,660,610,210,142638700.0


In [42]:
pn_summary['kms_ofertados'] = pn_summary['m_ofertados'].apply(lambda x: x/1000)
zp_summary['kms_ofertaods'] = zp_summary['m_ofertados'].apply(lambda x: x/1000)

In [43]:
pn_summary.tail()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,pn_SUM_EXP,ratio,Mes,Buses,Metro Hora Punta,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados,kms_ofertados
11492,365,U5,2017,12,0,1,2017-12-31,1,0,51570,1763,38.560805,2017-12-01,640,740,660,610,210,106362900.0,106362.85625
11493,365,U6,2017,12,0,1,2017-12-31,0,1,28454,2159,29.434219,2017-12-01,640,740,660,610,210,99905070.0,99905.06875
11494,365,U6,2017,12,0,1,2017-12-31,1,0,62228,5176,70.565781,2017-12-01,640,740,660,610,210,99905070.0,99905.06875
11495,365,U7,2017,12,0,1,2017-12-31,0,1,23363,1742,43.129487,2017-12-01,640,740,660,610,210,67099300.0,67099.3025
11496,365,U7,2017,12,0,1,2017-12-31,1,0,43325,2297,56.870513,2017-12-01,640,740,660,610,210,67099300.0,67099.3025


###### Creating dummy variables for Enero, Febrero and Julio => Estival and Invierno

In [44]:
pn_summary['Enero'] = pn_summary['MONTH'].apply(lambda x: 1 if x==1 else 0)
pn_summary['Febrero'] = pn_summary['MONTH'].apply(lambda x: 1 if x==2 else 0)
pn_summary['Julio'] = pn_summary['MONTH'].apply(lambda x: 1 if x==7 else 0)

zp_summary['Enero'] = zp_summary['MONTH'].apply(lambda x: 1 if x==1 else 0)
zp_summary['Febrero'] = zp_summary['MONTH'].apply(lambda x: 1 if x==2 else 0)
zp_summary['Julio'] = zp_summary['MONTH'].apply(lambda x: 1 if x==7 else 0)

###### Creating dummy variables for Noviembre and Diciembre 2017, since implementation of L6 was made during these months.

In [45]:
def dummy_noviembre_2017(x):
    if((x[2]==2017)&(x[3]==11)):
        return 1
    else:
        return 0

def dummy_diciembre_2017(x):
    if((x[2]==2017)&(x[3]==12)):
        return 1
    else:
        return 0

pn_summary['Nov_2017'] = pn_summary.apply(dummy_noviembre_2017, axis=1)
pn_summary['Dic_2017'] = pn_summary.apply(dummy_diciembre_2017, axis=1)

zp_summary['Nov_2017'] = zp_summary.apply(dummy_noviembre_2017, axis=1)
zp_summary['Dic_2017'] = zp_summary.apply(dummy_diciembre_2017, axis=1)

In [46]:
pn_summary.tail()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,...,Metro Hora Valle,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados,kms_ofertados,Enero,Febrero,Julio,Nov_2017,Dic_2017
11492,365,U5,2017,12,0,1,2017-12-31,1,0,51570,...,660,610,210,106362900.0,106362.85625,0,0,0,0,1
11493,365,U6,2017,12,0,1,2017-12-31,0,1,28454,...,660,610,210,99905070.0,99905.06875,0,0,0,0,1
11494,365,U6,2017,12,0,1,2017-12-31,1,0,62228,...,660,610,210,99905070.0,99905.06875,0,0,0,0,1
11495,365,U7,2017,12,0,1,2017-12-31,0,1,23363,...,660,610,210,67099300.0,67099.3025,0,0,0,0,1
11496,365,U7,2017,12,0,1,2017-12-31,1,0,43325,...,660,610,210,67099300.0,67099.3025,0,0,0,0,1


###### Creating temporal variable

In [47]:
pn_summary.loc[pn_summary['YEAR']==2015,'YEAR_DAY'].max()

365

In [48]:
pn_summary.loc[pn_summary['YEAR']==2016,'YEAR_DAY'].max()

366

In [49]:
pn_summary.loc[pn_summary['YEAR']==2017,'YEAR_DAY'].max()

365

In [50]:
def temporal_variable_pn(x):
    if(x[2]==2015):
        return x[0]
    elif(x[2]==2016):
        return 365+x[0]
    else:
        return 365+366+x[0]
    
def temporal_variable_zp(x):
    if(x[0]==2015):
        return x[2]
    elif(x[0]==2016):
        return 365+x[2]
    else:
        return 365+366+x[2]

In [51]:
pn_summary['t'] = pn_summary.apply(temporal_variable_pn, axis=1)

zp_summary['t'] = zp_summary.apply(temporal_variable_zp, axis=1)

In [52]:
pn_summary.tail()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,...,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados,kms_ofertados,Enero,Febrero,Julio,Nov_2017,Dic_2017,t
11492,365,U5,2017,12,0,1,2017-12-31,1,0,51570,...,610,210,106362900.0,106362.85625,0,0,0,0,1,1096
11493,365,U6,2017,12,0,1,2017-12-31,0,1,28454,...,610,210,99905070.0,99905.06875,0,0,0,0,1,1096
11494,365,U6,2017,12,0,1,2017-12-31,1,0,62228,...,610,210,99905070.0,99905.06875,0,0,0,0,1,1096
11495,365,U7,2017,12,0,1,2017-12-31,0,1,23363,...,610,210,67099300.0,67099.3025,0,0,0,0,1,1096
11496,365,U7,2017,12,0,1,2017-12-31,1,0,43325,...,610,210,67099300.0,67099.3025,0,0,0,0,1,1096


In [53]:
zp_summary.tail()

Unnamed: 0,YEAR,MONTH,YEAR_DAY,SATURDAY,SUNDAY,UN,DATE,zp_SUM_TRX,zp_SUM_TRX_NO_VALIDAS,Mes,...,Metro Hora Baja,Estudiantes Ed. Media/Superior,m_ofertados,kms_ofertaods,Enero,Febrero,Julio,Nov_2017,Dic_2017,t
6384,2017,12,364,1,0,U3,2017-12-30,14274,230.0,2017-12-01,...,610,210,182193700.0,182193.73044,0,0,0,0,0,1095
6385,2017,12,364,1,0,U5,2017-12-30,40347,1149.0,2017-12-01,...,610,210,145542500.0,145542.5,0,0,0,0,0,1095
6386,2017,12,364,1,0,U7,2017-12-30,3968,43.0,2017-12-01,...,610,210,74878400.0,74878.405,0,0,0,0,0,1095
6387,2017,12,365,0,1,U2,2017-12-31,6220,90.0,2017-12-01,...,610,210,152215600.0,152215.63875,0,0,0,0,0,1096
6388,2017,12,365,0,1,U3,2017-12-31,7938,129.0,2017-12-01,...,610,210,142638700.0,142638.69675,0,0,0,0,0,1096


###### Creating dummy variables per type of special days. Test based on dataset length is <font color='green'> passed </font>

In [54]:
DES_path = os.path.join(DTPMDir,'07_DES/resumen_des.xlsx')
DES_df = pd.read_excel(DES_path) #Dates are already parsed.

In [55]:
DES_df.head()

Unnamed: 0,Fecha,Descripción,REALIZA_BUCLE,Feriado_laboral,Feriado_no_laboral,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
0,2015-01-01,Año Nuevo 2015,-,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
1,2015-01-02,Interferiado año nuevo 2015,-,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
2,2015-01-03,Fin de semana largo por año nuevo,-,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
3,2015-01-04,Fin de semana largo por año nuevo,-,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
4,2015-03-29,Día del joven combatiente,-,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0


* Days with lack of information

In [56]:
DES_df.loc[DES_df['Corte_Metro'].isnull(),:]

Unnamed: 0,Fecha,Descripción,REALIZA_BUCLE,Feriado_laboral,Feriado_no_laboral,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
318,2016-03-26,Línea 1,NO,0,0,0,0,0,0,,,,0.0,0,0


In [57]:
DES_df.loc[DES_df['Bucle'].isnull(),:]

Unnamed: 0,Fecha,Descripción,REALIZA_BUCLE,Feriado_laboral,Feriado_no_laboral,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
380,2016-10-07,Corte Línea 2,-,0,0,0,0,0,0,1.0,0.0,0.0,,0,0
452,2017-06-16,Corte Línea 5,-,0,0,0,0,0,0,1.0,0.0,0.0,,0,0
455,2017-06-16,Retraso Línea 4,-,0,0,0,0,0,0,0.0,1.0,0.0,,0,0
469,2017-07-25,Retraso Línea 1,-,0,0,0,0,0,0,0.0,1.0,0.0,,0,0


In [58]:
def f(x):
    suma_dummies = x.sum(skipna=False) #Sums with NaNs return NaN. Day descriptions are omitted.
    if(suma_dummies>1):
        return 1
    else:
        return suma_dummies

In [59]:
grouped_DES_df = DES_df.groupby(['Fecha']).agg(f).reset_index()

In [60]:
grouped_DES_df.head()

Unnamed: 0,Fecha,Feriado_laboral,Feriado_no_laboral,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
0,2015-01-01,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
1,2015-01-02,1,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
2,2015-01-03,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
3,2015-01-04,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0
4,2015-01-14,0,0,0,0,0,0,1.0,0.0,0.0,1.0,0,0


In [61]:
def year_day_calc_2(x):
    if(x[0].year==2015):
        new_year_day = dt.date(year=2015, month=1, day=1)
    elif(x[0].year==2016):
        new_year_day = dt.date(year=2016, month=1, day=1)
    else:
        new_year_day = dt.date(year=2017, month=1, day=1)
        
    return ((x[0].date() - new_year_day).days + 1)

In [62]:
grouped_DES_df['YEAR_DAY'] = grouped_DES_df.apply(year_day_calc_2, axis=1)    
grouped_DES_df['MONTH'] = grouped_DES_df['Fecha'].dt.month
grouped_DES_df['YEAR'] = grouped_DES_df['Fecha'].dt.year

In [63]:
grouped_DES_df.tail()

Unnamed: 0,Fecha,Feriado_laboral,Feriado_no_laboral,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales,YEAR_DAY,MONTH,YEAR
381,2017-12-27,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,361,12,2017
382,2017-12-28,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,362,12,2017
383,2017-12-29,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,1,363,12,2017
384,2017-12-30,0,0,0,0,1,0,1.0,0.0,0.0,1.0,0,0,364,12,2017
385,2017-12-31,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,365,12,2017


In [64]:
pn_summary = pn_summary.merge(grouped_DES_df,how='left',left_on=['YEAR','MONTH','YEAR_DAY'],right_on=['YEAR','MONTH','YEAR_DAY'])
zp_summary = zp_summary.merge(grouped_DES_df,how='left',left_on=['YEAR','MONTH','YEAR_DAY'],right_on=['YEAR','MONTH','YEAR_DAY'])

In [65]:
print('Length of pn_summary after 3rd-merge is: ' + str(len(pn_summary.index)))
print('Length of zp_summary after 3rd-merge is: ' + str(len(zp_summary.index)))

Length of pn_summary after 3rd-merge is: 11497
Length of zp_summary after 3rd-merge is: 6389


In [66]:
pn_summary.head()

Unnamed: 0,YEAR_DAY,UN,YEAR,MONTH,SATURDAY,SUNDAY,DATE,torniquete_mariposa,no_torniquete,pn_SUM_TRX,...,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
0,1,U1,2015,1,0,0,2015-01-01,0,1,75529,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,U2,2015,1,0,0,2015-01-01,0,0,116766,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,U2,2015,1,0,0,2015-01-01,0,1,1070,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,U3,2015,1,0,0,2015-01-01,0,1,121938,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,U4,2015,1,0,0,2015-01-01,0,1,108184,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
zp_summary.head()

Unnamed: 0,YEAR,MONTH,YEAR_DAY,SATURDAY,SUNDAY,UN,DATE,zp_SUM_TRX,zp_SUM_TRX_NO_VALIDAS,Mes,...,Censo_Elecciones,Partido,FDS_Largo,Disturbios,Corte_Metro,Retraso_Metro,Incidente_Metro,Bucle,Clima,visperas_laborales
0,2015,1,1,0,0,U2,2015-01-01,301,1.0,2015-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015,1,1,0,0,U3,2015-01-01,2901,47.0,2015-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015,1,2,0,0,U1,2015-01-02,22015,779.0,2015-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015,1,2,0,0,U2,2015-01-02,55909,1670.0,2015-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015,1,2,0,0,U3,2015-01-02,32803,763.0,2015-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
pn_summary = pn_summary.fillna(0) #Assuming 0 values in NaNs obtained via f(x)
zp_summary = zp_summary.fillna(0) #Assuming 0 values in NaNs obtained via f(x)

###### Printing summaries to a file

In [69]:
def printingSummaryToFile(UN):
    UN_pn_summary = pn_summary.loc[pn_summary['UN']==UN,:]
    UN_zp_summary = zp_summary.loc[zp_summary['UN']==UN,:]
    
    pn_output_path = os.path.join(DTPM_TRXDir, '2_DAILY_UN_SUMMARY/' + UN + '/' + UN + '_daily_pn_summary.csv')
    zp_output_path = os.path.join(DTPM_TRXDir, '2_DAILY_UN_SUMMARY/' + UN + '/' + UN + '_daily_zp_summary.csv')
    
    UN_pn_summary.to_csv(pn_output_path, sep=';', encoding = 'latin-1')
    UN_zp_summary.to_csv(zp_output_path, sep=';', encoding = 'latin-1')

In [70]:
#printingSummaryToFile('U1')

In [71]:
#printingSummaryToFile('U2')

In [72]:
#printingSummaryToFile('U3')

In [73]:
#printingSummaryToFile('U4')

In [74]:
#printingSummaryToFile('U5')

In [75]:
#printingSummaryToFile('U6')

In [76]:
#printingSummaryToFile('U7')

# Closed