### Analyzing all-dates descriptives

In [None]:
import os
import sys
# Make the parent directory importable so the project modules used below can be found.
module_path = os.path.abspath('..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [None]:
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime as dt
import time

from importlib import reload
from Utils import TransantiagoConstants
from RunSilentlyDailyEtapasBuilder import RunSilentlyDailyEtapasBuilderClass

In [None]:
from TemporalDescriptivesBuilder import TemporalDescriptivesBuilderClass

In [None]:
# Dates to process. Each entry is used further down as a 'YYYY-MM-DD' string
# (file-name prefix and `strptime` input).
dates = TransantiagoConstants.updateCurrentSSHDates()
# Directory where the per-date summary CSV files are written.
summary_dir = TransantiagoConstants.SummaryDir

* Only for refactoring/debugging: uncomment a line below to restrict the run to a single date

In [None]:
#date = dates[0]
#dates = ['2017-03-01']

*****

In [None]:

# Build the temporal descriptives for every date and write one
# '<date>_sumtrx.csv' file per date into `summary_dir`.
# time.clock() was removed in Python 3.8; time.perf_counter() is the replacement.
tic = time.perf_counter()
for date in dates:
    temporal_descriptives_builder = TemporalDescriptivesBuilderClass(date)
    # The builder steps are run in this fixed order before reading `grouped_data`.
    temporal_descriptives_builder.cleanAndProcessEtapas()
    temporal_descriptives_builder.appendPeriods()
    temporal_descriptives_builder.groupData()
    temporal_descriptives_builder.appendUnidadNegocio()

    grouped_data = temporal_descriptives_builder.grouped_data
    # Tag every row with the date it belongs to so the daily files can be concatenated later.
    grouped_data['fecha'] = date

    name_file = date + '_sumtrx.csv'
    path = os.path.join(summary_dir, name_file)
    grouped_data.to_csv(path, sep=';')

toc = time.perf_counter()
# Elapsed time of the whole loop, in seconds.
print(toc - tic)


In [None]:
# Upper quantile of `diferencia_tiempo` kept in each group; observations above it are dropped.
quantile = 0.97
processed = 0

# Per-date accumulators (one entry appended per date). They are created here so the cell
# runs on a fresh kernel and can be re-run without appending duplicate entries.
total_raw_trx, total_trx, total_turns_trx, total_no_turns_trx = [], [], [], []
turnstile_obs, turnstile_means, turnstile_stds = [], [], []
turnstile_medians, turnstile_maxs = [], []
no_turnstile_obs, no_turnstile_means, no_turnstile_stds = [], [], []
no_turnstile_medians, no_turnstile_maxs = [], []

# time.clock() was removed in Python 3.8; time.perf_counter() is the replacement.
tic = time.perf_counter()

for date in dates:
    etapas_builder = RunSilentlyDailyEtapasBuilderClass(date)
    [etapas_df, processed_sorted_df, filtered_df, filtered_turnstile_df, filtered_no_turnstile_df] = etapas_builder.runCompleteProcess()

    #Pre-processing processed_sorted_df. Be aware that etapas_df only contains obs. of type 'BUS'
    processed_sorted_df.loc[:,'servicio_subida'] = processed_sorted_df.loc[:,'servicio_subida'].str.replace('T','')
    processed_sorted_df.loc[:,'servicio_subida'] = processed_sorted_df.loc[:,'servicio_subida'].str.replace('00','')
    # The cleaned service string is split on spaces: first token -> TS_CODE, second -> DIRECTION.
    service_parts = processed_sorted_df.loc[:,'servicio_subida'].str.split(' ')
    processed_sorted_df.loc[:,'TS_CODE'] = service_parts.str[0]
    processed_sorted_df.loc[:,'DIRECTION'] = service_parts.str[1]
    processed_sorted_df.loc[:,'DIRECTION'] = processed_sorted_df.loc[:,'DIRECTION'].str.replace('R','Ret')
    processed_sorted_df.loc[:,'DIRECTION'] = processed_sorted_df.loc[:,'DIRECTION'].str.replace('I','Ida')
    # NOTE(review): `codes` is not defined anywhere in the visible part of this notebook;
    # it must be loaded before this cell runs -- confirm where it comes from.
    merged_etapas_df = pd.merge(processed_sorted_df, codes, on=['TS_CODE','DIRECTION'], how='left')

    #Filling trx info:
    total_raw_trx.append(len(etapas_df.index))
    total_trx.append(len(processed_sorted_df.index))
    # NOTE(review): the two filters below read different columns (`si_2017_torniquete`
    # vs `si_torniquete`); left unchanged -- confirm that this asymmetry is intended.
    # `.count().iloc[0]` is the non-null count of the first column of the filtered rows;
    # `.iloc` replaces the deprecated positional `[0]` on a label-indexed Series.
    total_turns_trx.append(processed_sorted_df[processed_sorted_df.si_2017_torniquete==True].count().iloc[0])
    total_no_turns_trx.append(processed_sorted_df[processed_sorted_df.si_torniquete==False].count().iloc[0])

    #Filtering and following the original methodology:
    turnstile_q = filtered_turnstile_df.diferencia_tiempo.quantile(quantile)
    no_turnstile_q = filtered_no_turnstile_df.diferencia_tiempo.quantile(quantile)

    filtered_turnstile_q_df = filtered_turnstile_df[filtered_turnstile_df.diferencia_tiempo<=turnstile_q]
    filtered_no_turnstile_q_df = filtered_no_turnstile_df[filtered_no_turnstile_df.diferencia_tiempo<=no_turnstile_q]

    # Descriptives of `diferencia_tiempo_secs` for the turnstile group.
    turnstile_obs.append(len(filtered_turnstile_q_df.index))
    turnstile_means.append(filtered_turnstile_q_df['diferencia_tiempo_secs'].mean())
    turnstile_stds.append(filtered_turnstile_q_df['diferencia_tiempo_secs'].std())
    turnstile_medians.append(filtered_turnstile_q_df['diferencia_tiempo_secs'].quantile(0.50))
    turnstile_maxs.append(filtered_turnstile_q_df['diferencia_tiempo_secs'].max())

    # Same descriptives for the no-turnstile group.
    no_turnstile_obs.append(len(filtered_no_turnstile_q_df.index))
    no_turnstile_means.append(filtered_no_turnstile_q_df['diferencia_tiempo_secs'].mean())
    no_turnstile_stds.append(filtered_no_turnstile_q_df['diferencia_tiempo_secs'].std())
    no_turnstile_medians.append(filtered_no_turnstile_q_df['diferencia_tiempo_secs'].quantile(0.50))
    no_turnstile_maxs.append(filtered_no_turnstile_q_df['diferencia_tiempo_secs'].max())

    processed = processed + 1
    print('Number of already processed ddbb is ' + str(processed))
    # Drop the builder reference before the next iteration creates a new one.
    del etapas_builder

toc = time.perf_counter()
# Elapsed seconds, shown as the cell output.
toc - tic

* Summarizing

In [None]:
# Convert the 'YYYY-MM-DD' strings to datetime.date objects. Entries that are already
# date objects are kept as they are, so re-running this cell does not raise a TypeError.
dates = [x if isinstance(x, dt.date) else dt.datetime.strptime(x, "%Y-%m-%d").date() for x in dates]

In [None]:
# Assemble the per-date accumulators into a single table: one row per date,
# one column per accumulator list.
summary_descriptives = pd.DataFrame(dict(
    date=dates,
    total_raw_trx=total_raw_trx,
    total_trx=total_trx,
    total_turns_trx=total_turns_trx,
    total_no_turns_trx=total_no_turns_trx,
    turnstile_obs=turnstile_obs,
    turnstile_means=turnstile_means,
    turnstile_stds=turnstile_stds,
    turnstile_medians=turnstile_medians,
    turnstile_maxs=turnstile_maxs,
    no_turnstile_obs=no_turnstile_obs,
    no_turnstile_means=no_turnstile_means,
    no_turnstile_stds=no_turnstile_stds,
    no_turnstile_medians=no_turnstile_medians,
    no_turnstile_maxs=no_turnstile_maxs,
))

In [None]:
# Explicit column order: the date, the transaction totals, then the five descriptive
# statistics for the turnstile group followed by the same five for the no-turnstile group.
cols = (
    ['date', 'total_raw_trx', 'total_trx', 'total_turns_trx', 'total_no_turns_trx']
    + [group + '_' + stat
       for group in ('turnstile', 'no_turnstile')
       for stat in ('obs', 'means', 'stds', 'medians', 'maxs')]
)
summary_descriptives = summary_descriptives.loc[:, cols]

* Checking

In [None]:
# Display the summary table (one row per processed date) for a visual check.
summary_descriptives

* Writing the summary table to a CSV file

In [None]:
#summary_descriptives.to_csv('C:/Users/Tesista/Desktop/Evasion/01_analisis/03_datos/05_SUMMARY/DescriptivesSummary.csv', sep=';', encoding = 'latin-1')

In [None]:
import TemporalDescriptivesBuilder

In [None]:
from TemporalDescriptivesBuilder import TemporalDescriptivesBuilderClass

In [None]:
test_temporal_builder = 