# Matching users: a client notebook for the EvasionBuilder and MatchingUsers modules

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from importlib import reload
import datetime as dt
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import EvasionBuilder
from Utils import TransantiagoConstants

* Building complete evasion database via <strong>EvasionBuilder</strong>

In [2]:
# Evasion pipeline: every stage is delegated to EvasionBuilder and each
# intermediate result keeps its own name so it can be inspected afterwards.

# Stage 1: load the three single evasion datasets.
first_q, second_q, third_q = EvasionBuilder.loadSinglesEvasion()

# Stage 2: process the three datasets and concatenate them.
complete_evasion = EvasionBuilder.processSinglesEvasiondAndConcat(
    first_q,
    second_q,
    third_q,
)

# Stage 3: merge the Transantiago codes into the concatenated data.
complete_evasion_w_codes = EvasionBuilder.mergeTransantiagoCodes(complete_evasion)

# Stage 4: process the complete frame.
processed_evasion = EvasionBuilder.processCompleteEvasionDataFrame(complete_evasion_w_codes)

# Stage 5: handle duplicated rows in the complete evasion data.
clean_processed_evasion = EvasionBuilder.deleteDuplicatedInCompleteEvasion(processed_evasion)

The only non-matched user_code services are: 
D06
Number of duplicated rows in complete evasion database is: 6362
Number of collapsed-duplicated rows in complete evasion database is: 3175
Number of rows in complete evasion database without duplicated rows at all is: 93060
Final number of rows in complete evasion database with collapsed duplicated rows is: 96235


* Filtering the complete evasion database down to the subsets needed for the analysis

In [3]:
# Rows of the clean evasion data whose TP column equals 'P'.
is_paradero = clean_processed_evasion['TP'] == 'P'
evasion_paradero = clean_processed_evasion[is_paradero]

# Of those, rows whose N_PUERTA column equals 1.
is_first_door = evasion_paradero['N_PUERTA'] == 1
evasion_paradero_first = evasion_paradero[is_first_door]

In [4]:
# Report the row count of each successive evasion subset, one line per frame.
row_count_reports = [
    ('Number of rows in complete evasion database is: ', clean_processed_evasion),
    ('Number of rows in evasion in paradero database is: ', evasion_paradero),
    ('Number of rows in evasion in paradero by first door database is: ', evasion_paradero_first),
]
for message, frame in row_count_reports:
    print(message + str(len(frame.index)))

Number of rows in complete evasion database is: 96235
Number of rows in evasion in paradero database is: 94531
Number of rows in evasion in paradero by first door database is: 33755


* Filtering out rows whose date is not among the common dates. The DataFrame "common_dates_evasion" will be the final evasion DataFrame

In [5]:
# Dates to keep, converted to Timestamps so they can be compared against FECHA.
common_dates = TransantiagoConstants.common_dates
common_dates_timestamp = [pd.to_datetime(x) for x in common_dates]

# Keep only rows whose FECHA is one of the common dates. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not raise SettingWithCopyWarning.
in_common_dates = evasion_paradero_first['FECHA'].isin(common_dates_timestamp)
common_dates_evasion = evasion_paradero_first[in_common_dates].copy()

print('Number of rows in complete evasion database filtered by common_dates is: ' + str(len(common_dates_evasion.index))) #Remember to store the number.

# Rebuild TIEMPO as a full datetime: prefix the existing TIEMPO string with the
# row's FECHA (formatted as YYYY-MM-DD) and parse the result. Plain column
# assignment replaces the column, so it takes the parsed datetime dtype.
tiempo_with_date = common_dates_evasion['FECHA'].dt.strftime('%Y-%m-%d') + ' ' + common_dates_evasion['TIEMPO']
common_dates_evasion['TIEMPO'] = pd.to_datetime(tiempo_with_date)
common_dates_evasion = common_dates_evasion.reset_index(drop=True)

Number of rows in complete evasion database filtered by common_dates is: 14756


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


* Building Etapas DataBase via <strong>RunSilentlyDailyEtapasBuilder</strong> and matching users via <strong>MatchingUsers</strong>

In [7]:
# Restrict the run below to the first common date only.
common_dates = common_dates[0:1]

In [8]:
from RunSilentlyDailyEtapasBuilder import RunSilentlyDailyEtapasBuilderClass

In [9]:
import MatchingUsers

In [10]:
# One (date, grouped frame) pair per processed date. Without this, each
# iteration rebinds `grouped_clean_sorted_df` and every result except the last
# date's is lost as soon as more than one date is processed.
grouped_results_by_date = []

for date in common_dates:
    # Build the processed etapas frame for this date.
    etapas_builder = RunSilentlyDailyEtapasBuilderClass(date)
    processed_sorted_df = etapas_builder.runProcessedProcess()

    # NOTE: the filter below is currently disabled; enabling it would keep only
    # rows with diferencia_tiempo_secs <= 20. BE AWARE OF THIS LINE...
    # processed_sorted_df = processed_sorted_df[processed_sorted_df['diferencia_tiempo_secs']<=20]

    # Normalise sitio_subida by dropping dashes and spaces.
    processed_sorted_df['sitio_subida'] = (
        processed_sorted_df['sitio_subida']
        .str.replace("-", "")
        .str.replace(" ", "")
    )
    # Normalise servicio_subida: drop every 'T', then keep the text before the
    # first space.
    processed_sorted_df['servicio_subida'] = (
        processed_sorted_df['servicio_subida']
        .str.replace('T', '')
        .str.split(' ')
        .str[0]
    )

    # Evasion rows for this date, plus the values used to filter etapas below.
    [evasion_by_date, patentes_by_date, servicios_by_date] = MatchingUsers.simplifyingEvasionAndEtapas(common_dates_evasion, date)

    # Keep only etapas rows whose sitio_subida and servicio_subida both appear
    # in the evasion data for this date.
    in_evasion_patentes = processed_sorted_df['sitio_subida'].isin(patentes_by_date)
    in_evasion_servicios = processed_sorted_df['servicio_subida'].isin(servicios_by_date)
    clean_sorted_df = processed_sorted_df[in_evasion_patentes & in_evasion_servicios]
    clean_sorted_df = clean_sorted_df.reset_index(drop=True)

    # MatchingUsers pipeline; the last step takes this date's evasion rows.
    clean_sorted_df = MatchingUsers.appendingIdExpedicion(clean_sorted_df)
    grouped_clean_sorted_df = MatchingUsers.groupByEtapasDatabase(clean_sorted_df)
    grouped_clean_sorted_df = MatchingUsers.appendingStartEndCuts(grouped_clean_sorted_df)
    grouped_clean_sorted_df = MatchingUsers.slicingEvasionDatabase(grouped_clean_sorted_df, evasion_by_date)

    # `grouped_clean_sorted_df` stays bound to the most recent date's result
    # (as before); every date's result is also retained here.
    grouped_results_by_date.append((date, grouped_clean_sorted_df))

Not found in turnstile database: 1795630


In [11]:
# Preview the first five rows of the grouped result left by the loop above.
grouped_clean_sorted_df.head(5)

Unnamed: 0,sitio_subida,servicio_subida,idExpedicion,par_subida,t_subidamin,t_subidamax,t_subidacount,diferencia_tiempo_secsmean,start_cut,end_cut,count_ev_obs,ingresan,no_validan
0,BKWK90,F05,,L-34-54-115-SN,2017-03-09 07:04:25,2017-03-09 07:04:25,1,,2017-03-09 07:03:55,2017-03-09 07:04:46.500000,0.0,0.0,0.0
1,BKWK90,F05,,L-34-54-100-SN,2017-03-09 07:05:08,2017-03-09 07:05:08,1,,2017-03-09 07:04:46.500000,2017-03-09 07:06:45.500000,0.0,0.0,0.0
2,BKWK90,F05,,L-34-68-40-SN,2017-03-09 07:08:23,2017-03-09 09:49:32,28,2.461538,2017-03-09 07:06:45.500000,2017-03-09 08:29:41.500000,0.0,0.0,0.0
3,BKWK90,F05,,L-34-68-30-SN,2017-03-09 07:09:51,2017-03-09 09:50:31,15,1.923077,2017-03-09 08:29:41.500000,2017-03-09 08:30:35.500000,0.0,0.0,0.0
4,BKWK90,F05,,L-34-68-20-SN,2017-03-09 07:10:40,2017-03-09 09:51:06,9,2.428571,2017-03-09 08:30:35.500000,2017-03-09 08:31:18,0.0,0.0,0.0
