In [1]:
import pandas as pd
import numpy as np

# Modelagem
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import ElasticNet 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Carregamento dos Dados

In [75]:
path = "../../data/processed/"
v = pd.read_csv(path + "V_setembro.csv")
fgr = pd.read_csv(path + "FGR.csv", decimal=",")
paradas = pd.read_csv(path + "paradas.csv")
rota_paradas = pd.read_csv(path + "rota_paradas.csv")
rotas = pd.read_csv(path + "rotas.csv")
dados_bairros = pd.read_csv(path + "dados_bairros.csv")

In [None]:
v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15560968 entries, 0 to 15560967
Data columns (total 31 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   Unnamed: 0                      int64  
 1   movimentos_data_arq             object 
 2   movimento_diario_data_mov       object 
 3   categoria_tipo                  int64  
 4   empresa_codigo                  int64  
 5   empresa_modalidade              int64  
 6   veiculo_numero                  int64  
 7   veiculo_validador               object 
 8   linha_numero                    int64  
 9   linha_jornada                   int64  
 10  linha_num_operador              int64  
 11  linha_tabela                    int64  
 12  linha_hora_abertura             object 
 13  linha_hora_fechamento           object 
 14  viagem_data_hora_abertura       object 
 15  viagem_data_hora_fechamento     object 
 16  viagem_catraca_inicio           int64  
 17  viagem_catraca_final     

# Feature Enginerring

Construção de variáveis
- Demanda
- Tamanho da linha
- Horario
- IDH dos bairros
- É domingo?

In [77]:
demanda_p_linha_dia = v.groupby(["linha_numero", "movimento_diario_data_mov"]).count()["passageiro_matricula"].to_frame().reset_index()
demanda_p_linha_dia.rename(columns={"passageiro_matricula": "demanda_do_dia"}, inplace = True)

In [78]:
demanda_p_linha_dia

Unnamed: 0,linha_numero,movimento_diario_data_mov,demanda_do_dia
0,1,2021-08-29,232
1,1,2021-08-31,9096
2,1,2021-09-01,8888
3,1,2021-09-02,8244
4,1,2021-09-03,9020
...,...,...,...
8838,991,2021-09-02,55
8839,999,2021-09-05,350
8840,999,2021-09-12,305
8841,999,2021-09-19,365


In [79]:
# Demanda por intervalo de tempo
v["passageiro_data_hora"] = pd.to_datetime(v["passageiro_data_hora"])
v["passageiro_hora_entrada"] = v["passageiro_data_hora"].dt.hour
# v["passageiro_hora_range"] = pd.cut(v["passageiro_hora_entrada"], bins=[-1, 4, 6, 9, 12, 15, 18, 21, float('Inf')], labels=['00-3h59', '4-5h59', '6-8h59', '9-11h59', '12-14h59', '15-17h59', '18-20h59', '21-23h59'])
v_hora_range = v.groupby(["linha_numero", "movimento_diario_data_mov", "passageiro_hora_entrada"]).count()["passageiro_matricula"].to_frame().reset_index()
v_hora_range.dropna(inplace=True)

In [80]:
v_hora_range

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_entrada,passageiro_matricula
0,1,2021-08-29,6,4
1,1,2021-08-29,7,40
2,1,2021-08-29,8,31
3,1,2021-08-29,9,3
4,1,2021-08-29,10,24
...,...,...,...,...
158001,999,2021-09-26,14,55
158002,999,2021-09-26,15,55
158003,999,2021-09-26,16,37
158004,999,2021-09-26,17,136


In [81]:
# Dias da semana
days_week ={
    0: 'Segunda', 
    1: 'Terça', 
    2: 'Quarta', 
    3: 'Quinta', 
    4: 'Sexta',
    5: 'Sábado', 
    6: 'Domingo'
}

v_hora_range["movimento_diario_data_mov"] = pd.to_datetime(v_hora_range["movimento_diario_data_mov"])

def eh_domingo(dia_semana):
    if dia_semana == "Domingo":
        return 1
    else:
        return 0

v_hora_range["Dia_Semana"] = v_hora_range.movimento_diario_data_mov.dt.weekday
v_hora_range["Dia_Semana"] = v_hora_range.Dia_Semana.map(days_week)
v_hora_range["Domingo"] = v_hora_range["Dia_Semana"].map(eh_domingo)

In [82]:
v_hora_range

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_entrada,passageiro_matricula,Dia_Semana,Domingo
0,1,2021-08-29,6,4,Domingo,1
1,1,2021-08-29,7,40,Domingo,1
2,1,2021-08-29,8,31,Domingo,1
3,1,2021-08-29,9,3,Domingo,1
4,1,2021-08-29,10,24,Domingo,1
...,...,...,...,...,...,...
158001,999,2021-09-26,14,55,Domingo,1
158002,999,2021-09-26,15,55,Domingo,1
158003,999,2021-09-26,16,37,Domingo,1
158004,999,2021-09-26,17,136,Domingo,1


 Variável de km da rota

In [83]:

linha_p_km_programado = fgr[["fechamento_data", "linha_numero", "linha_km_programado"]].drop_duplicates()
linha_p_km_programado = linha_p_km_programado[linha_p_km_programado.linha_km_programado != 0]
linha_demanda_km = demanda_p_linha_dia.merge(linha_p_km_programado, how="inner", left_on=["linha_numero"], right_on=["linha_numero"])
linha_demanda_km.drop(columns=["fechamento_data"], inplace=True)
linha_demanda_km = linha_demanda_km.groupby(["linha_numero"]).mean()[["linha_km_programado"]].reset_index()
dataset_model = v_hora_range.merge(linha_demanda_km, how="inner", on="linha_numero")

In [84]:
dataset_model

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_entrada,passageiro_matricula,Dia_Semana,Domingo,linha_km_programado
0,4,2021-08-31,7,81,Terça,0,1165.12
1,4,2021-08-31,8,105,Terça,0,1165.12
2,4,2021-08-31,9,133,Terça,0,1165.12
3,4,2021-08-31,10,134,Terça,0,1165.12
4,4,2021-08-31,11,138,Terça,0,1165.12
...,...,...,...,...,...,...,...
151568,991,2021-08-31,17,52,Terça,0,18.29
151569,991,2021-09-01,16,5,Quarta,0,18.29
151570,991,2021-09-01,17,50,Quarta,0,18.29
151571,991,2021-09-02,16,36,Quinta,0,18.29


Maior IDH de bairro em que a rota passa

In [85]:
rota_paradas_merged = rota_paradas.merge(paradas, how="inner", on="stop_id")
rota_paradas_merged = rota_paradas_merged.merge(rotas, how="inner", on="route_id")
rotas_paradas_clean = rota_paradas_merged[["route_id", "stop_sequence", "district", "city"]]
rotas_paradas_demanda = rotas_paradas_clean.merge(demanda_p_linha_dia, how="inner", left_on="route_id", right_on="linha_numero")
rotas_paradas_demanda.drop(columns=["route_id", "stop_sequence"], inplace=True)
rotas_paradas_demanda = rotas_paradas_demanda[rotas_paradas_demanda["city"] == "Fortaleza"]

In [86]:
!pip install unidecode



In [87]:
# Retira acentos e barras (/ + termo) dos nomes dos bairros
from unidecode import unidecode

district = []
for index, row in rotas_paradas_demanda.iterrows():
  district_without_punt = unidecode(row["district"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
rotas_paradas_demanda["Bairro"] = district

district = []
for index, row in dados_bairros.iterrows():
  district_without_punt = unidecode(row["Bairros"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
dados_bairros["Bairro"] = district

In [88]:
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.lower()
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.replace("ç", "c")
dados_bairros["Bairro"] = dados_bairros["Bairro"].str.lower()
rotas_demanda_bairro = rotas_paradas_demanda.merge(dados_bairros, how="inner", left_on="Bairro", right_on="Bairro")
rotas_idh_max = rotas_demanda_bairro.groupby(["linha_numero"]).mean()[["IDH em 2010[8]"]].reset_index()
dataset_model = dataset_model.merge(rotas_idh_max, how="inner", on="linha_numero")

In [89]:
dataset_model

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_entrada,passageiro_matricula,Dia_Semana,Domingo,linha_km_programado,IDH em 2010[8]
0,4,2021-08-31,7,81,Terça,0,1165.120000,0.465660
1,4,2021-08-31,8,105,Terça,0,1165.120000,0.465660
2,4,2021-08-31,9,133,Terça,0,1165.120000,0.465660
3,4,2021-08-31,10,134,Terça,0,1165.120000,0.465660
4,4,2021-08-31,11,138,Terça,0,1165.120000,0.465660
...,...,...,...,...,...,...,...,...
135570,920,2021-09-29,19,51,Quarta,0,1179.797143,0.291214
135571,920,2021-09-29,20,29,Quarta,0,1179.797143,0.291214
135572,920,2021-09-29,21,15,Quarta,0,1179.797143,0.291214
135573,920,2021-09-29,22,16,Quarta,0,1179.797143,0.291214


Convertendo Hora Range e Linha Numero para Variavel Numérica

In [90]:
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["passageiro_hora_entrada"])], axis = 1)
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["linha_numero"])], axis = 1)

Limpeza de colunas

In [91]:
dataset_model.drop(columns=["Dia_Semana", "movimento_diario_data_mov", "linha_numero", "passageiro_hora_entrada"], inplace=True)

Renomeando colunas para facilitar tratativa

In [92]:
dataset_model.rename(columns={"passageiro_hora_range": "hora_range", 
                              "passageiro_matricula": "demanda",
                              "linha_km_programado": "linha_km",
                              "IDH em 2010[8]": "idh_max"}, inplace=True)

Removendo valores nulos

In [93]:
dataset_model['demanda'] = dataset_model['demanda'].replace(0, np.nan)
dataset_model.dropna(inplace=True)

In [94]:
dataset_model.describe()

Unnamed: 0,demanda,Domingo,linha_km,idh_max,0,1,2,3,4,5,...,833,835,836,841,855,901,906,907,913,920
count,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,...,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0,135575.0
mean,92.350183,0.116275,762.906244,0.366334,0.013269,0.001903,0.001851,0.002508,0.032189,0.050939,...,0.002065,0.002183,0.003762,0.004278,0.004529,0.0043,0.004426,0.004212,0.004374,0.004573
std,124.703214,0.320556,632.339181,0.126221,0.114427,0.043582,0.042988,0.050016,0.176502,0.219873,...,0.045399,0.046675,0.061218,0.065267,0.067144,0.065435,0.066378,0.064761,0.065991,0.06747
min,1.0,0.0,63.195714,0.068821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,433.388571,0.277941,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.0,0.0,573.446897,0.344108,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,111.0,0.0,844.2,0.451804,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1242.0,1.0,5484.445714,0.7178,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [95]:
dataset_model.corr()

Unnamed: 0,demanda,Domingo,linha_km,idh_max,0,1,2,3,4,5,...,833,835,836,841,855,901,906,907,913,920
demanda,1.000000,-0.163853,0.501611,0.204528,-0.078494,-0.026327,-0.029477,-0.034272,-0.093188,0.036182,...,-0.008355,-0.031533,-0.023468,-0.013833,0.104891,-0.025419,0.048804,-0.019578,-0.013252,0.015358
Domingo,-0.163853,1.000000,-0.007367,-0.023136,-0.002650,0.001584,0.000436,-0.002546,-0.009442,-0.004709,...,-0.016501,-0.016967,-0.001240,0.002313,0.003978,0.001129,0.003548,0.003413,0.002807,0.002697
linha_km,0.501611,-0.007367,1.000000,0.190563,0.003852,-0.040831,-0.043581,-0.035030,0.007905,0.000879,...,-0.026800,-0.040445,-0.051848,-0.036764,0.070064,-0.030236,0.145697,-0.031694,-0.018395,0.044686
idh_max,0.204528,-0.023136,0.190563,1.000000,0.007472,0.005047,0.004824,0.002328,-0.039592,-0.003653,...,0.081343,0.095861,-0.007952,0.084993,0.068100,0.182993,0.044342,0.140437,0.013599,-0.040339
0,-0.078494,-0.002650,0.003852,0.007472,1.000000,-0.005064,-0.004994,-0.005815,-0.021149,-0.026866,...,-0.005276,-0.005424,-0.007126,0.005238,0.010419,0.012081,-0.007732,-0.006546,-0.005733,0.019846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,-0.025419,0.001129,-0.030236,0.182993,0.012081,-0.002870,-0.002830,-0.003295,-0.011985,0.000155,...,-0.002990,-0.003074,-0.004038,-0.004308,-0.004433,1.000000,-0.004382,-0.004274,-0.004356,-0.004454
906,0.048804,0.003548,0.145697,0.044342,-0.007732,-0.002911,-0.002871,-0.003343,0.006728,-0.000285,...,-0.003033,-0.003119,-0.004097,-0.004370,-0.004497,-0.004382,1.000000,-0.004336,-0.004419,-0.004519
907,-0.019578,0.003413,-0.031694,0.140437,-0.006546,-0.002840,-0.002801,-0.003261,-0.011860,0.000473,...,-0.002959,-0.003042,-0.003996,-0.004263,-0.004387,-0.004274,-0.004336,1.000000,-0.004311,-0.004408
913,-0.013252,0.002807,-0.018395,0.013599,-0.005733,-0.002894,-0.002855,-0.003323,0.001211,-0.000105,...,-0.003015,-0.003100,-0.004073,-0.004345,-0.004471,-0.004356,-0.004419,-0.004311,1.000000,-0.004493


# Pre Processamento

In [96]:
X = dataset_model.drop(columns=["demanda"]).values
Y = dataset_model["demanda"].values

Normalização dos Dados

In [97]:
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import Normalizer

standard = StandardScaler()

X = standard.fit_transform(X)

In [98]:
X

array([[-0.36273116,  0.63607515,  0.78692566, ..., -0.06503464,
        -0.06628106, -0.06777997],
       [-0.36273116,  0.63607515,  0.78692566, ..., -0.06503464,
        -0.06628106, -0.06777997],
       [-0.36273116,  0.63607515,  0.78692566, ..., -0.06503464,
        -0.06628106, -0.06777997],
       ...,
       [-0.36273116,  0.65928611, -0.59514483, ..., -0.06503464,
        -0.06628106, 14.75362175],
       [-0.36273116,  0.65928611, -0.59514483, ..., -0.06503464,
        -0.06628106, 14.75362175],
       [-0.36273116,  0.65928611, -0.59514483, ..., -0.06503464,
        -0.06628106, 14.75362175]])

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25, random_state=123)

# Modelagem

Linear Regression 

In [100]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
results = linear_regression.predict(X_test)

In [101]:
results

array([ 20.58362446,  -9.79137554, 176.39612446, ...,  41.45862446,
       153.20862446,  40.08362446])

In [102]:
mean_absolute_error(y_test, results)

46.17063399064713

In [103]:
np.sqrt(mean_squared_error(y_test, results))

72.57967951318832

In [104]:
r2_score(y_test, results)

0.6607424842908591

Lasso

In [52]:
lasso = Lasso()
lasso.fit(X_train, y_train)
results = lasso.predict(X_test)

In [53]:
mean_absolute_error(y_test, results)

41.0085875448097

In [54]:
np.sqrt(mean_squared_error(y_test, results))

65.23219036266454

In [55]:
r2_score(y_test, results)

0.6325769219957794

Ridge

In [56]:
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train, y_train)
results = ridge_classifier.predict(X_test)

In [57]:
mean_absolute_error(y_test, results)

50.884695312022494

In [58]:
np.sqrt(mean_squared_error(y_test, results))

94.5324196733207

In [59]:
r2_score(y_test, results)

0.22837896781251155

ElasticNet

In [60]:
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
results = elastic_net.predict(X_test)

In [61]:
mean_absolute_error(y_test, results)

41.709718782058566

In [62]:
np.sqrt(mean_squared_error(y_test, results))

69.67960988040947

In [63]:
r2_score(y_test, results)

0.5807684786837641