In [445]:
import pandas as pd
import numpy as np

# Modelagem
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import ElasticNet 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Carregamento dos Dados

In [446]:
path = "../../data/processed/"
v = pd.read_csv(path + "V.csv")
fgr = pd.read_csv(path + "FGR.csv", decimal=",")
paradas = pd.read_csv(path + "paradas.csv")
rota_paradas = pd.read_csv(path + "rota_paradas.csv")
rotas = pd.read_csv(path + "rotas.csv")
dados_bairros = pd.read_csv(path + "dados_bairros.csv")

# Feature Enginerring

Construção de variáveis
- Demanda
- Tamanho da linha
- Horario
- IDH dos bairros
- É domingo?

In [447]:
demanda_p_linha_dia = v.groupby(["linha_numero", "movimento_diario_data_mov"]).count()["passageiro_matricula"].to_frame().reset_index()
demanda_p_linha_dia.rename(columns={"passageiro_matricula": "demanda_do_dia"}, inplace = True)

In [448]:
demanda_p_linha_dia

Unnamed: 0,linha_numero,movimento_diario_data_mov,demanda_do_dia
0,1,2021-08-29,232
1,1,2021-08-31,9096
2,1,2021-09-01,8888
3,1,2021-09-02,8244
4,1,2021-09-03,9020
...,...,...,...
2078,942,2021-09-05,2
2079,991,2021-08-31,76
2080,991,2021-09-01,55
2081,991,2021-09-02,55


In [449]:
# Demanda por intervalo de tempo
v["passageiro_data_hora"] = pd.to_datetime(v["passageiro_data_hora"])
v["passageiro_hora_entrada"] = v["passageiro_data_hora"].dt.hour
# v["passageiro_hora_range"] = pd.cut(v["passageiro_hora_entrada"], bins=[-1, 4, 6, 9, 12, 15, 18, 21, float('Inf')], labels=['00-3h59', '4-5h59', '6-8h59', '9-11h59', '12-14h59', '15-17h59', '18-20h59', '21-23h59'])
v_hora_range = v.groupby(["linha_numero", "movimento_diario_data_mov", "passageiro_hora_entrada"]).count()["passageiro_matricula"].to_frame().reset_index()
v_hora_range.dropna(inplace=True)

In [450]:
v_hora_range

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_entrada,passageiro_matricula
0,1,2021-08-29,6,4
1,1,2021-08-29,7,40
2,1,2021-08-29,8,31
3,1,2021-08-29,9,3
4,1,2021-08-29,10,24
...,...,...,...,...
37064,999,2021-09-05,14,48
37065,999,2021-09-05,15,22
37066,999,2021-09-05,16,59
37067,999,2021-09-05,17,102


In [451]:
# Dias da semana
days_week ={
    0: 'Segunda', 
    1: 'Terça', 
    2: 'Quarta', 
    3: 'Quinta', 
    4: 'Sexta',
    5: 'Sábado', 
    6: 'Domingo'
}

v_hora_range["movimento_diario_data_mov"] = pd.to_datetime(v_hora_range["movimento_diario_data_mov"])

def eh_domingo(dia_semana):
    if dia_semana == "Domingo":
        return 1
    else:
        return 0

v_hora_range["Dia_Semana"] = v_hora_range.movimento_diario_data_mov.dt.weekday
v_hora_range["Dia_Semana"] = v_hora_range.Dia_Semana.map(days_week)
v_hora_range["Domingo"] = v_hora_range["Dia_Semana"].map(eh_domingo)

 Variável de km da rota

In [452]:

linha_p_km_programado = fgr[["fechamento_data", "linha_numero", "linha_km_programado"]].drop_duplicates()
linha_p_km_programado = linha_p_km_programado[linha_p_km_programado.linha_km_programado != 0]
linha_demanda_km = demanda_p_linha_dia.merge(linha_p_km_programado, how="inner", left_on=["linha_numero", "movimento_diario_data_mov"], right_on=["linha_numero","fechamento_data"])
linha_demanda_km.drop(columns=["fechamento_data"], inplace=True)
linha_demanda_km = linha_demanda_km.groupby(["linha_numero"]).mean()[["linha_km_programado"]].reset_index()
dataset_model = v_hora_range.merge(linha_demanda_km, how="inner", on="linha_numero")

Maior IDH de bairro em que a rota passa

In [453]:
rota_paradas_merged = rota_paradas.merge(paradas, how="inner", on="stop_id")
rota_paradas_merged = rota_paradas_merged.merge(rotas, how="inner", on="route_id")
rotas_paradas_clean = rota_paradas_merged[["route_id", "stop_sequence", "district", "city"]]
rotas_paradas_demanda = rotas_paradas_clean.merge(demanda_p_linha_dia, how="inner", left_on="route_id", right_on="linha_numero")
rotas_paradas_demanda.drop(columns=["route_id", "stop_sequence"], inplace=True)
rotas_paradas_demanda = rotas_paradas_demanda[rotas_paradas_demanda["city"] == "Fortaleza"]

In [454]:
!pip install unidecode



In [455]:
# Retira acentos e barras (/ + termo) dos nomes dos bairros
from unidecode import unidecode

district = []
for index, row in rotas_paradas_demanda.iterrows():
  district_without_punt = unidecode(row["district"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
rotas_paradas_demanda["Bairro"] = district

district = []
for index, row in dados_bairros.iterrows():
  district_without_punt = unidecode(row["Bairros"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
dados_bairros["Bairro"] = district

In [456]:
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.lower()
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.replace("ç", "c")
dados_bairros["Bairro"] = dados_bairros["Bairro"].str.lower()
rotas_demanda_bairro = rotas_paradas_demanda.merge(dados_bairros, how="inner", left_on="Bairro", right_on="Bairro")
rotas_idh_max = rotas_demanda_bairro.groupby(["linha_numero"]).mean()[["IDH em 2010[8]"]].reset_index()
dataset_model = dataset_model.merge(rotas_idh_max, how="inner", on="linha_numero")

Convertendo Hora Range e Linha Numero para Variavel Numérica

In [457]:
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["passageiro_hora_entrada"])], axis = 1)
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["linha_numero"])], axis = 1)

Limpeza de colunas

In [458]:
dataset_model.drop(columns=["Dia_Semana", "movimento_diario_data_mov", "linha_numero", "passageiro_hora_entrada"], inplace=True)

Renomeando colunas para facilitar tratativa

In [459]:
dataset_model.rename(columns={"passageiro_hora_range": "hora_range", 
                              "passageiro_matricula": "demanda",
                              "linha_km_programado": "linha_km",
                              "IDH em 2010[8]": "idh_max"}, inplace=True)

Removendo valores nulos

In [460]:
dataset_model['demanda'] = dataset_model['demanda'].replace(0, np.nan)
dataset_model.dropna(inplace=True)

In [461]:
dataset_model.describe()

Unnamed: 0,demanda,Domingo,linha_km,idh_max,0,1,2,3,4,5,...,833,835,836,841,855,901,906,907,913,920
count,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,...,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0,31842.0
mean,91.838547,0.123893,812.73218,0.366907,0.013379,0.001821,0.001696,0.002355,0.032002,0.050845,...,0.002198,0.002198,0.003769,0.004271,0.00446,0.004302,0.004397,0.004177,0.004334,0.004554
std,122.493081,0.329465,699.700124,0.126333,0.114891,0.042641,0.041147,0.048476,0.176007,0.219684,...,0.046836,0.046836,0.061274,0.065215,0.066632,0.065453,0.066163,0.064495,0.065691,0.067329
min,1.0,0.0,62.36,0.068821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,437.43,0.277941,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,49.0,0.0,603.017778,0.347174,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,110.0,0.0,898.02,0.453955,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1182.0,1.0,5895.52,0.7178,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [462]:
dataset_model.corr()

Unnamed: 0,demanda,Domingo,linha_km,idh_max,0,1,2,3,4,5,...,833,835,836,841,855,901,906,907,913,920
demanda,1.000000,-0.169195,0.505521,0.203646,-0.079537,-0.026773,-0.028877,-0.033812,-0.095272,0.029404,...,-0.009682,-0.032340,-0.023564,-0.014885,0.106432,-0.025677,0.052545,-0.018273,-0.012773,0.014350
Domingo,-0.169195,1.000000,-0.008584,-0.027303,-0.002305,0.004056,0.003034,-0.000574,-0.007174,-0.005894,...,-0.017651,-0.017651,-0.001349,0.001682,0.004874,0.001495,0.003825,0.003728,0.002761,0.002882
linha_km,0.505521,-0.008584,1.000000,0.188221,0.006493,-0.030776,-0.040803,-0.029951,0.008936,0.000090,...,-0.028333,-0.040021,-0.050785,-0.037885,0.071766,-0.028988,0.140350,-0.029527,-0.016419,0.033296
idh_max,0.203646,-0.027303,0.188221,1.000000,0.009836,0.006566,0.005117,-0.001372,-0.038413,-0.003656,...,0.083642,0.095894,-0.008232,0.084552,0.067212,0.182583,0.043855,0.139437,0.013225,-0.040525
0,-0.079537,-0.002305,0.006493,0.009836,1.000000,-0.004974,-0.004799,-0.005658,-0.021173,-0.026952,...,-0.005466,-0.005466,-0.007162,0.009140,0.008616,0.013227,-0.007738,-0.007542,-0.007683,0.020544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,-0.025677,0.001495,-0.028988,0.182583,0.013227,-0.002808,-0.002709,-0.003194,-0.011952,0.000075,...,-0.003085,-0.003085,-0.004043,-0.004305,-0.004400,1.000000,-0.004368,-0.004257,-0.004337,-0.004446
906,0.052545,0.003825,0.140350,0.043855,-0.007738,-0.002839,-0.002739,-0.003229,0.006796,-0.000256,...,-0.003119,-0.003119,-0.004087,-0.004352,-0.004448,-0.004368,1.000000,-0.004304,-0.004384,-0.004495
907,-0.018273,0.003728,-0.029527,0.139437,-0.007542,-0.002767,-0.002669,-0.003147,-0.011776,0.000527,...,-0.003040,-0.003040,-0.003983,-0.004242,-0.004335,-0.004257,-0.004304,1.000000,-0.004273,-0.004380
913,-0.012773,0.002761,-0.016419,0.013225,-0.007683,-0.002818,-0.002719,-0.003206,0.001586,-0.000036,...,-0.003097,-0.003097,-0.004058,-0.004321,-0.004416,-0.004337,-0.004384,-0.004273,1.000000,-0.004462


# Pre Processamento

In [463]:
X = dataset_model.drop(columns=["demanda"]).values
Y = dataset_model["demanda"].values

Normalização dos Dados

In [464]:
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import Normalizer

standard = StandardScaler()

X = standard.fit_transform(X)

In [465]:
X

array([[-0.37604927,  0.50363483,  0.78169483, ..., -0.06476413,
        -0.06597547, -0.06763553],
       [-0.37604927,  0.50363483,  0.78169483, ..., -0.06476413,
        -0.06597547, -0.06763553],
       [-0.37604927,  0.50363483,  0.78169483, ..., -0.06476413,
        -0.06597547, -0.06763553],
       ...,
       [-0.37604927,  0.49227979, -0.59916705, ..., -0.06476413,
        -0.06597547, 14.78512766],
       [-0.37604927,  0.49227979, -0.59916705, ..., -0.06476413,
        -0.06597547, 14.78512766],
       [-0.37604927,  0.49227979, -0.59916705, ..., -0.06476413,
        -0.06597547, 14.78512766]])

In [466]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25, random_state=123)

# Modelagem

Linear Regression 

In [467]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
results = linear_regression.predict(X_test)

In [468]:
results

array([ 74.32733462, 422.20233462,  19.54608462, ...,  38.45233462,
       209.39764712,  57.45233462])

In [469]:
mean_absolute_error(y_test, results)

45.99514695727503

In [470]:
np.sqrt(mean_squared_error(y_test, results))

73.32281670023787

In [471]:
r2_score(y_test, results)

0.6522706166129528

Lasso

In [472]:
lasso = Lasso()
lasso.fit(X_train, y_train)
results = lasso.predict(X_test)

In [473]:
mean_absolute_error(y_test, results)

46.09621877285122

In [474]:
np.sqrt(mean_squared_error(y_test, results))

74.65387093722177

In [475]:
r2_score(y_test, results)

0.6395311230349284

Ridge

In [476]:
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train, y_train)
results = ridge_classifier.predict(X_test)

In [477]:
mean_absolute_error(y_test, results)

55.12787338274086

In [478]:
np.sqrt(mean_squared_error(y_test, results))

104.61194704665115

In [479]:
r2_score(y_test, results)

0.2921753531955599

ElasticNet

In [480]:
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
results = elastic_net.predict(X_test)

In [481]:
mean_absolute_error(y_test, results)

47.272592462491666

In [482]:
np.sqrt(mean_squared_error(y_test, results))

80.75118156158769

In [483]:
r2_score(y_test, results)

0.5782443767573351