In [1]:
import pandas as pd
import numpy as np

# Modelagem
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import ElasticNet 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Carregamento dos Dados

In [2]:
path = "../../data/processed/"
v = pd.read_csv(path + "V.csv")
fgr = pd.read_csv(path + "FGR.csv", decimal=",")
paradas = pd.read_csv(path + "paradas.csv")
rota_paradas = pd.read_csv(path + "rota_paradas.csv")
rotas = pd.read_csv(path + "rotas.csv")
dados_bairros = pd.read_csv(path + "dados_bairros.csv")

# Feature Enginerring

Construção de variáveis
- Demanda
- Tamanho da linha
- Horario
- IDH dos bairros
- É domingo?

In [3]:
demanda_p_linha_dia = v.groupby(["linha_numero", "movimento_diario_data_mov"]).count()["passageiro_matricula"].to_frame().reset_index()
demanda_p_linha_dia.rename(columns={"passageiro_matricula": "demanda_do_dia"}, inplace = True)

In [4]:
demanda_p_linha_dia

Unnamed: 0,linha_numero,movimento_diario_data_mov,demanda_do_dia
0,1,2021-08-29,232
1,1,2021-08-31,9096
2,1,2021-09-01,8888
3,1,2021-09-02,8244
4,1,2021-09-03,9020
...,...,...,...
2078,942,2021-09-05,2
2079,991,2021-08-31,76
2080,991,2021-09-01,55
2081,991,2021-09-02,55


In [5]:
# Demanda por intervalo de tempo
v["passageiro_data_hora"] = pd.to_datetime(v["passageiro_data_hora"])
v["passageiro_hora_entrada"] = v["passageiro_data_hora"].dt.hour
v["passageiro_hora_range"] = pd.cut(v["passageiro_hora_entrada"], bins=[-1, 4, 6, 9, 12, 15, 18, 21, float('Inf')], labels=['00-3h59', '4-5h59', '6-8h59', '9-11h59', '12-14h59', '15-17h59', '18-20h59', '21-23h59'])
v_hora_range = v.groupby(["linha_numero", "movimento_diario_data_mov", "passageiro_hora_range"]).count()["passageiro_matricula"].to_frame().reset_index()
v_hora_range.dropna(inplace=True)

In [6]:
v_hora_range

Unnamed: 0,linha_numero,movimento_diario_data_mov,passageiro_hora_range,passageiro_matricula
73,1,2021-08-29,4-5h59,4.0
74,1,2021-08-29,6-8h59,74.0
75,1,2021-08-29,9-11h59,97.0
76,1,2021-08-29,12-14h59,57.0
88,1,2021-08-31,00-3h59,150.0
...,...,...,...,...
47973,991,2021-08-31,15-17h59,75.0
47981,991,2021-09-01,15-17h59,55.0
47989,991,2021-09-02,15-17h59,55.0
48164,999,2021-09-05,12-14h59,82.0


In [7]:
# Dias da semana
days_week ={
    0: 'Segunda', 
    1: 'Terça', 
    2: 'Quarta', 
    3: 'Quinta', 
    4: 'Sexta',
    5: 'Sábado', 
    6: 'Domingo'
}

v_hora_range["movimento_diario_data_mov"] = pd.to_datetime(v_hora_range["movimento_diario_data_mov"])

def eh_domingo(dia_semana):
    if dia_semana == "Domingo":
        return 1
    else:
        return 0

v_hora_range["Dia_Semana"] = v_hora_range.movimento_diario_data_mov.dt.weekday
v_hora_range["Dia_Semana"] = v_hora_range.Dia_Semana.map(days_week)
v_hora_range["Domingo"] = v_hora_range["Dia_Semana"].map(eh_domingo)

 Variável de km da rota

In [8]:

linha_p_km_programado = fgr[["fechamento_data", "linha_numero", "linha_km_programado"]].drop_duplicates()
linha_p_km_programado = linha_p_km_programado[linha_p_km_programado.linha_km_programado != 0]
linha_demanda_km = demanda_p_linha_dia.merge(linha_p_km_programado, how="inner", left_on=["linha_numero", "movimento_diario_data_mov"], right_on=["linha_numero","fechamento_data"])
linha_demanda_km.drop(columns=["fechamento_data"], inplace=True)
linha_demanda_km = linha_demanda_km.groupby(["linha_numero"]).mean()[["linha_km_programado"]].reset_index()
dataset_model = v_hora_range.merge(linha_demanda_km, how="inner", on="linha_numero")

Maior IDH de bairro em que a rota passa

In [9]:
rota_paradas_merged = rota_paradas.merge(paradas, how="inner", on="stop_id")
rota_paradas_merged = rota_paradas_merged.merge(rotas, how="inner", on="route_id")
rotas_paradas_clean = rota_paradas_merged[["route_id", "stop_sequence", "district", "city"]]
rotas_paradas_demanda = rotas_paradas_clean.merge(demanda_p_linha_dia, how="inner", left_on="route_id", right_on="linha_numero")
rotas_paradas_demanda.drop(columns=["route_id", "stop_sequence"], inplace=True)
rotas_paradas_demanda = rotas_paradas_demanda[rotas_paradas_demanda["city"] == "Fortaleza"]

In [10]:
!pip install unidecode



In [11]:
# Retira acentos e barras (/ + termo) dos nomes dos bairros
from unidecode import unidecode

district = []
for index, row in rotas_paradas_demanda.iterrows():
  district_without_punt = unidecode(row["district"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
rotas_paradas_demanda["Bairro"] = district

district = []
for index, row in dados_bairros.iterrows():
  district_without_punt = unidecode(row["Bairros"])
  district_without_barra = district_without_punt.split("/")[0]
  if district_without_barra.startswith("Parque "):
    district_without_barra = district_without_barra.replace("Parque ", "")
  if district_without_barra.startswith("Vila "):
    district_without_barra = district_without_barra.replace("Vila ", "")
  if district_without_barra.startswith("Prefeito "):
    district_without_barra = district_without_barra.replace("Prefeito ", "")
  district.append(district_without_barra.strip())
dados_bairros["Bairro"] = district

In [12]:
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.lower()
rotas_paradas_demanda["Bairro"] = rotas_paradas_demanda["Bairro"].str.replace("ç", "c")
dados_bairros["Bairro"] = dados_bairros["Bairro"].str.lower()
rotas_demanda_bairro = rotas_paradas_demanda.merge(dados_bairros, how="inner", left_on="Bairro", right_on="Bairro")
rotas_idh_max = rotas_demanda_bairro.groupby(["linha_numero"]).max()[["IDH em 2010[8]"]].reset_index()
dataset_model = dataset_model.merge(rotas_idh_max, how="inner", on="linha_numero")

Convertendo Hora Range e Linha Numero para Variavel Numérica

In [13]:
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["passageiro_hora_range"])], axis = 1)
dataset_model = pd.concat([dataset_model, pd.get_dummies(dataset_model["linha_numero"])], axis = 1)

Limpeza de colunas

In [14]:
dataset_model.drop(columns=["Dia_Semana", "movimento_diario_data_mov", "linha_numero", "passageiro_hora_range"], inplace=True)

Renomeando colunas para facilitar tratativa

In [39]:
dataset_model.rename(columns={"passageiro_hora_range": "hora_range", 
                              "passageiro_matricula": "demanda",
                              "linha_km_programado": "linha_km",
                              "IDH em 2010[8]": "idh_max"}, inplace=True)

Removendo valores nulos

In [16]:
dataset_model['demanda'] = dataset_model['demanda'].replace(0, np.nan)
dataset_model.dropna(inplace=True)

In [40]:
dataset_model

Unnamed: 0,demanda,Domingo,linha_km,idh_max,00-3h59,4-5h59,6-8h59,9-11h59,12-14h59,15-17h59,...,833,835,836,841,855,901,906,907,913,920
0,319.0,0,1165.120,0.762,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,398.0,0,1165.120,0.762,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,557.0,0,1165.120,0.762,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,979.0,0,1165.120,0.762,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,76.0,0,1165.120,0.762,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12465,282.0,0,1157.175,0.530,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
12466,352.0,0,1157.175,0.530,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
12467,614.0,0,1157.175,0.530,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
12468,185.0,0,1157.175,0.530,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Pre Processamento

In [17]:
X = dataset_model.drop(columns=["demanda"]).values
Y = dataset_model["demanda"].values

Normalização dos Dados

In [18]:
from sklearn.preprocessing import StandardScaler
standard = StandardScaler()

X = standard.fit_transform(X)

In [19]:
X

array([[-0.37452726,  0.50449666,  1.0069873 , ..., -0.06280869,
        -0.06594867, -0.06716425],
       [-0.37452726,  0.50449666,  1.0069873 , ..., -0.06280869,
        -0.06594867, -0.06716425],
       [-0.37452726,  0.50449666,  1.0069873 , ..., -0.06280869,
        -0.06594867, -0.06716425],
       ...,
       [-0.37452726,  0.49316187, -0.15392117, ..., -0.06280869,
        -0.06594867, 14.88887408],
       [-0.37452726,  0.49316187, -0.15392117, ..., -0.06280869,
        -0.06594867, 14.88887408],
       [-0.37452726,  0.49316187, -0.15392117, ..., -0.06280869,
        -0.06594867, 14.88887408]])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25, random_state=123)

# Modelagem

Linear Regression 

In [21]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
results = linear_regression.predict(X_test)

In [22]:
results

array([ 390.79321447, -254.18334803,   28.76587072, ...,   15.42993322,
         98.34008947,  286.45727697])

In [23]:
mean_absolute_error(y_test, results)

124.4057992931102

In [24]:
np.sqrt(mean_squared_error(y_test, results))

191.6927198648643

In [25]:
r2_score(y_test, results)

0.625795885012751

Lasso

In [26]:
lasso = Lasso()
lasso.fit(X_train, y_train)
results = lasso.predict(X_test)

In [27]:
mean_absolute_error(y_test, results)

123.35784751326581

In [28]:
np.sqrt(mean_squared_error(y_test, results))

191.41671716113615

In [29]:
r2_score(y_test, results)

0.626872681193259

Ridge

In [30]:
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train, y_train)
results = ridge_classifier.predict(X_test)

In [31]:
mean_absolute_error(y_test, results)

163.1555484284798

In [32]:
np.sqrt(mean_squared_error(y_test, results))

306.92846884963024

In [33]:
r2_score(y_test, results)

0.04066195976880327

ElasticNet

In [34]:
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
results = elastic_net.predict(X_test)

In [35]:
mean_absolute_error(y_test, results)

120.38080637700563

In [36]:
np.sqrt(mean_squared_error(y_test, results))

201.54985301290134

In [37]:
r2_score(y_test, results)

0.5863221272336225