# 4. Treinamento dos Modelos

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
df = pd.read_csv('../dataset/dados_tratados.csv').dropna(subset=['delivery_time'])
df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_time,order_status_approved,order_status_canceled,order_status_created,order_status_delivered,order_status_invoiced,order_status_processing,order_status_shipped,order_status_unavailable
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,-0.428624,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,0.094843,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,-0.32393,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,0.094843,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,-1.056785,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [3]:

#alvo
y = df['delivery_time']


# nao numerico ou irrelevante
colunas_ignoradas = ['order_id', 'customer_id', 'order_status',
                     'order_purchase_timestamp', 'order_approved_at',
                     'order_delivered_carrier_date', 'order_delivered_customer_date',
                     'order_estimated_delivery_date', 'delivery_time']
X = df.drop(columns=colunas_ignoradas)

modelos = {
    'Regressão Linear': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}
resultados = {}

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

# ajuste manual pq a lib eh um lixo........
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

resultados['Regressão Linear'] = {'RMSE': rmse, 'R²': r2}

In [6]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

resultados['Random Forest'] = {'RMSE': rmse, 'R²': r2}

In [7]:
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

resultados['KNN'] = {'RMSE': rmse, 'R²': r2}

In [8]:
# GridSearchCV para Random Forest (ajuste de hiperparâmetros)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None]
}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_

y_pred_best_rf = best_rf_model.predict(X_test)

mse_best = mean_squared_error(y_test, y_pred_best_rf)
rmse_best = np.sqrt(mse)

r2_best = r2_score(y_test, y_pred_best_rf)

resultados['Random Forest (GridSearch)'] = {'RMSE': rmse_best, 'R²': r2_best}

In [9]:
print("RESULTADOS COMPARATIVOS\n")
for modelo, metrica in resultados.items():
    print(f"{modelo}")
    print(f"    RMSE: {metrica['RMSE']:.4f}")
    print(f"    R²: {metrica['R²']:.4f}\n")

RESULTADOS COMPARATIVOS

Regressão Linear
    RMSE: 1.0483
    R²: -0.0009

Random Forest
    RMSE: 1.0482
    R²: -0.0008

KNN
    RMSE: 1.2297
    R²: -0.3773

Random Forest (GridSearch)
    RMSE: 1.2297
    R²: -0.0008

