<a href="https://colab.research.google.com/github/joao-fcosta/API_IA/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd

from google.colab import drive

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [12]:
# =============================
# 2. Carregar Dataset
# =============================
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/dataset/API_IA/FuelConsumptionCo2.csv')
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [13]:
# 2) SELECIONAR FEATURES E ALVO
features = ["ENGINESIZE", "CYLINDERS", "VEHICLECLASS", "TRANSMISSION", "FUELTYPE"]
target = "FUELCONSUMPTION_COMB"

data = df[features + [target]].dropna()

X = data[features]
y = data[target]

X.head(), y.head()


(   ENGINESIZE  CYLINDERS VEHICLECLASS TRANSMISSION FUELTYPE
 0         2.0          4      COMPACT          AS5        Z
 1         2.4          4      COMPACT           M6        Z
 2         1.5          4      COMPACT          AV7        Z
 3         3.5          6  SUV - SMALL          AS6        Z
 4         3.5          6  SUV - SMALL          AS6        Z,
 0     8.5
 1     9.6
 2     5.9
 3    11.1
 4    10.6
 Name: FUELCONSUMPTION_COMB, dtype: float64)

In [14]:
# 3) TREINO / TESTE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [15]:
# 4) PRÉ-PROCESSAMENTO

numeric_features = ["ENGINESIZE", "CYLINDERS"]
categorical_features = ["VEHICLECLASS", "TRANSMISSION", "FUELTYPE"]

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)



In [16]:
# 5) DEFINIR MODELOS

ridge_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", Ridge())
])

rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

gb_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", GradientBoostingRegressor(random_state=42))
])


In [17]:
# 4) GRIDS DE HIPERPARÂMETROS (VERSÃO ENXUTA)

param_grid_ridge = {
    "model__alpha": [0.1, 1.0, 10.0]
}

param_grid_rf = {
    "model__n_estimators": [300],        # fixo
    "model__max_depth": [None, 10],      # 2 opções
    "model__min_samples_split": [2, 5]   # 2 opções
}

param_grid_gb = {
    "model__n_estimators": [200],        # fixo
    "model__learning_rate": [0.05, 0.1], # 2 opções
    "model__max_depth": [2, 3]           # 2 opções
}


''' param_grid_ridge = {
    "model__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]
}

param_grid_rf = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [None, 8, 12],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"]
}

param_grid_gb = {
    "model__n_estimators": [100, 300, 500],
    "model__learning_rate": [0.03, 0.05, 0.1],
    "model__max_depth": [2, 3, 4],
    "model__subsample": [0.8, 1.0],
    "model__min_samples_leaf": [1, 3, 5]
} '''


' param_grid_ridge = {\n    "model__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]\n}\n\nparam_grid_rf = {\n    "model__n_estimators": [200, 400, 600],\n    "model__max_depth": [None, 8, 12],\n    "model__min_samples_split": [2, 5, 10],\n    "model__min_samples_leaf": [1, 2, 4],\n    "model__max_features": ["sqrt", "log2"]\n}\n\nparam_grid_gb = {\n    "model__n_estimators": [100, 300, 500],\n    "model__learning_rate": [0.03, 0.05, 0.1],\n    "model__max_depth": [2, 3, 4],\n    "model__subsample": [0.8, 1.0],\n    "model__min_samples_leaf": [1, 3, 5]\n} '

In [18]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# RIDGE
grid_ridge = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

# RANDOM FOREST
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

# GRADIENT BOOSTING
grid_gb = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=param_grid_gb,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

grid_ridge.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
grid_gb.fit(X_train, y_train)

print("Ridge - melhor alpha:", grid_ridge.best_params_, "RMSE (CV):", -grid_ridge.best_score_)
print("RF    - best params:",  grid_rf.best_params_,    "RMSE (CV):", -grid_rf.best_score_)
print("GB    - best params:",  grid_gb.best_params_,    "RMSE (CV):", -grid_gb.best_score_)


Ridge - melhor alpha: {'model__alpha': 1.0} RMSE (CV): 1.0205364358797213
RF    - best params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 300} RMSE (CV): 0.8990953360910758
GB    - best params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200} RMSE (CV): 0.9104372310340736


In [19]:
best_ridge = grid_ridge.best_estimator_
best_rf    = grid_rf.best_estimator_
best_gb    = grid_gb.best_estimator_

for nome, modelo in [("Ridge", best_ridge),
                     ("RandomForest", best_rf),
                     ("GradientBoosting", best_gb)]:
    y_pred = modelo.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2   = r2_score(y_test, y_pred)
    print(f"{nome} -> RMSE teste: {rmse:.3f} L/100km | R²: {r2:.3f}")


Ridge -> RMSE teste: 0.952 L/100km | R²: 0.927
RandomForest -> RMSE teste: 0.802 L/100km | R²: 0.949
GradientBoosting -> RMSE teste: 0.878 L/100km | R²: 0.938


In [20]:
# 10) ENSEMBLE COM VOTINGREGRESSOR

voting_reg = VotingRegressor(
    estimators=[
        ("ridge", best_ridge),
        ("rf",    best_rf),
        ("gb",    best_gb)
    ]
)

# Cross-val com RMSE
scores = cross_val_score(
    voting_reg,
    X, y,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

print("VotingRegressor - RMSE médio (CV):", -np.mean(scores))
print("VotingRegressor - RMSE desvio padrão:", np.std(-scores))

# Avaliação no teste
voting_reg.fit(X_train, y_train)
y_pred_voting = voting_reg.predict(X_test)
rmse_voting = np.sqrt(mean_squared_error(y_test, y_pred_voting))
r2_voting   = r2_score(y_test, y_pred_voting)

print("VotingRegressor - RMSE teste:", rmse_voting)
print("VotingRegressor - R² teste:", r2_voting)


VotingRegressor - RMSE médio (CV): 0.8356601634654612
VotingRegressor - RMSE desvio padrão: 0.017326715108260712
VotingRegressor - RMSE teste: 0.8030480208913442
VotingRegressor - R² teste: 0.9483768439279923


In [21]:
import joblib

joblib.dump(voting_reg, "modelo_combustivel.joblib")
print("Modelo salvo em modelo_combustivel.joblib")


Modelo salvo em modelo_combustivel.joblib
