# Gradient Boosting

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Load the dataset from CSV into a DataFrame.
df = pd.read_csv("sin_outliers_rounded.csv")
# Total number of rows in the DataFrame.
n = df.shape[0]

In [None]:
# Split the dataset into 80% training rows and 20% test rows.
# NOTE: train_test_split is seeded through random_state; the global NumPy
# seed is kept only so the notebook's global RNG state stays as before.
np.random.seed(622)
df_train, df_test = train_test_split(df, train_size=0.8, random_state=622)

# Work on copies so the raw splits stay untouched.
fct_df_train = df_train.copy()
fct_df_test = df_test.copy()

# Convert text columns to the 'category' dtype.
for col in fct_df_train.select_dtypes(include="object").columns:
    fct_df_train[col] = fct_df_train[col].astype("category")
    # Reuse the *training* categories for the test split. If each split
    # infers its own categories, get_dummies(drop_first=True) can drop a
    # different baseline level in test than in train (e.g. when the first
    # training level is absent from the test rows), and the reindex below
    # would then silently encode those rows as the baseline level.
    # Levels that only appear in test become NaN and get all-zero dummies,
    # which matches what the reindex produced for them before.
    fct_df_test[col] = fct_df_test[col].astype(fct_df_train[col].dtype)

# Independent variables (X) and dependent variable (y).
features = ["Variedad", "Altura", "Cantidad (L)", "Temperatura", "pH", "Puntaje en taza"]
target = "Tiempo de fermentación"

# One-hot encode the categorical features; the first level of each one is
# dropped and acts as the baseline.
X_train = pd.get_dummies(fct_df_train[features], drop_first=True)
X_test = pd.get_dummies(fct_df_test[features], drop_first=True)
y_train = fct_df_train[target]
y_test = fct_df_test[target]

# Safety net: force the test matrix to have exactly the training columns,
# in the same order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Reproducibility.
np.random.seed(123)

# Train and test rows stacked together (train first), as the original note
# says was done in the R version. These objects are kept for backward
# compatibility with any later cell, but they are NOT used for fitting:
# GradientBoostingRegressor.fit() has no train-fraction argument, so fitting
# on the stacked data would train on the test rows as well.
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

# Share of rows that belong to the training split.
train_fraction = len(X_train) / len(X_full)

# Gradient Boosting model with 10,000 trees.
gbm_model = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=4,
    min_samples_leaf=20,
    subsample=0.5,
    random_state=123,
)

# Fit on the training rows only, so the test metrics below are computed on
# data the model has never seen (fitting on X_full leaked the test set).
gbm_model.fit(X_train, y_train)

# Feature importances, most important first.
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": gbm_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(feature_importance)

# Test MSE after each boosting stage. A single pass over staged_predict
# keeps only the best prediction instead of materialising all 10,000
# prediction arrays a second time.
# NOTE: the number of trees is chosen on the test set, so the metrics
# reported below are still somewhat optimistic.
errors = []
best_iter = 0
lowest_mse = np.inf
gbm_prediction = None
for n_trees, y_pred in enumerate(gbm_model.staged_predict(X_test), start=1):
    stage_mse = mean_squared_error(y_test, y_pred)
    errors.append(stage_mse)
    # Strict '<' keeps the first minimum, like np.argmin would.
    if stage_mse < lowest_mse:
        lowest_mse = stage_mse
        best_iter = n_trees
        gbm_prediction = y_pred.copy()

print(f"Best iteration based on test data: {best_iter}")

# RMSE of the predictions made with the best number of trees.
rmse_gbm = np.sqrt(mean_squared_error(y_test, gbm_prediction))

# R² computed as the squared Pearson correlation between actual and
# predicted values (not sklearn's r2_score).
rsq_gbm = np.corrcoef(y_test, gbm_prediction)[0, 1] ** 2

print(f"The GBM model is on average {round(rmse_gbm, 2)} total points off when predicting new values. \nIts R squared value is {round(rsq_gbm, 2)}.")

# Plot actual vs predicted values; the dashed red line is the identity line.
plt.figure(figsize=(8,6))
plt.scatter(y_test, gbm_prediction, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linestyle="dashed")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Gradient Boosted Tree Model: Actual vs Predicted Values")
plt.show()

## Tiempo ≤ 24 h

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Response column and predictor columns used in this section.
target = "Tiempo de fermentación"
features = ["Variedad", "Altura", "Cantidad (L)", "Temperatura", "pH", "Puntaje en taza"]

# Reload the dataset and keep only rows whose target value is at most 24.
df = pd.read_csv("sin_outliers_rounded.csv")
df = df.loc[df[target] <= 24]
# Total number of rows left after filtering.
n = df.shape[0]

# 80% training / 20% test split.
np.random.seed(4)
df_train, df_test = train_test_split(df, random_state=4, train_size=0.8)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Work on copies so the raw splits stay untouched.
fct_df_train = df_train.copy()
fct_df_test = df_test.copy()

# Convert text columns to the 'category' dtype.
for col in fct_df_train.select_dtypes(include="object").columns:
    fct_df_train[col] = fct_df_train[col].astype("category")
    # Reuse the *training* categories for the test split. If each split
    # infers its own categories, get_dummies(drop_first=True) can drop a
    # different baseline level in test than in train (e.g. when the first
    # training level is absent from the test rows), and the reindex below
    # would then silently encode those rows as the baseline level.
    # Levels that only appear in test become NaN and get all-zero dummies,
    # which matches what the reindex produced for them before.
    fct_df_test[col] = fct_df_test[col].astype(fct_df_train[col].dtype)

# Independent variables (X) and dependent variable (y).
features = ["Variedad", "Altura", "Cantidad (L)", "Temperatura", "pH", "Puntaje en taza"]
target = "Tiempo de fermentación"

# One-hot encode the categorical features; the first level of each one is
# dropped and acts as the baseline.
X_train = pd.get_dummies(fct_df_train[features], drop_first=True)
X_test = pd.get_dummies(fct_df_test[features], drop_first=True)
y_train = fct_df_train[target]
y_test = fct_df_test[target]

# Safety net: force the test matrix to have exactly the training columns,
# in the same order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Reproducibility.
np.random.seed(28)

# Train and test rows stacked together (train first), as the original note
# says was done in the R version. These objects are kept for backward
# compatibility with any later cell, but they are NOT used for fitting:
# GradientBoostingRegressor.fit() has no train-fraction argument, so fitting
# on the stacked data would train on the test rows as well.
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

# Share of rows that belong to the training split.
train_fraction = len(X_train) / len(X_full)

# Gradient Boosting model with 10,000 trees.
gbm_model = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=4,
    min_samples_leaf=20,
    subsample=0.5,
    random_state=28,
)

# Fit on the training rows only, so the test metrics below are computed on
# data the model has never seen (fitting on X_full leaked the test set).
gbm_model.fit(X_train, y_train)

# Feature importances, most important first.
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": gbm_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(feature_importance)

# Test MSE after each boosting stage. A single pass over staged_predict
# keeps only the best prediction instead of materialising all 10,000
# prediction arrays a second time.
# NOTE: the number of trees is chosen on the test set, so the metrics
# reported below are still somewhat optimistic.
errors = []
best_iter = 0
lowest_mse = np.inf
gbm_prediction = None
for n_trees, y_pred in enumerate(gbm_model.staged_predict(X_test), start=1):
    stage_mse = mean_squared_error(y_test, y_pred)
    errors.append(stage_mse)
    # Strict '<' keeps the first minimum, like np.argmin would.
    if stage_mse < lowest_mse:
        lowest_mse = stage_mse
        best_iter = n_trees
        gbm_prediction = y_pred.copy()

print(f"Best iteration based on test data: {best_iter}")

# RMSE of the predictions made with the best number of trees.
rmse_gbm = np.sqrt(mean_squared_error(y_test, gbm_prediction))

# R² computed as the squared Pearson correlation between actual and
# predicted values (not sklearn's r2_score).
rsq_gbm = np.corrcoef(y_test, gbm_prediction)[0, 1] ** 2

print(f"The GBM model is on average {round(rmse_gbm, 2)} total points off when predicting new values. \nIts R squared value is {round(rsq_gbm, 2)}.")

# Plot actual vs predicted values; the dashed red line is the identity line.
plt.figure(figsize=(8,6))
plt.scatter(y_test, gbm_prediction, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linestyle="dashed")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Gradient Boosted Tree Model: Actual vs Predicted Values")
plt.show()

### Guardar modelo ONNX

In [None]:
import onnx
import pickle
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Persist the training column names (in order) so they can be reused when
# building inputs at prediction time.
feature_columns = X_train.columns.tolist()
with open("columnas_Xx.pkl", "wb") as pkl_file:
    pickle.dump(feature_columns, pkl_file)

# Declare the model input: a float tensor with a free first dimension and
# one column per training feature.
n_features = X_train.shape[1]
initial_type = [("float_input", FloatTensorType([None, n_features]))]

# Convert the fitted scikit-learn model to ONNX and write it to disk.
onnx_model = convert_sklearn(
    gbm_model,
    initial_types=initial_type,
    target_opset=8,
)
onnx.save_model(onnx_model, "modelo_TimeFx.onnx")

# Report what was written.
print("✅ Modelo exportado a ONNX correctamente.")
print("Archivo: modelo_TimeFx.onnx")
print(f"Columnas guardadas en columnas_Xx.pkl ({len(feature_columns)} features)")