In [9]:
!pip install ace_tools



In [88]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

file_path = 'sales.csv'  # Update this path if necessary
sampled_data = pd.read_csv(file_path)

# Limitar el dataset a las primeras 1000 filas para reducir la carga
# sampled_data = sales_data.head(1000)  # Usar las primeras 1000 filas

sampled_data = sampled_data[sampled_data["open"] != 0]

# Encontrar los índices donde 'open' es 0
indices_a_eliminar = sampled_data[sampled_data["open"] == 0].index

print(sampled_data.shape)

# Usar drop para eliminar las filas con esos índices
# sampled_data = sampled_data.drop(indices_a_eliminar)

print(sampled_data.shape)

# Preparar el dataset
sales_data_cleaned = sampled_data.drop(columns=['Unnamed: 0', 'date', 'store_ID'])

# Dividir características (X) y objetivo (y)
# Filtrar las filas donde la columna 'open' no sea 0

X = sales_data_cleaned.drop(columns=['sales'])
y = sales_data_cleaned['sales']

# Identificar columnas categóricas y numéricas
categorical_cols = ['state_holiday']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Modificar el preprocesador para ignorar categorías desconocidas
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# print(preprocessor)

# Configurar los modelos a probar
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42, max_depth=10),
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=50, max_depth=10),
    'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=50, max_depth=5)
}

# Configurar KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Validación cruzada para cada modelo
results = []
for model_name, model in models.items():
    # Crear el pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocesamiento
        ('model', model)                # Modelo actual
    ])
    
    # Variables para guardar métricas
    r2_scores, mae_scores, mse_scores, rmse_scores = [], [], [], []
    
    # KFold manual
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Entrenar el modelo
        pipeline.fit(X_train, y_train)
        
        # Hacer predicciones
        y_pred = pipeline.predict(X_test)
        
        # Calcular métricas
        r2_scores.append(r2_score(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        mse_scores.append(mean_squared_error(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    
    # Guardar resultados promediados
    results.append({
        'Model': model_name,
        'R² Mean': np.mean(r2_scores),
        'R² Std': np.std(r2_scores),
        'MAE Mean': np.mean(mae_scores),
        'MAE Std': np.std(mae_scores),
        'MSE Mean': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'RMSE Mean': np.mean(rmse_scores),
        'RMSE Std': np.std(rmse_scores)
    })

# Convertir resultados a un DataFrame
results_df = pd.DataFrame(results)

# Mostrar resultados en consola
print(results_df)

# Mostrar resultados

# Identificar el mejor modelo basado en R² promedio
best_model = results_df.sort_values(by='R² Mean', ascending=False).iloc[0]
print(f"\nEl mejor modelo es: {best_model['Model']} con un R² Mean de {best_model['R² Mean']:.4f}.")

(532016, 10)
(532016, 10)
              Model   R² Mean    R² Std     MAE Mean   MAE Std      MSE Mean  \
0  LinearRegression  0.729285  0.000754  1154.467371  0.380131  2.610379e+06   
1      DecisionTree  0.766965  0.000765  1079.736940  0.920470  2.247050e+06   
2      RandomForest  0.769846  0.000931  1075.785378  0.764975  2.219272e+06   
3  GradientBoosting  0.769875  0.000875  1076.410304  0.870689  2.218996e+06   

        MSE Std    RMSE Mean  RMSE Std  
0  11216.224964  1615.662854  3.473628  
1  11203.424080  1499.011652  3.736178  
2  12793.182835  1489.716007  4.294148  
3  12236.448037  1489.623735  4.108620  

El mejor modelo es: GradientBoosting con un R² Mean de 0.7699.


In [31]:
print(X_train.shape)
print(X_test.shape)

(427227, 6)
(213613, 6)


In [107]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

print('Running ....')

# Leer el archivo de datos
file_path = 'sales.csv'  # Cambia esta ruta si es necesario
sampled_data = pd.read_csv(file_path)

# Convertir 'date' a formato de fecha y crear columnas 'year' y 'month'
sampled_data['date'] = pd.to_datetime(sampled_data['date'])  # Asegurar que la columna es de tipo datetime
sampled_data['year'] = sampled_data['date'].dt.year
sampled_data['month'] = sampled_data['date'].dt.month

# Eliminar la columna original 'date'
sampled_data = sampled_data.drop(columns=['date'])

# Limpiar el dataset eliminando filas donde 'open' sea 0
sampled_data = sampled_data[sampled_data["open"] != 0]

# Preparar el dataset
sales_data_cleaned = sampled_data.drop(columns=['Unnamed: 0', 'store_ID'])

# Dividir características (X) y objetivo (y)
X = sales_data_cleaned.drop(columns=['sales'])
y = sales_data_cleaned['sales']

print(X.head())
print(X.shape)

# Identificar columnas categóricas y numéricas
categorical_cols = ['state_holiday']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Modificar el preprocesador para ignorar categorías desconocidas
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# Configurar los modelos a probar
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42, max_depth=10),
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=50, max_depth=10),
    'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=50, max_depth=5)
}

# Dividir los datos en entrenamiento y prueba (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluar los modelos
results = []
for model_name, model in models.items():
    print(f'Testing ... {model_name}')
    
    # Crear el pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocesamiento
        ('model', model)                # Modelo actual
    ])
    
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = pipeline.predict(X_test)
    
    # Calcular métricas
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Guardar resultados
    results.append({
        'Model': model_name,
        'R²': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })

# Convertir resultados a un DataFrame
results_df = pd.DataFrame(results)

# Mostrar resultados en consola
print(results_df)

# Identificar el mejor modelo basado en R²
best_model = results_df.sort_values(by='R²', ascending=False).iloc[0]
print(f"\nEl mejor modelo es: {best_model['Model']} con un R² de {best_model['R²']:.4f}.")

Running ....
   day_of_week  nb_customers_on_day  open  promotion state_holiday  \
0            4                  517     1          0             0   
1            6                  694     1          0             0   
2            4                  970     1          1             0   
3            2                  473     1          1             0   
4            4                 1068     1          1             0   

   school_holiday  year  month  
0               0  2013      4  
1               0  2015      4  
2               0  2013      8  
3               0  2013      5  
4               0  2013     10  
(532016, 8)
              Model        R²          MAE           MSE         RMSE
0  LinearRegression  0.732382  1146.341032  2.572481e+06  1603.895469
1      DecisionTree  0.776483  1059.169180  2.148564e+06  1465.797954
2      RandomForest  0.780925  1051.481429  2.105860e+06  1451.158177
3  GradientBoosting  0.781277  1050.939280  2.102481e+06  1449.993606

El me

In [95]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Leer el archivo de datos
file_path = 'sales.csv'  # Cambia esta ruta si es necesario
sampled_data = pd.read_csv(file_path)

# Limpiar el dataset eliminando filas donde 'open' sea 0
sampled_data = sampled_data[sampled_data["open"] != 0]

# Preparar el dataset
sales_data_cleaned = sampled_data.drop(columns=['Unnamed: 0', 'date', 'store_ID'])

# Dividir características (X) y objetivo (y)
X = sales_data_cleaned.drop(columns=['sales'])
y = sales_data_cleaned['sales']

print(X.head())
print(X.shape)

# Identificar columnas categóricas y numéricas
categorical_cols = ['state_holiday']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Modificar el preprocesador para ignorar categorías desconocidas
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# Configurar los modelos a probar
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42, max_depth=10),
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=50, max_depth=10),
    'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=50, max_depth=5)
}

# Dividir los datos en entrenamiento y prueba (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluar los modelos
results = []
for model_name, model in models.items():
    # Crear el pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocesamiento
        ('model', model)                # Modelo actual
    ])
    
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = pipeline.predict(X_test)
    
    # Calcular métricas
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Guardar resultados
    results.append({
        'Model': model_name,
        'R²': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })

# Convertir resultados a un DataFrame
results_df = pd.DataFrame(results)

# Mostrar resultados en consola
print(results_df)

# Identificar el mejor modelo basado en R²
best_model = results_df.sort_values(by='R²', ascending=False).iloc[0]
print(f"\nEl mejor modelo es: {best_model['Model']} con un R² de {best_model['R²']:.4f}.")

   day_of_week  nb_customers_on_day  open  promotion state_holiday  \
0            4                  517     1          0             0   
1            6                  694     1          0             0   
2            4                  970     1          1             0   
3            2                  473     1          1             0   
4            4                 1068     1          1             0   

   school_holiday  
0               0  
1               0  
2               0  
3               0  
4               0  
(532016, 6)
              Model        R²          MAE           MSE         RMSE
0  LinearRegression  0.729652  1151.563125  2.598726e+06  1612.056417
1      DecisionTree  0.768209  1075.829344  2.228092e+06  1492.679600
2      RandomForest  0.770527  1071.526267  2.205810e+06  1485.196829
3  GradientBoosting  0.770158  1072.531455  2.209363e+06  1486.392612

El mejor modelo es: RandomForest con un R² de 0.7705.


In [133]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Cargar el dataset
file_path = 'sales.csv'  # Actualiza esta ruta si es necesario
sampled_data = pd.read_csv(file_path)

# Convertir 'date' a formato de fecha y crear columnas 'year' y 'month'
sampled_data['date'] = pd.to_datetime(sampled_data['date'])  # Asegurar que la columna es de tipo datetime
sampled_data['year'] = sampled_data['date'].dt.year
sampled_data['month'] = sampled_data['date'].dt.month

# Eliminar la columna original 'date'
sampled_data = sampled_data.drop(columns=['date'])

# Limpiar el dataset eliminando filas donde 'open' sea 0
sampled_data = sampled_data[sampled_data["open"] != 0]

# Limpiar y preparar el dataset
sales_data_cleaned = sampled_data.drop(columns=['Unnamed: 0',  'store_ID'])

# Dividir características (X) y objetivo (y)
X = sales_data_cleaned.drop(columns=['sales'])
y = sales_data_cleaned['sales']

# Identificar columnas categóricas y numéricas
categorical_cols = ['state_holiday']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocesador de datos
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# Configurar Gradient Boosting como único modelo
model = GradientBoostingRegressor(random_state=42, n_estimators=50, max_depth=5)

# Crear el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocesamiento
    ('model', model)                # Modelo
])

# Configurar KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Variables para guardar métricas
r2_scores, mae_scores, mse_scores, rmse_scores = [], [], [], []

# Validación cruzada con KFold
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)
    
    # Hacer predicciones
    y_pred = pipeline.predict(X_test)
    
    # Calcular métricas
    r2_scores.append(r2_score(y_test, y_pred))
    mae_scores.append(mean_absolute_error(y_test, y_pred))
    mse_scores.append(mean_squared_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

# Promediar métricas
results = {
    'R² Mean': np.mean(r2_scores),
    'R² Std': np.std(r2_scores),
    'MAE Mean': np.mean(mae_scores),
    'MAE Std': np.std(mae_scores),
    'MSE Mean': np.mean(mse_scores),
    'MSE Std': np.std(mse_scores),
    'RMSE Mean': np.mean(rmse_scores),
    'RMSE Std': np.std(rmse_scores)
}

# Mostrar resultados
results_df = pd.DataFrame([results])
print(results_df)

# Resultados finales
print(f"\nGradient Boosting Performance:")
print(f"R² Mean: {results['R² Mean']:.4f}")
print(f"MAE Mean: {results['MAE Mean']:.2f}")
print(f"MSE Mean: {results['MSE Mean']:.2f}")
print(f"RMSE Mean: {results['RMSE Mean']:.2f}")

    R² Mean    R² Std     MAE Mean   MAE Std      MSE Mean      MSE Std  \
0  0.780741  0.000712  1054.776943  1.070305  2.114220e+06  10446.51399   

     RMSE Mean  RMSE Std  
0  1454.031255  3.591247  

Gradient Boosting Performance:
R² Mean: 0.7807
MAE Mean: 1054.78
MSE Mean: 2114219.79
RMSE Mean: 1454.03


# FIRST FEATURE ENGINEERING AND SCALING

In [212]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
file_path = 'sales.csv'  # Change this path if necessary
data = pd.read_csv(file_path)

# Ensure the 'date' column is of datetime type
data['date'] = pd.to_datetime(data['date'])

# Create 'year' and 'month' columns based on 'date'
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month

# Drop the 'date' column
data = data.drop(columns=['date'])

# Apply OneHotEncoder to the 'state_holiday' column
encoder = OneHotEncoder(drop='first', sparse_output=False)  # Using drop='first' to avoid multicollinearity
encoded_state_holiday = encoder.fit_transform(data[['state_holiday']])

# Convert the encoded values into a DataFrame
encoded_columns = [f"state_holiday_{category}" for category in encoder.categories_[0][1:]]  # Skip the first category
encoded_df = pd.DataFrame(encoded_state_holiday, columns=encoded_columns)

# Concatenate the original DataFrame with the encoded columns
data = pd.concat([data.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Drop the original 'state_holiday' column
data.drop(columns=['state_holiday'],inplace=True)

# Verify the results
print(data.head())
print(f"\nNew columns: {encoded_columns}")

   Unnamed: 0  store_ID  day_of_week  nb_customers_on_day  open  promotion  \
0      425390       366            4                  517     1          0   
1      291687       394            6                  694     1          0   
2      411278       807            4                  970     1          1   
3      664714       802            2                  473     1          1   
4      540835       726            4                 1068     1          1   

   school_holiday  sales  year  month  state_holiday_a  state_holiday_b  \
0               0   4422  2013      4              0.0              0.0   
1               0   8297  2015      4              0.0              0.0   
2               0   9729  2013      8              0.0              0.0   
3               0   6513  2013      5              0.0              0.0   
4               0  10882  2013     10              0.0              0.0   

   state_holiday_c  
0              0.0  
1              0.0  
2              0.

# PIPELINE MODEL TRAINING, TEST, EVALUATION AND SELECTION

In [191]:
# Split the features (X) and the target variable (y)
X = data.drop(columns=['sales'])
y = data['sales']

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models to test
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=50),
    'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=50)
}

# Evaluate the models
results = []
for model_name, model in models.items():
    print(f'Testing ... {model_name} ...', end='')
    # Create the pipeline with scaling and model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Data scaling
        ('model', model)               # Current model
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # Save the model's results
    results.append({
        'Model': model_name,
        'R2': r2,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    })
    print('Finished!')

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display results sorted by R²
results_df = results_df.sort_values(by='R2', ascending=False)
print("\nResults:")
print(results_df)

# Identify the best model
best_model = results_df.iloc[0]
print(f"\nThe best model is: {best_model['Model']} with an R² of {best_model['R2']:.4f}.")

Testing ... LinearRegression ...Finished!
Testing ... DecisionTree ...Finished!
Testing ... RandomForest ...Finished!
Testing ... GradientBoosting ...Finished!

Results:
              Model        R2         MAE           MSE         RMSE
2      RandomForest  0.948563  537.109432  7.603213e+05   871.964071
1      DecisionTree  0.906621  672.005235  1.380300e+06  1174.861872
3  GradientBoosting  0.879426  901.077338  1.782280e+06  1335.020667
0  LinearRegression  0.852317  986.067394  2.183004e+06  1477.499245

The best model is: RandomForest with an R² of 0.9486.


# RandomForestRegressor - FIRST TRAINNING

In [236]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

data_cleaned = data.copy()

# Preprocess the dataset: drop irrelevant columns
data_cleaned = data_cleaned.drop(columns=['Unnamed: 0', 'store_ID'])

# Split features (X) and target (y)
X = data_cleaned.drop(columns=['sales'])
y = data_cleaned['sales']

# print(X.head(3))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,  # Number of trees
    max_depth=10,      # Maximum tree depth
    max_features='sqrt', # Maximum number of features per tree
    min_samples_split=2, # Minimum number of samples required to split a node
    min_samples_leaf=1   # Minimum number of samples in a leaf
)

print('\nStarting the RandomForestRegressor trainning')

model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print performance metrics
print("\nTest Set Results:")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")


Starting the RandomForestRegressor trainning

Test Set Results:
R²: 0.8737
MAE: 902.17
MSE: 1867125.23
RMSE: 1366.43


# RandomForestRegressor - FIRST OPTIMIZATION ATTEMPT

In [177]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Separar X (variables independientes) e y (variable dependiente)
X = data.drop(columns=['sales'])  # Asume que 'sales' es la variable dependiente
y = data['sales']

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Configurar los hiperparámetros para GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Crear el modelo base
rf = RandomForestRegressor(random_state=42)

print('Starting the Optimization ...', end='')

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='r2')

# Ajustar el modelo
grid_search.fit(X_train, y_train)

print('Optimization Finished!')

# Mejor modelo encontrado
best_rf = grid_search.best_estimator_
print("\nMejores hiperparámetros:", grid_search.best_params_)

# Predecir en el conjunto de prueba
y_pred = best_rf.predict(X_test)

# Calcular métricas de evaluación
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"\nR²: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

# Opcional: Guardar el modelo entrenado
import joblib
joblib.dump(best_rf, 'best_random_forest_model.pkl')

Starting the Optimization ...Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 2.2min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimat



[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.8min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 3.3min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time= 2.2min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 3.3min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 3.4min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.8min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 3.2min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time= 3.3min
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=4, mi

['best_random_forest_model.pkl']