##  Modelos predictivos de Regresión

In [7]:
import pandas as pd

In [9]:
# Load the prepared modelling table; the path is relative to the notebook's folder.
df_final = pd.read_csv("../dataset/df_final.csv")
# Bare last expression so the notebook renders a preview of the frame.
df_final

Unnamed: 0,Year_sc,average_rain_fall_mm_per_year_sc,pesticides_tonnes_sc,avg_temp_sc,N_kg/ha_sc,Area_Albania,Area_Algeria,Area_Angola,Area_Argentina,Area_Armenia,...,pesticide_load_medium,continent_Africa,continent_America,continent_Asia,continent_Europe,continent_Oceania,pca1_scaled,pca2_scaled,pca3_scaled,hg/ha_yield
0,0.000000,0.449671,0.000329,0.508264,0.261000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.698031,0.055034,0.118221,36613
1,0.043478,0.449671,0.000329,0.473485,0.079300,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.703225,0.063098,0.126988,29068
2,0.086957,0.449671,0.000329,0.497590,0.052625,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.704007,0.058940,0.130428,24876
3,0.130435,0.449671,0.000329,0.497245,0.054200,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.703289,0.058074,0.131738,24185
4,0.173913,0.449671,0.000546,0.528581,0.045150,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.703551,0.052442,0.134614,25848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.826087,0.190028,0.008891,0.651171,0.039950,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.084811,-0.044914,0.252353,4642
3949,0.869565,0.190028,0.008987,0.673554,0.043650,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.084453,-0.049387,0.254322,8751
3950,0.913043,0.190028,0.009082,0.660124,0.035150,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.083827,-0.048007,0.255686,6568
3951,0.956522,0.190028,0.009178,0.651171,0.025050,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.083346,-0.047298,0.257271,7912


In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split

**Creamos una función para correr ambos modelos con distintos inputs**

In [43]:
def model_evaluation(X, y, model_type='linear_regression', test_size=0.2, random_state=42):
    """Fit one regressor on a hold-out split and report validation metrics.

    Parameters
    ----------
    X : feature matrix, forwarded to ``train_test_split`` and the estimator.
    y : target values aligned with ``X``.
    model_type : str
        ``'linear_regression'`` or ``'random_forest'``.
    test_size : forwarded to ``train_test_split``.
    random_state : seeds both the split and the random forest.

    Returns
    -------
    dict
        The fitted ``'model'``, the four split pieces (``'X_train'``,
        ``'X_val'``, ``'y_train'``, ``'y_val'``), the predictions on both
        splits (``'y_train_pred'``, ``'y_val_pred'``) and the validation
        metrics ``'mse'``, ``'mae'`` and ``'r2'``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two supported names.
    """
    # Guard clause: reject unknown model names before doing any work.
    if model_type not in ('linear_regression', 'random_forest'):
        raise ValueError(f"Modelo no reconocido: {model_type}")

    if model_type == 'linear_regression':
        model = LinearRegression()
    else:
        model = RandomForestRegressor(random_state=random_state)

    # Hold-out split into training and validation parts.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)

    # Predictions on both splits; only the validation ones are scored below.
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Regression metrics on the validation split.
    mse = mean_squared_error(y_val, y_val_pred)
    mae = mean_absolute_error(y_val, y_val_pred)
    r2 = r2_score(y_val, y_val_pred)

    # One print call, one line per item.
    print(
        f"Evaluación del modelo '{model_type}':",
        f"→ MSE: {mse:.2f}",
        f"→ MAE: {mae:.2f}",
        f"→ R2 Score: {r2:.2f}",
        sep="\n",
    )

    return {
        'model': model,
        'X_train': X_train,
        'X_val': X_val,
        'y_train': y_train,
        'y_val': y_val,
        'y_train_pred': y_train_pred,
        'y_val_pred': y_val_pred,
        'mse': mse,
        'mae': mae,
        'r2': r2,
    }

### - **Variables del PCA como input**

In [15]:
# Inspect the available column names before choosing the model inputs.
df_final.columns

Index(['Year_sc', 'average_rain_fall_mm_per_year_sc', 'pesticides_tonnes_sc',
       'avg_temp_sc', 'N_kg/ha_sc', 'Area_Albania', 'Area_Algeria',
       'Area_Angola', 'Area_Argentina', 'Area_Armenia',
       ...
       'pesticide_load_medium', 'continent_Africa', 'continent_America',
       'continent_Asia', 'continent_Europe', 'continent_Oceania',
       'pca1_scaled', 'pca2_scaled', 'pca3_scaled', 'hg/ha_yield'],
      dtype='object', length=107)

In [14]:
# Model inputs for this section: only the three PCA-derived columns.
X = df_final[
    ['pca1_scaled', 'pca2_scaled', 'pca3_scaled']
]

In [16]:
# Target column as a plain NumPy array.
y = df_final["hg/ha_yield"].to_numpy()

In [45]:
model_list = ['linear_regression', 'random_forest']
# Keep the PCA-input results under their own name: `results_dict` is rebuilt
# from scratch when the same loop is run again with the primary variables,
# which would otherwise make these results unreachable for later comparison.
results_pca = {}

for model_name in model_list:
    print(f"\n{'='*40}\nModelo: {model_name}\n{'='*40}")
    results = model_evaluation(X,  y, model_type=model_name)
    results_pca[model_name] = results

# Alias kept so code that reads `results_dict` right after this cell still works.
results_dict = results_pca


Modelo: linear_regression
Evaluación del modelo 'linear_regression':
→ MSE: 616220335.47
→ MAE: 18997.26
→ R2 Score: 0.10

Modelo: random_forest
Evaluación del modelo 'random_forest':
→ MSE: 90457585.00
→ MAE: 4329.74
→ R2 Score: 0.87


**El modelo Random Forest presenta un desempeño notablemente superior al de la regresión lineal: tanto el error cuadrático medio (MSE) como el error absoluto medio (MAE) son considerablemente menores. Además, el coeficiente de determinación (R²) del Random Forest alcanza un valor de 0.87, lo que indica que explica el 87% de la variabilidad en los datos, frente al escaso 10% que logra la regresión lineal.**

### - **Variables primarias como input**

In [46]:
# Primary-variable inputs: every column except the PCA columns and the target.
X = df_final.drop(
    columns=['pca1_scaled', 'pca2_scaled', 'pca3_scaled', "hg/ha_yield"],
)

In [47]:
# Target column as a plain NumPy array.
y = df_final["hg/ha_yield"].to_numpy()

In [48]:
model_list = ['linear_regression', 'random_forest']
results_dict = {}

# Fit and report each model on the current X / y, storing every result by model name.
for model_name in model_list:
    print(f"\n{'='*40}\nModelo: {model_name}\n{'='*40}")
    results_dict[model_name] = results = model_evaluation(X, y, model_type=model_name)


Modelo: linear_regression
Evaluación del modelo 'linear_regression':
→ MSE: 63129454.35
→ MAE: 4991.21
→ R2 Score: 0.91

Modelo: random_forest
Evaluación del modelo 'random_forest':
→ MSE: 30327131.22
→ MAE: 2362.91
→ R2 Score: 0.96


**Como se puede observar, el uso de las variables primarias como input (en lugar de las componentes del PCA) mejoró el ajuste tanto de la regresión lineal como del Random Forest. Sin embargo, las métricas de desempeño obtenidas por el modelo Random Forest fueron consistentemente superiores a las de la regresión lineal.**

**Considero que las tres métricas son complementarias. En cuanto a los errores, el MSE penaliza con mayor intensidad los valores extremos, a diferencia del MAE, que representa el error medio absoluto de forma más robusta frente a outliers. Por lo tanto, ambas métricas aportan información valiosa. Por otro lado, el coeficiente de determinación R² expresa la proporción de la varianza de la variable objetivo que es explicada por el modelo, indicando una medida general del nivel de ajuste del modelo.**

### **Selección de hiperparámetros del mejor modelo**

In [49]:
# Inputs for the hyperparameter search: all columns except the PCA columns and the target.
X = df_final.drop(
    ['pca1_scaled', 'pca2_scaled', 'pca3_scaled', "hg/ha_yield"],
    axis="columns",
)

In [50]:
# Target column as a plain NumPy array.
y = df_final["hg/ha_yield"].to_numpy()

In [59]:
# Hold-out split with the same test_size and random_state that model_evaluation uses by default.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [51]:
# Base estimator handed to the hyperparameter searches; seeded for repeatable forests.
model = RandomForestRegressor(random_state=42)

In [57]:
# Candidate hyperparameter values passed to the search objects.
# NOTE(review): the estimator defaults used by the untuned run
# (max_depth=None, min_samples_leaf=1) are not among these candidates, so the
# search cannot select the baseline configuration — confirm this is intended.
params = dict(
    criterion=["squared_error", "absolute_error", "friedman_mse"],
    max_depth=[5, 10, 20, 50, 100],
    min_samples_leaf=[5, 10, 50],
)

In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [73]:
# Exhaustive search over `params`, scored by R² with 3-fold cross-validation
# on the training split. refit=True retrains the best configuration on all of
# X_train once the search finishes.
cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='r2',
    cv=3,
    refit=True,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

In [74]:
# Show the estimator (with its hyperparameters) selected by the search fitted above.
cv.best_estimator_

In [75]:
# Take the selected estimator straight from the search instead of re-typing
# its hyperparameters by hand: the hand-copied values silently go stale if
# the search result changes. With refit=True the search has already refitted
# this estimator on the full X_train, so no extra fit is needed.
best_model = cv.best_estimator_
best_model

In [76]:
# Predictions on both splits (only the validation predictions are scored here).
y_train_pred, y_val_pred = best_model.predict(X_train), best_model.predict(X_val)

# Validation-set regression metrics, computed in MSE, MAE, R² order.
mse, mae, r2 = (
    metric(y_val, y_val_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"→ MSE: {mse:.2f}", f"→ MAE: {mae:.2f}", f"→ R2 Score: {r2:.2f}", sep="\n")

→ MSE: 48617706.59
→ MAE: 3142.38
→ R2 Score: 0.93


Búsqueda aleatoria de hiperparámetros

In [63]:
# Randomized search over the candidates in `params`, scored by R² with 3-fold
# cross-validation on the training split (n_iter is left at its default).
# A fixed random_state makes the sampled candidates — and therefore the
# selected model — reproducible across kernel restarts; without it every run
# may evaluate a different subset of the candidates.
cv = RandomizedSearchCV(
    model,
    params,
    cv=3,
    refit=True,
    n_jobs=-1,
    scoring='r2',
    random_state=42,
)
cv.fit(X_train, y_train)

In [64]:
# Show the estimator (with its hyperparameters) selected by the randomized search.
cv.best_estimator_

In [65]:
# Take the selected estimator straight from the randomized search instead of
# re-typing its hyperparameters by hand: the sampled candidates can differ
# between runs, so hand-copied values may not match the search that was
# actually executed. With refit=True the estimator is already fitted on the
# full X_train, so no extra fit is needed.
best_model = cv.best_estimator_
best_model

In [68]:
# Predictions on both splits (only the validation predictions are scored here).
y_train_pred = best_model.predict(X_train)
y_val_pred = best_model.predict(X_val)

# Regression metrics on the validation split.
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

# Single print call; one metric per line.
print("\n".join([f"→ MSE: {mse:.2f}", f"→ MAE: {mae:.2f}", f"→ R2 Score: {r2:.2f}"]))

→ MSE: 64717099.37
→ MAE: 4084.63
→ R2 Score: 0.91


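**Las búsquedas de hiperparámetros no mejoraron el modelo, e incluso arrojaron métricas de desempeño más bajas que las del Random Forest con parámetros por defecto (R² de 0.93 y 0.91 frente a 0.96). Una causa probable es que los valores explorados no incluyen los valores por defecto del modelo (`max_depth=None`, `min_samples_leaf=1`), por lo que la configuración base quedó fuera del espacio de búsqueda. En este caso se empleó validación cruzada con cv=3 sobre el conjunto de entrenamiento para evaluar los hiperparámetros, con el objetivo de evitar el overfitting y obtener una estimación más robusta del rendimiento del modelo.**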
