In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and test tables from CSV files in the working directory
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Split the training frame into the target ("yield") and the feature columns
# (everything except "id" and "yield"). From the test frame only "id" is removed.

y_train = df_train["yield"]
X_train = df_train.drop(columns=["id", "yield"])

X_test = df_test.drop(columns=["id"])

In [4]:
# Coarsen "fruitset" and "fruitmass" to one decimal place in both feature
# matrices (X_train and X_test); columns not named in the mapping are untouched.

X_train, X_test = (
    features.round({"fruitset": 1, "fruitmass": 1})
    for features in (X_train, X_test)
)

In [5]:
def evaluate_model(model, X, y, n_splits=5):
    """
    Cross-validate a regressor and report its Mean Absolute Error per fold.

    The rows of X are split with a shuffled KFold using a fixed
    random_state of 5, so the split is reproducible. For every fold the
    model is refitted on the training part and scored on the held-out part.

    Parameters:
    -----------
    model (estimator object): Object exposing fit(X, y) and predict(X). It is
        fitted in place, so after the call it holds the fit from the last fold.
    X (DataFrame): The feature matrix; rows are selected by position (.iloc).
    y (Series): The target variable, selected with the same positions as X.
    n_splits (int): The number of folds for cross-validation.

    Returns:
    --------
    fold_maes (list): One MAE per fold, each rounded to 3 decimal places.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=5)

    fold_maes = []
    for fit_rows, holdout_rows in splitter.split(X):
        # Fit on this fold's training rows only
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Predict the target for the held-out rows
        holdout_pred = model.predict(X.iloc[holdout_rows])

        # Score the fold and keep the MAE rounded to 3 decimal places
        fold_mae = mean_absolute_error(y.iloc[holdout_rows], holdout_pred)
        fold_maes.append(round(fold_mae, 3))

    return fold_maes

In [6]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are later evaluated. Estimators that are given a seed use 5;
# Lasso and SVR are built with their default settings.

models = dict(
    Lasso=Lasso(),
    LightGBM=lgb.LGBMRegressor(random_state=5),
    RandomForest=RandomForestRegressor(random_state=5),
    CatBoost=CatBoostRegressor(silent=True, random_seed=5),
    SVR=SVR(),
)

In [7]:
# For every candidate model: cross-validate it, refit it on the full training
# set, write its test-set predictions to "submission_<name>.csv", and print
# the per-fold MAE scores with their mean and standard deviation.
for name, model in models.items():
    # Per-fold MAE from evaluate_model, then the mean and spread across folds
    # (np.std uses its default, the population standard deviation).
    mae_scores = evaluate_model(model, X_train, y_train)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Refit on all training rows before predicting the test rows
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)

    # Build the submission as its own frame instead of writing a "yield"
    # column into df_test: mutating df_test would leak the predictions into
    # X_test if the feature-splitting cell were re-run afterwards.
    # "yield" is a Python keyword, hence the **{...} form for assign().
    submission = df_test[["id"]].assign(**{"yield": y_test_pred.round(3)})

    # Save the output DataFrame to a CSV file
    submission.to_csv(f"submission_{name}.csv", index=False)

    # Print the results for the current model
    print(f"Model: {name}")
    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")
    print()

Model: Lasso
MAE Scores: [406.691, 420.053, 412.344, 407.151, 400.464]
Average MAE: 409.341
Std Deviation: 6.548

Model: LightGBM
MAE Scores: [376.567, 392.132, 377.721, 380.416, 376.692]
Average MAE: 380.706
Std Deviation: 5.879

Model: RandomForest
MAE Scores: [408.018, 426.278, 410.825, 419.324, 414.225]
Average MAE: 415.734
Std Deviation: 6.480

Model: CatBoost
MAE Scores: [374.308, 392.558, 380.293, 386.219, 379.863]
Average MAE: 382.648
Std Deviation: 6.226

Model: SVR
MAE Scores: [1018.568, 1025.958, 1021.869, 1028.879, 1020.611]
Average MAE: 1023.177
Std Deviation: 3.736



### OBSERVATIONS
Rounding `fruitset` and `fruitmass` to one decimal place made the models worse. Discard this idea.