In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and test tables from CSV files in the working directory
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Split the training table into the "yield" target and a feature matrix.
# The "id" column is dropped from both the train and the test features.

y_train = df_train["yield"]
X_train = df_train.drop(columns=["id", "yield"])

X_test = df_test.drop(columns=["id"])

In [4]:
# Candidate predictors: every training feature column, in its original order
predictors = X_train.columns.tolist()

predictors

['clonesize',
 'honeybee',
 'bumbles',
 'andrena',
 'osmia',
 'MaxOfUpperTRange',
 'MinOfUpperTRange',
 'AverageOfUpperTRange',
 'MaxOfLowerTRange',
 'MinOfLowerTRange',
 'AverageOfLowerTRange',
 'RainingDays',
 'AverageRainingDays',
 'fruitset',
 'fruitmass',
 'seeds']

In [5]:
def calculate_vif(df, predictors, add_intercept=False):
    """
    Calculates the Variance Inflation Factor (VIF) for the given predictor variables in a DataFrame.

    Parameters:
    -----------
    df (DataFrame): The DataFrame containing the predictor variables.
    predictors (list): A list of column names for the predictor variables.
    add_intercept (bool): If True, a column of ones is prepended to the design
        matrix before the VIFs are computed; a VIF is still reported only for
        the predictors, never for the constant column. Defaults to False,
        which passes the predictor columns to statsmodels exactly as they are.

    Returns:
    --------
    vif (DataFrame): A DataFrame containing the predictor variable names and their corresponding VIF values.

    Notes:
    ------
    NOTE(review): statsmodels appears to regress each column on the remaining
    columns exactly as given, so without a constant column the resulting VIFs
    are "uncentered" and can be inflated for predictors with a large mean.
    Confirm against the statsmodels documentation before relying on the
    default for feature elimination.
    """
    X = df[predictors]

    # Materialise the design matrix once instead of once per predictor
    design = X.values
    first_predictor = 0
    if add_intercept:
        design = np.column_stack([np.ones(len(X)), design])
        # Predictor columns are shifted right by the constant column
        first_predictor = 1

    vif = pd.DataFrame()
    vif["variable"] = X.columns
    vif["VIF"] = [
        variance_inflation_factor(design, first_predictor + i)
        for i in range(X.shape[1])
    ]
    return vif

In [6]:
# Largest VIF a predictor may have and still be kept
vif_threshold = 11

# Backward elimination: while the worst VIF is still above the threshold,
# drop the predictor that carries it and recompute the VIFs for the rest.
vif = calculate_vif(X_train, predictors)
max_vif = vif["VIF"].max()

while not (max_vif <= vif_threshold):
    # First predictor whose VIF equals the current maximum
    has_max_vif = vif["VIF"] == max_vif
    predictor_to_remove = vif.loc[has_max_vif, "variable"].values[0]
    predictors.remove(predictor_to_remove)

    vif = calculate_vif(X_train, predictors)
    max_vif = vif["VIF"].max()

In [7]:
# Keep only the predictors that are still listed in `predictors`
# in both the train and the test feature matrices

X_train = X_train.loc[:, predictors]
X_test = X_test.loc[:, predictors]

In [8]:
def evaluate_model(model, X, y, n_splits=5):
    """
    Cross-validates a regressor and reports the Mean Absolute Error of every fold.

    The folds come from a shuffled KFold split with a fixed random_state of 5.
    The model is fitted in place on each fold's training part, so the object
    passed in is left fitted on the last fold when the function returns.

    Parameters:
    -----------
    model (estimator object): The model to be evaluated; must provide fit and predict.
    X (DataFrame): The feature matrix (rows are selected positionally with .iloc).
    y (Series): The target variable (rows are selected positionally with .iloc).
    n_splits (int): The number of folds for cross-validation.

    Returns:
    --------
    mae_scores (list): The MAE of each fold, rounded to 3 decimal places.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=5)

    fold_errors = []
    for fit_rows, holdout_rows in splitter.split(X):
        # Fit on this fold's training rows only
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Score the predictions on the rows held out from fitting
        holdout_pred = model.predict(X.iloc[holdout_rows])
        fold_mae = mean_absolute_error(y.iloc[holdout_rows], holdout_pred)

        fold_errors.append(round(fold_mae, 3))

    return fold_errors

In [9]:
# Candidate regressors, keyed by a short display name

models = dict(
    Lasso=Lasso(),
    LightGBM=lgb.LGBMRegressor(random_state=5),
    RandomForest=RandomForestRegressor(random_state=5),
    CatBoost=CatBoostRegressor(silent=True, random_seed=5),
    SVR=SVR(),
)

In [10]:
# Evaluate each model on the current X_train / X_test feature set.
# For every model: cross-validate, refit on the full training set, write a
# submission file, then print the per-fold MAE scores, their average and
# their standard deviation.
for name, model in models.items():
    # Per-fold MAE from cross-validation (evaluate_model returns rounded values)
    mae_scores = evaluate_model(model, X_train, y_train)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Refit on the whole training set before predicting the test set
    model.fit(X_train, y_train)

    # Predict for the test set
    y_test_pred = model.predict(X_test)

    # Build the submission as its own frame instead of writing a "yield"
    # column into df_test, so the loaded test table is not changed by this cell
    submission = pd.DataFrame({"id": df_test["id"], "yield": y_test_pred.round(3)})

    # Save the output DataFrame to a CSV file
    submission.to_csv(f"submission_{name}.csv", index=False)

    # Print the results for the current model
    print(f"Model: {name}")
    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")
    print()

Model: Lasso
MAE Scores: [827.479, 834.203, 826.284, 842.843, 825.666]
Average MAE: 831.295
Std Deviation: 6.529

Model: LightGBM
MAE Scores: [809.091, 818.55, 811.42, 822.542, 815.523]
Average MAE: 815.425
Std Deviation: 4.829

Model: RandomForest
MAE Scores: [810.053, 821.265, 813.503, 824.36, 818.189]
Average MAE: 817.474
Std Deviation: 5.160

Model: CatBoost
MAE Scores: [810.937, 821.369, 811.997, 822.878, 819.096]
Average MAE: 817.255
Std Deviation: 4.889

Model: SVR
MAE Scores: [882.668, 902.168, 896.376, 901.635, 892.227]
Average MAE: 895.015
Std Deviation: 7.170



### OBSERVATIONS
Rounding the `fruitset` and `fruitmass` features made the model worse, so that idea was discarded.