In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the train and test datasets
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
# Separate features and target for train and validation data

X_train = df_train.drop(["id", "yield"], axis=1)
y_train = df_train["yield"]

X_test = df_test.drop(["id"], axis=1)

In [4]:
predictors = list(X_train.columns)

predictors

['clonesize',
 'honeybee',
 'bumbles',
 'andrena',
 'osmia',
 'MaxOfUpperTRange',
 'MinOfUpperTRange',
 'AverageOfUpperTRange',
 'MaxOfLowerTRange',
 'MinOfLowerTRange',
 'AverageOfLowerTRange',
 'RainingDays',
 'AverageRainingDays',
 'fruitset',
 'fruitmass',
 'seeds']

In [5]:
vif_threshold = 5

while True:
    vifs = []
    for i in range(0, len(predictors)):
        y = X_train.loc[:, X_train.columns == predictors[i]]
        x = X_train.loc[:, X_train.columns != predictors[i]]
        model = sm.OLS(y, x)
        results = model.fit()
        rsq = results.rsquared
        vif = round(1 / (1 - rsq), 2)
        # print(f"R-Squared {predictors[i]} is {round(rsq, 2)} keeping other columns")
        # print(f"VIF of {predictors[i]} is {vif}")
        vifs.append(vif)

    max_vif = max(vifs)

    if max_vif > vif_threshold:
        max_index = vifs.index(max_vif)
        # print(max_vif)
        # print(vifs)
        # print(predictors)
        # print(predictors[max_index])
        del predictors[max_index]
        X_train = X_train.drop(X_train.columns[max_index], axis=1)
        X_test = X_test.drop(X_test.columns[max_index], axis=1)

    else:
        break

X_test.head(5)

Unnamed: 0,honeybee,andrena,RainingDays
0,0.25,0.25,24.0
1,0.25,0.75,1.0
2,0.25,0.63,16.0
3,0.5,0.38,16.0
4,0.75,0.25,24.0


In [9]:
def evaluate_model(model, X, y, n_splits=5):
    """
    Evaluates the given model using cross-validation and calculates the Mean Absolute Errors.

    Parameters:
    -----------
    model (estimator object): The model to be evaluated.
    X (DataFrame): The feature matrix.
    y (Series): The target variable.
    selected_features (Index): The selected feature names.
    n_splits (int): The number of folds for cross-validation.

    Returns:
    --------
    mae_scores (list): A list of MAE for each fold.
    """
    # Initialize a list to store the MAE
    mae_scores = []

    # Create a KFold object for cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=5)

    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        # Split the data into train and test sets for the current fold
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the training data
        model.fit(X_train_cv, y_train_cv)

        # Predict probabilities for the test (keep only the probability of the positive class)
        y_pred = model.predict(X_test_cv)

        # Calculate the MAE for the current fold
        mae = mean_absolute_error(y_test_cv, y_pred)
        mae_scores.append(mae)

    # Return the MAE, rounded to 5 decimal places
    return [round(value, 3) for value in mae_scores]

In [10]:
# Initialize the models

models = {
    "Lasso": Lasso(),
    "LightGBM": lgb.LGBMRegressor(random_state=5),
    "RandomForest": RandomForestRegressor(random_state=5),
    "CatBoost": CatBoostRegressor(silent=True, random_seed=5),
    "SVR": SVR(),
}

In [11]:
# Evaluate each model using the best subset of features
for name, model in models.items():
    """
    Loops through each model, and evaluates the model using cross-validation. 
    Prints the MAE scores, average MAE, and standard deviation
    for each model.
    """
    # Evaluate the model using cross-validation with the selected features
    mae_scores = evaluate_model(model, X_train, y_train)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Train the model on the training set
    model.fit(X_train, y_train)

    # Predict for the test set
    y_test_pred = model.predict(X_test)

    df_test["yield"] = y_test_pred.round(3)

    # Save the output DataFrame to a CSV file
    df_test[["id", "yield"]].to_csv(f"submission_{name}.csv", index=False)

    # Print the results for the current model
    print(f"Model: {name}")
    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")
    print()

Model: Lasso
MAE Scores: [897.993, 913.867, 917.31, 923.243, 916.466]
Average MAE: 913.776
Std Deviation: 8.467

Model: LightGBM
MAE Scores: [840.436, 858.286, 849.829, 855.983, 843.529]
Average MAE: 849.613
Std Deviation: 6.886

Model: RandomForest
MAE Scores: [841.343, 862.207, 849.454, 857.309, 843.656]
Average MAE: 850.794
Std Deviation: 7.934

Model: CatBoost
MAE Scores: [841.508, 863.582, 848.911, 856.583, 844.6]
Average MAE: 851.037
Std Deviation: 8.061

Model: SVR
MAE Scores: [913.434, 931.468, 931.426, 934.424, 930.801]
Average MAE: 928.311
Std Deviation: 7.544

