In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the train and test CSV files from the working directory
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Display the column names of the training frame as the cell output
df_train.columns

Index(['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
       'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
       'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds',
       'yield'],
      dtype='object')

In [3]:
# Build the model inputs: predictors and target from the training frame,
# predictors only from the test frame. "id" is excluded from the predictors
# in both frames.

y_train = df_train["yield"]
X_train = df_train.drop(columns=["id", "yield"])

X_test = df_test.drop(columns=["id"])

In [4]:
# Feature columns that are removed from both the train and test predictors
column_to_drop = [
    "MinOfUpperTRange",
    "AverageOfUpperTRange",
    "MaxOfLowerTRange",
    "MinOfLowerTRange",
    "AverageOfLowerTRange",
    "AverageRainingDays",
]

In [5]:
# Remove the columns listed in `column_to_drop` from the train and test predictors
# (the notebook treats them as perfectly correlated with columns that are kept).

# Validate the names against the raw frames first, so a typo still fails loudly
# even though the drop below tolerates columns that are already gone.
missing_columns = [
    col for col in column_to_drop
    if col not in df_train.columns or col not in df_test.columns
]
if missing_columns:
    raise KeyError(f"Columns not present in the raw data: {missing_columns}")

# errors="ignore" makes this cell idempotent: re-running it after the columns
# have already been dropped is a no-op instead of a KeyError.
X_train = X_train.drop(columns=column_to_drop, errors="ignore")
X_test = X_test.drop(columns=column_to_drop, errors="ignore")

In [6]:
def evaluate_model(model, X, y, n_splits=5, random_state=5):
    """
    Evaluates the given model using shuffled K-fold cross-validation and
    calculates the Mean Absolute Error (MAE) of every fold.

    Note that `model` itself is fitted in place on each fold, so after the call
    it is left fitted on the training split of the last fold.

    Parameters:
    -----------
    model (estimator object): The model to be evaluated; must expose `fit` and `predict`.
    X (DataFrame): The feature matrix (rows are selected positionally via `.iloc`).
    y (Series): The target variable (rows are selected positionally via `.iloc`).
    n_splits (int): The number of folds for cross-validation.
    random_state (int): Seed for the shuffled fold assignment. Defaults to 5.

    Returns:
    --------
    mae_scores (list): The MAE of each fold, rounded to 3 decimal places.
    """
    # Shuffled folds with a fixed seed so repeated runs use the same splits
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    mae_scores = []
    for train_index, test_index in kf.split(X):
        # Split the data into train and held-out sets for the current fold
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        # Train on the fold's training rows, then predict the held-out rows
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)

        # Store the fold's MAE, rounded to 3 decimal places
        mae_scores.append(round(mean_absolute_error(y_test_cv, y_pred), 3))

    return mae_scores

In [7]:
# Candidate regressors, keyed by the display name of each model.
# Every estimator that accepts a seed is given the same one (5).

models = dict(
    Lasso=Lasso(),
    LightGBM=lgb.LGBMRegressor(random_state=5),
    RandomForest=RandomForestRegressor(random_state=5),
    CatBoost=CatBoostRegressor(silent=True, random_seed=5),
    # SVR is currently disabled: SVR=SVR()
)

In [8]:
# Evaluate each model using the best subset of features.
#
# For every entry in `models` this loop:
#   1. runs forward sequential feature selection (scored by negative MAE, 3-fold CV),
#   2. cross-validates the model on the selected features and prints the per-fold
#      MAE scores, their average, and their standard deviation,
#   3. refits the model on the full training set and writes its test predictions
#      to `submission_<name>.csv`.
#
# NOTE(review): the features are selected on the same rows that are then used for
# the cross-validated MAE, so the reported scores are probably optimistic.
for name, model in models.items():
    # Initialize SFS with the current model
    sfs = SFS(model,
              k_features="best",
              forward=True,
              floating=False,
              scoring="neg_mean_absolute_error",
              cv=3,
              n_jobs=-1)

    # Perform SFS on the training data
    sfs = sfs.fit(X_train, y_train)

    # Map the selected column positions back to column names
    selected_features = X_train.columns[list(sfs.k_feature_idx_)]

    print(f"Model: {name}")
    print(f"Selected features: {selected_features}")

    # Evaluate the model using cross-validation with the selected features
    mae_scores = evaluate_model(model, X_train[selected_features], y_train)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Refit on the whole training set and predict the test set
    model.fit(X_train[selected_features], y_train)
    y_test_pred = model.predict(X_test[selected_features])

    # Build the submission in its own frame instead of writing into df_test:
    # adding a "yield" column to df_test would leak the predictions into X_test
    # if the feature-preparation cells were re-run afterwards.
    submission = pd.DataFrame({"id": df_test["id"], "yield": np.round(y_test_pred, 3)})
    submission.to_csv(f"submission_{name}.csv", index=False)

    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")

    print()

Model: Lasso
Selected features: Index(['bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'RainingDays',
       'fruitset', 'seeds'],
      dtype='object')
MAE Scores: [379.78, 388.323, 381.612, 375.99, 371.867]
Average MAE: 379.514
Std Deviation: 5.531

Model: LightGBM
Selected features: Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds'],
      dtype='object')
MAE Scores: [350.629, 364.375, 354.089, 354.86, 349.107]
Average MAE: 354.612
Std Deviation: 5.326

Model: RandomForest
Selected features: Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'MaxOfUpperTRange',
       'RainingDays', 'fruitset', 'fruitmass', 'seeds'],
      dtype='object')
MAE Scores: [363.8, 374.241, 367.995, 370.352, 366.828]
Average MAE: 368.643
Std Deviation: 3.505

Model: CatBoost
Selected features: Index(['honeybee', 'MaxOfUpperTRange', 'fruitset', 'seeds'], dtype='object')
MAE Scores: [354.261, 367.668, 356.221, 355.9

### NOTE
The code took about 15 minutes to run.