In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and test splits from CSV files in the working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Last expression of the cell: display the column names of the training frame
df_train.columns

Index(['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
       'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
       'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds',
       'yield'],
      dtype='object')

In [3]:
# Split the training frame into the regression target and the feature matrix.
# "id" is dropped together with the target, so it is never used as a feature.
y_train = df_train["yield"]
X_train = df_train.drop(columns=["id", "yield"])

# For the test frame only the "id" column is removed
X_test = df_test.drop(columns=["id"])

In [4]:
# Feature columns that are removed from both the train and the test matrices
column_to_drop = [
    "MinOfUpperTRange",
    "AverageOfUpperTRange",
    "MaxOfLowerTRange",
    "MinOfLowerTRange",
    "AverageOfLowerTRange",
    "AverageRainingDays",
]

In [5]:
# Drop the columns listed in `column_to_drop` (perfectly correlated with
# columns that are kept) from both the train and the test feature matrices
X_train = X_train.drop(columns=column_to_drop)
X_test = X_test.drop(columns=column_to_drop)

In [6]:
def evaluate_model(X, y, rs, n_splits=5, random_state=5):
    """
    Cross-validates an estimator and calculates the Mean Absolute Error of each fold.

    A fresh, unfitted clone of ``rs`` is trained in every fold, so the estimator
    passed in by the caller is never refitted or otherwise modified here.

    Parameters:
    -----------
    X (DataFrame): The feature matrix (rows are selected positionally via ``.iloc``).
    y (Series): The target variable, aligned row by row with ``X``.
    rs (estimator object): A scikit-learn compatible estimator exposing ``fit`` and
        ``predict``, e.g. a RandomizedSearchCV instance.
    n_splits (int): The number of folds for cross-validation.
    random_state (int): Seed used to shuffle the rows before they are split into folds.

    Returns:
    --------
    mae_scores (list): The MAE of each fold, rounded to 3 decimal places.
    """
    # Imported here so the function only relies on scikit-learn being installed,
    # not on an additional notebook-level import
    from sklearn.base import clone

    # Shuffled K-fold split with a fixed seed so the folds are reproducible
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    mae_scores = []
    for train_index, test_index in kf.split(X):
        # Split the data into train and held-out sets for the current fold
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        # Fit a copy: fitting `rs` itself would leave the caller's estimator
        # trained on the last fold's subset instead of whatever it was fitted on
        fold_model = clone(rs)
        fold_model.fit(X_train_cv, y_train_cv)

        # Predict the target for the held-out rows and score the fold
        y_pred = fold_model.predict(X_test_cv)
        mae_scores.append(round(mean_absolute_error(y_test_cv, y_pred), 3))

    return mae_scores

In [7]:
# Define the hyperparameter grids

# Candidate regularisation strengths for the Lasso model
lasso_params = dict(alpha=[1e-10, 0.1, 0.5, 1, 2, 5, 10])

# Search space for LightGBM (parameter names use LightGBM's native aliases)
lightgbm_params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 9, 11],
    'num_leaves': [10, 20, 30, 40, 50],
    'feature_fraction': [0.1, 0.3, 0.5, 0.7, 0.9],
    # bagging_fraction must lie in (0, 1]; a value above 1.0 makes the fit fail,
    # which wastes every search candidate that samples it
    'bagging_fraction': [0.5, 0.7, 0.9, 1.0],
    'bagging_freq': [1, 2, 3, 4, 5],
    'min_data_in_leaf': [1, 5, 10, 20, 30],
    'min_gain_to_split': [0.1, 0.5, 1.0, 2.0, 5.0],
    'lambda_l1': [0, 0.001, 0.01, 0.1, 1.0],
    'lambda_l2': [0, 0.001, 0.01, 0.1, 1.0]
}

# Search space for the random forest regressor
rf_params = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_leaf': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    # 'auto' is not listed: for regressors it meant "use all features" (same as
    # None) and recent scikit-learn releases no longer accept it
    'max_features': [None, 'sqrt', 'log2']
}

# Search space for CatBoost
catboost_params = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    depth=[3, 5, 7, 9],
    l2_leaf_reg=[1, 3, 5, 7, 9],
    iterations=[30, 50, 100, 500, 1000],
)

In [8]:
# Map every model name to its hyperparameter search space; the keys are the
# names used to look the grid up when the models are tuned
param_grids = dict(
    Lasso=lasso_params,
    LightGBM=lightgbm_params,
    RandomForest=rf_params,
    CatBoost=catboost_params,
)

In [9]:
# Instantiate the candidate models, keyed by the name used in `param_grids`.
# Every model is seeded with 5. RandomForestRegressor and SVR are imported
# but intentionally left out of this run.
models = dict(
    Lasso=Lasso(random_state=5),
    LightGBM=lgb.LGBMRegressor(random_state=5, n_jobs=-1),
    CatBoost=CatBoostRegressor(silent=True, random_seed=5),
)

In [10]:
# For every model: select features, tune hyperparameters, cross-validate the
# tuned search and write a submission file. Prints the selected features, the
# best search result, the per-fold MAE scores, their average and their
# standard deviation.
for name, model in models.items():
    # Forward sequential feature selection driven by the untuned model
    sfs = SFS(model,
              k_features="best",
              forward=True,
              floating=False,
              scoring="neg_mean_absolute_error",
              cv=3,
              n_jobs=-1)
    sfs = sfs.fit(X_train, y_train)

    # Names of the columns kept by the feature selection
    selected_features = X_train.columns[list(sfs.k_feature_idx_)]

    print(f"Model: {name}")
    print(f"Selected features: {selected_features}")

    # Randomized hyperparameter search on the selected features; the fixed
    # random_state makes the sampled candidates reproducible between runs
    rs = RandomizedSearchCV(model, param_grids[name], cv=3, scoring="neg_mean_absolute_error",
                            n_iter=100, n_jobs=-1, random_state=5)
    rs.fit(X_train[selected_features], y_train)
    best_params = rs.best_params_

    # Keep a handle on the best estimator, which the search has refitted on the
    # full training set (refit is enabled by default). `evaluate_model` may fit
    # `rs` again on a single CV fold, so the submission must not be predicted
    # with `rs` afterwards.
    final_model = rs.best_estimator_

    print(f"Best RandomizedSearch features: {best_params}")
    print(f"Best RandomizedSearch Score: {rs.best_score_}")

    # Cross-validate the whole search procedure on the selected features
    mae_scores = evaluate_model(X_train[selected_features], y_train, rs)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Predict the test set with the model trained on all training rows
    y_test_pred = final_model.predict(X_test[selected_features])

    df_test["yield"] = y_test_pred.round(3)

    # Save the predictions of the current model to its own submission file
    df_test[["id", "yield"]].to_csv(f"submission_{name}.csv", index=False)

    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")

    print()

Model: Lasso
Selected features: Index(['bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'RainingDays',
       'fruitset', 'seeds'],
      dtype='object')
Best RandomizedSearch features: {'alpha': 1e-10}
Best RandomizedSearch Score: -376.6603809600663
MAE Scores: [377.766, 384.3, 379.012, 373.334, 368.963]
Average MAE: 376.675
Std Deviation: 5.204

Model: LightGBM
Selected features: Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds'],
      dtype='object')
Best RandomizedSearch features: {'num_leaves': 30, 'min_gain_to_split': 1.0, 'min_data_in_leaf': 5, 'max_depth': 11, 'learning_rate': 0.05, 'lambda_l2': 0.01, 'lambda_l1': 1.0, 'feature_fraction': 0.9, 'bagging_freq': 1, 'bagging_fraction': 0.5}
Best RandomizedSearch Score: -355.11567895382103
MAE Scores: [348.989, 361.627, 352.795, 354.597, 351.5]
Average MAE: 353.902
Std Deviation: 4.273

Model: CatBoost
Selected features: Index(['honeybee', 'Ma

### NOTE
The run time is roughly 30 minutes.