In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PolynomialFeatures
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import warnings
warnings.filterwarnings("ignore")

In [2]:
# To display all rows
pd.set_option('display.max_rows', None)

# To display all columns
pd.set_option('display.max_columns', None)

# To display entire contents of each cell (useful for long strings)
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the train and test datasets
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train.columns

Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight', 'Age'],
      dtype='object')

In [4]:
df_test.columns

Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight'],
      dtype='object')

In [5]:
# Separate features and target for train and validation data

X_train = df_train.drop(["id", "Age"], axis=1)
y_train = df_train["Age"]

X_test = df_test.drop(["id"], axis=1)

In [6]:
label_encoder = LabelEncoder()
X_train["Sex"] = label_encoder.fit_transform(X_train["Sex"])
X_test["Sex"] = label_encoder.transform(X_test["Sex"])

In [7]:
X_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,1,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928
1,1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194
2,2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133
3,0,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885
4,1,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395


In [8]:
y_train.head()

0     9
1     8
2     9
3    11
4     8
Name: Age, dtype: int64

In [9]:
X_test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,1,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552
1,1,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893
2,0,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415
3,0,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676
4,1,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066


In [10]:
def generate_interactive_features(df, df_features):
    """
    Generate interaction features between the given columns in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating interaction features.

    Returns:
    --------
    pandas.DataFrame
        The DataFrame with the interaction features only.
    """
    # Get the list of features to create interaction terms
    features = [col for col in df.columns if col in df_features]
    new_features = []

    # Create a new DataFrame to store only the new features
    df_new_features = pd.DataFrame(index=df.index)

    # Iterate through the features and create interaction terms
    for i in range(len(features)):
        for j in range(i+1, len(features)):
            if i != j:
                # Generate a new feature name for the interaction term
                new_feature_name = f"{features[i]}_{features[j]}"
                
                # Create the interaction feature by multiplying the values of the two original features
                df_new_features[new_feature_name] = np.where(df[features[i]] == 0, 1e-10, df[features[i]]) * np.where(df[features[j]] == 0, 1e-10, df[features[j]])
                
                # Add the new feature name to the list of new features
                new_features.append(new_feature_name)
    
    return df_new_features

In [11]:
def generate_triple_interactive_features(df, df_features):
    """
    Generate interaction features between three of the given columns in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating interaction features.

    Returns:
    --------
    pandas.DataFrame
        The DataFrame with the interaction features added.
    """
    # Get the list of features to create interaction terms
    features = [col for col in df.columns if col in df_features]
    new_features = []

    # Create a new DataFrame to store only the new features
    df_new_features = pd.DataFrame(index=df.index)

    # Iterate through the features and create interaction terms
    for i in range(len(features)):
        for j in range(i+2, len(features)):
            for k in range(i+3, len(features)):
                if i != j and i != k and j != k:
                    # Generate a new feature name for the interaction term
                    new_feature_name = f"{features[i]}_{features[j]}_{features[k]}"
                    
                    # Create the interaction feature by multiplying the values of the two original features
                    df_new_features[new_feature_name] = np.where(df[features[i]] == 0, 1e-10, df[features[i]]) * np.where(df[features[j]] == 0, 1e-10, df[features[j]]) * np.where(df[features[k]] == 0, 1e-10, df[features[k]])
                    
                    # Add the new feature name to the list of new features
                    new_features.append(new_feature_name)
    
    return df_new_features

In [12]:
def generate_polynomial_features(df, degree, df_features):
    """
    Generate polynomial features for the specified columns in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    degree : int
        The degree of the polynomial features to generate.
    df_features : list
        A list of feature names to be used for generating polynomial features.

    Returns:
    --------
    pandas.DataFrame
        The DataFrame with the polynomial features added.
    """
    # Get the list of features to create polynomial features
    features = [col for col in df.columns if col in df_features]

    # Create a PolynomialFeatures object with the specified degree, no interaction features, and no bias term
    poly = PolynomialFeatures(degree, interaction_only=False, include_bias=False)

    # Fit and transform the selected features in the DataFrame
    poly_features = poly.fit_transform(df[features])

    # Get the feature names for the generated polynomial features
    poly_features_names = poly.get_feature_names_out(features)

    # Create a new DataFrame with the generated polynomial features
    poly_df = pd.DataFrame(poly_features, columns=poly_features_names)

    # Keep only the columns with polynomial features of the specified degree
    poly_df = poly_df[[f"{col}^{degree}" for col in features]]

    # # Concatenate the original DataFrame and the polynomial features DataFrame
    # result = pd.concat([df, poly_df], axis=1)

    return poly_df

In [13]:
def generate_domain_features(df, df_features):
    """
    Generate domain-specific features as ratios between the given columns in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the original features.
    df_features : list
        A list of feature names to be used for generating domain-specific features.

    Returns:
    --------
    pandas.DataFrame
        The DataFrame with the domain-specific features added.
    """
    # Get the list of features to create domain-specific features
    features = [col for col in df.columns if col in df_features]
    new_features = []

    # Create a new DataFrame to store only the new features
    df_new_features = pd.DataFrame(index=df.index)

    # Iterate through the features and create domain-specific features as ratios
    for i in range(len(features)):
        for j in range(len(features)):
            # Check if the features are different
            if i != j:
                # Generate a new feature name for the domain-specific feature
                new_feature_name = f"{features[i]}_{features[j]}_ratio"
                
                # Create the domain-specific feature by dividing the values of the two original features
                # If the denominator is 0, use a small value (1e-6) to avoid division by zero
                df_new_features[new_feature_name] = np.where(df[features[i]] == 0, 1e-10, df[features[i]]) / np.where(df[features[j]] == 0, 1e-10, df[features[j]])
                
                # Add the new feature name to the list of new features
                new_features.append(new_feature_name)
    
    return df_new_features

In [14]:
# Generate features for the training dataset
inter_features = generate_interactive_features(X_train, X_train.columns)
triple_inter_features = generate_triple_interactive_features(X_train, X_train.columns)
poly_features = generate_polynomial_features(X_train, 2, X_train.columns)
ratio_features = generate_domain_features(X_train, X_train.columns)
X_train = pd.concat([X_train, inter_features, triple_inter_features, poly_features, ratio_features], axis=1)
X_train.shape

(74051, 155)

In [15]:
# Generate features for the training dataset
inter_features = generate_interactive_features(X_test, X_test.columns)
triple_inter_features = generate_triple_interactive_features(X_test, X_test.columns)
poly_features = generate_polynomial_features(X_test, 2, X_test.columns)
ratio_features = generate_domain_features(X_test, X_test.columns)
X_test = pd.concat([X_test, inter_features, triple_inter_features, poly_features, ratio_features], axis=1)
X_test.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight', 'Sex_Length', 'Sex_Diameter',
       ...
       'Viscera Weight_Weight_ratio', 'Viscera Weight_Shucked Weight_ratio',
       'Viscera Weight_Shell Weight_ratio', 'Shell Weight_Sex_ratio',
       'Shell Weight_Length_ratio', 'Shell Weight_Diameter_ratio',
       'Shell Weight_Height_ratio', 'Shell Weight_Weight_ratio',
       'Shell Weight_Shucked Weight_ratio',
       'Shell Weight_Viscera Weight_ratio'],
      dtype='object', length=155)

In [16]:
X_test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_Length,Sex_Diameter,Sex_Height,Sex_Weight,Sex_Shucked Weight,Sex_Viscera Weight,Sex_Shell Weight,Length_Diameter,Length_Height,Length_Weight,Length_Shucked Weight,Length_Viscera Weight,Length_Shell Weight,Diameter_Height,Diameter_Weight,Diameter_Shucked Weight,Diameter_Viscera Weight,Diameter_Shell Weight,Height_Weight,Height_Shucked Weight,Height_Viscera Weight,Height_Shell Weight,Weight_Shucked Weight,Weight_Viscera Weight,Weight_Shell Weight,Shucked Weight_Viscera Weight,Shucked Weight_Shell Weight,Viscera Weight_Shell Weight,Sex_Diameter_Height,Sex_Diameter_Weight,Sex_Diameter_Shucked Weight,Sex_Diameter_Viscera Weight,Sex_Diameter_Shell Weight,Sex_Height_Weight,Sex_Height_Shucked Weight,Sex_Height_Viscera Weight,Sex_Height_Shell Weight,Sex_Weight_Height,Sex_Weight_Shucked Weight,Sex_Weight_Viscera Weight,Sex_Weight_Shell Weight,Sex_Shucked Weight_Height,Sex_Shucked Weight_Weight,Sex_Shucked Weight_Viscera Weight,Sex_Shucked Weight_Shell Weight,Sex_Viscera Weight_Height,Sex_Viscera Weight_Weight,Sex_Viscera Weight_Shucked Weight,Sex_Viscera Weight_Shell Weight,Sex_Shell Weight_Height,Sex_Shell Weight_Weight,Sex_Shell Weight_Shucked Weight,Sex_Shell Weight_Viscera Weight,Length_Height_Weight,Length_Height_Shucked Weight,Length_Height_Viscera Weight,Length_Height_Shell Weight,Length_Weight_Shucked Weight,Length_Weight_Viscera Weight,Length_Weight_Shell Weight,Length_Shucked Weight_Weight,Length_Shucked Weight_Viscera Weight,Length_Shucked Weight_Shell Weight,Length_Viscera Weight_Weight,Length_Viscera Weight_Shucked Weight,Length_Viscera Weight_Shell Weight,Length_Shell Weight_Weight,Length_Shell Weight_Shucked Weight,Length_Shell Weight_Viscera Weight,Diameter_Weight_Shucked Weight,Diameter_Weight_Viscera Weight,Diameter_Weight_Shell Weight,Diameter_Shucked Weight_Viscera Weight,Diameter_Shucked Weight_Shell Weight,Diameter_Viscera Weight_Shucked Weight,Diameter_Viscera Weight_Shell Weight,Diameter_Shell Weight_Shucked Weight,Diameter_Shell Weight_Viscera Weight,Height_Shucked Weight_Viscera Weight,Height_Shucked Weight_Shell Weight,Height_Viscera Weight_Shell Weight,Height_Shell Weight_Viscera Weight,Weight_Viscera Weight_Shell Weight,Sex^2,Length^2,Diameter^2,Height^2,Weight^2,Shucked Weight^2,Viscera Weight^2,Shell Weight^2,Sex_Length_ratio,Sex_Diameter_ratio,Sex_Height_ratio,Sex_Weight_ratio,Sex_Shucked Weight_ratio,Sex_Viscera Weight_ratio,Sex_Shell Weight_ratio,Length_Sex_ratio,Length_Diameter_ratio,Length_Height_ratio,Length_Weight_ratio,Length_Shucked Weight_ratio,Length_Viscera Weight_ratio,Length_Shell Weight_ratio,Diameter_Sex_ratio,Diameter_Length_ratio,Diameter_Height_ratio,Diameter_Weight_ratio,Diameter_Shucked Weight_ratio,Diameter_Viscera Weight_ratio,Diameter_Shell Weight_ratio,Height_Sex_ratio,Height_Length_ratio,Height_Diameter_ratio,Height_Weight_ratio,Height_Shucked Weight_ratio,Height_Viscera Weight_ratio,Height_Shell Weight_ratio,Weight_Sex_ratio,Weight_Length_ratio,Weight_Diameter_ratio,Weight_Height_ratio,Weight_Shucked Weight_ratio,Weight_Viscera Weight_ratio,Weight_Shell Weight_ratio,Shucked Weight_Sex_ratio,Shucked Weight_Length_ratio,Shucked Weight_Diameter_ratio,Shucked Weight_Height_ratio,Shucked Weight_Weight_ratio,Shucked Weight_Viscera Weight_ratio,Shucked Weight_Shell Weight_ratio,Viscera Weight_Sex_ratio,Viscera Weight_Length_ratio,Viscera Weight_Diameter_ratio,Viscera Weight_Height_ratio,Viscera Weight_Weight_ratio,Viscera Weight_Shucked Weight_ratio,Viscera Weight_Shell Weight_ratio,Shell Weight_Sex_ratio,Shell Weight_Length_ratio,Shell Weight_Diameter_ratio,Shell Weight_Height_ratio,Shell Weight_Weight_ratio,Shell Weight_Shucked Weight_ratio,Shell Weight_Viscera Weight_ratio
0,1,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,0.800625,0.28875,9.04916,3.83994,1.815785,2.85763,0.209687,6.571414,2.788528,1.318606,2.075183,2.370018,1.005699,0.475563,0.748427,31.51767,14.903704,23.45501,6.324269,9.952948,4.706433,0.2096875,6.571414,2.788528,1.318606,2.075183,2.370018,1.005699,0.4755629,0.7484268,2.370018,31.51767,14.9037,23.45501,1.005699,31.51767,6.324269,9.952948,0.4755629,14.9037,6.324269,4.706433,0.7484268,23.45501,9.952948,4.706433,2.488519,1.055983,0.499341,0.785848,33.093553,15.64889,24.627761,33.093553,6.640483,10.450596,15.64889,6.640483,4.941755,24.627761,10.450596,4.941755,24.032223,11.364075,17.884445,4.822255,7.589123,4.822255,3.588655,7.589123,3.588655,1.739174,2.737061,1.294269,1.294269,40.561206,1.0,1.1025,0.581406,0.075625,74.274199,13.374274,2.990546,7.406845,0.952381,1.311475,3.636364,0.1160329,0.2734418,0.5782621,0.3674374,1.05,1.377049,3.818182,0.121835,0.287114,0.607175,0.385809,0.7625,0.72619,2.772727,0.088475,0.208499,0.440925,0.280171,0.275,0.261905,0.360656,0.031909,0.075196,0.159022,0.101045,8.618248,8.207855,11.30262,31.339084,2.356589,4.983607,3.166667,3.657085,3.482939,4.796178,13.298493,0.424342,2.114754,1.34375,1.729319,1.646971,2.26796,6.288435,0.200658,0.472868,0.635417,2.721552,2.591954,3.569249,9.896553,0.315789,0.744186,1.57377
1,1,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893,1.1625,0.8875,0.275,15.50718,7.030676,3.246018,3.96893,1.031719,0.319688,18.027093,8.173161,3.773496,4.613881,0.244063,13.762619,6.239725,2.880841,3.522425,4.264474,1.933436,0.892655,1.091456,109.025934,50.33657,61.546898,22.821699,27.904261,12.883217,0.2440625,13.76262,6.239725,2.880841,3.522425,4.264474,1.933436,0.8926549,1.091456,4.264474,109.0259,50.33657,61.5469,1.933436,109.0259,22.8217,27.90426,0.8926549,50.33657,22.8217,12.88322,1.091456,61.5469,27.90426,12.88322,4.95745,2.247619,1.037711,1.268817,126.742648,58.516263,71.548269,126.742648,26.530225,32.438703,58.516263,26.530225,14.97674,71.548269,32.438703,14.97674,96.760516,44.673706,54.622872,20.254258,24.765032,20.254258,11.433855,24.765032,11.433855,6.275967,7.673672,3.542885,3.542885,199.782323,1.0,1.351406,0.787656,0.075625,240.472523,49.430405,10.536631,15.752405,0.8602151,1.126761,3.636364,0.06448627,0.1422338,0.3080698,0.2519571,1.1625,1.309859,4.227273,0.074965,0.165347,0.358131,0.2929,0.8875,0.763441,3.227273,0.057232,0.126233,0.273412,0.223612,0.275,0.236559,0.309859,0.017734,0.039114,0.084719,0.069288,15.50718,13.339507,17.472875,56.389733,2.205645,4.777293,3.907143,7.030676,6.047893,7.921888,25.566095,0.453382,2.165939,1.771429,3.246018,2.792273,3.657485,11.803701,0.209324,0.461694,0.817857,3.96893,3.414133,4.472034,14.432473,0.255941,0.564516,1.222707
2,0,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415,1.2875e-10,9.875e-11,3.25e-11,1.457164e-09,5.556502e-10,3.883882e-10,4.819415e-10,1.271406,0.418438,18.76099,7.153996,5.000497,6.204997,0.320938,14.389497,5.487046,3.835333,4.759172,4.735784,1.805863,1.262261,1.56631,80.967363,56.594535,70.226795,21.580795,26.779089,18.718037,3.209375e-11,1.43895e-09,5.487046e-10,3.835333e-10,4.759172e-10,4.735784e-10,1.805863e-10,1.262261e-10,1.56631e-10,4.735784e-10,8.096736e-09,5.659453e-09,7.022679e-09,1.805863e-10,8.096736e-09,2.15808e-09,2.677909e-09,1.262261e-10,5.659453e-09,2.15808e-09,1.871804e-09,1.56631e-10,7.022679e-09,2.677909e-09,1.871804e-09,6.097322,2.325049,1.625162,2.016624,104.24548,72.865463,90.416998,104.24548,27.785274,34.478077,72.865463,27.785274,24.099472,90.416998,34.478077,24.099472,79.955271,55.887103,69.34896,21.311035,26.44435,21.311035,18.484061,26.44435,18.484061,7.013758,8.703204,6.083362,6.083362,272.752549,0.0,1.657656,0.975156,0.105625,212.33278,30.874714,15.084536,23.226761,7.76699e-11,1.012658e-10,3.076923e-10,6.862644e-12,1.799693e-11,2.574744e-11,2.074941e-11,12875000000.0,1.303797,3.961538,0.088357,0.231711,0.331498,0.267149,9875000000.0,0.76699,3.038462,0.067769,0.17772,0.254256,0.2049,3250000000.0,0.252427,0.329114,0.022304,0.05849,0.083679,0.067436,145716400000.0,11.317781,14.756094,44.835825,2.622449,3.751825,3.023529,55565020000.0,4.31573,5.626837,17.096929,0.381323,1.430657,1.152941,38838820000.0,3.016607,3.933045,11.950405,0.266537,0.69898,0.805882,48194150000.0,3.743235,4.88042,14.828969,0.330739,0.867347,1.240876
3,0,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,1.55e-10,9.875e-11,3.875e-11,2.837785e-09,1.338096e-09,6.548734e-10,7.030676e-10,1.530625,0.600625,43.985667,20.740494,10.150538,10.897548,0.382656,28.023126,13.213702,6.466875,6.942793,10.996417,5.185124,2.537635,2.724387,379.722983,185.839002,199.515465,87.628381,94.077222,46.04203,3.826563e-11,2.802313e-09,1.32137e-09,6.466875e-10,6.942793e-10,1.099642e-09,5.185124e-10,2.537635e-10,2.724387e-10,1.099642e-09,3.79723e-08,1.85839e-08,1.995155e-08,5.185124e-10,3.79723e-08,8.762838e-09,9.407722e-09,2.537635e-10,1.85839e-08,8.762838e-09,4.604203e-09,2.724387e-10,1.995155e-08,9.407722e-09,4.604203e-09,17.044446,8.036942,3.933334,4.2228,588.570623,288.050453,309.248971,588.570623,135.82399,145.819695,288.050453,135.82399,71.365147,309.248971,145.819695,71.365147,374.976445,183.516015,197.021522,86.533026,92.901257,86.533026,45.466505,92.901257,45.466505,33.955997,36.454924,17.841287,17.841287,1306.573812,0.0,2.4025,0.975156,0.150156,805.302342,179.050198,42.885924,49.430405,6.451613e-11,1.012658e-10,2.580645e-10,3.523875e-12,7.473303e-12,1.527013e-11,1.422338e-11,15500000000.0,1.56962,4.0,0.05462,0.115836,0.236687,0.220462,9875000000.0,0.637097,2.548387,0.034798,0.073799,0.150792,0.140456,3875000000.0,0.25,0.392405,0.013655,0.028959,0.059172,0.055116,283778500000.0,18.30829,28.737063,73.23316,2.120763,4.333333,4.03629,133809600000.0,8.63288,13.550343,34.53152,0.471528,2.04329,1.903226,65487340000.0,4.22499,6.63163,16.89996,0.230769,0.489407,0.931452,70306760000.0,4.53592,7.119672,18.14368,0.247752,0.525424,1.073593
4,1,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066,1.1125,0.85,0.2625,11.76504,5.528153,2.466407,3.331066,0.945625,0.292031,13.08861,6.15007,2.743877,3.705811,0.223125,10.000286,4.69893,2.096446,2.831406,3.088324,1.45114,0.647432,0.874405,65.038949,29.017377,39.190136,13.634671,18.414642,8.215763,0.223125,10.00029,4.69893,2.096446,2.831406,3.088324,1.45114,0.6474317,0.8744049,3.088324,65.03895,29.01738,39.19014,1.45114,65.03895,13.63467,18.41464,0.6474317,29.01738,13.63467,8.215763,0.8744049,39.19014,18.41464,8.215763,3.43576,1.614393,0.720268,0.972775,72.355831,32.281832,43.599026,72.355831,15.168572,20.486289,32.281832,15.168572,9.140037,43.599026,20.486289,9.140037,55.283107,24.664771,33.311616,11.589471,15.652446,11.589471,6.983399,15.652446,6.983399,3.579101,4.833844,2.156638,2.156638,96.658806,1.0,1.237656,0.7225,0.068906,138.416225,30.56047,6.083161,11.096002,0.8988764,1.176471,3.809524,0.08499757,0.1808923,0.4054482,0.3002042,1.1125,1.308824,4.238095,0.09456,0.201243,0.451061,0.333977,0.85,0.764045,3.238095,0.072248,0.153758,0.344631,0.255174,0.2625,0.235955,0.308824,0.022312,0.047484,0.10643,0.078804,11.76504,10.575319,13.841226,44.81921,2.128205,4.770115,3.531915,5.528153,4.969126,6.503709,21.059629,0.46988,2.241379,1.659574,2.466407,2.216995,2.901655,9.395834,0.209639,0.446154,0.740426,3.331066,2.994217,3.918901,12.689776,0.283133,0.602564,1.350575


In [17]:
def evaluate_model(model, X, y, n_splits=10):
    """
    Evaluates the given model using cross-validation and calculates the Mean Absolute Errors.

    Parameters:
    -----------
    model (estimator object): The model to be evaluated.
    X (DataFrame): The feature matrix.
    y (Series): The target variable.
    n_splits (int): The number of folds for cross-validation.

    Returns:
    --------
    mae_scores (list): A list of MAE for each fold.
    """
    # Initialize a list to store the MAE
    mae_scores = []

    # Create a KFold object for cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=5)

    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        # Split the data into train and test sets for the current fold
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the training data
        model.fit(X_train_cv, y_train_cv)

        # Predict for the test
        y_pred = model.predict(X_test_cv)

        # Round y_pred to the nearest integer
        y_pred = np.round(y_pred).astype(int)

        # Calculate the MAE for the current fold
        mae = mean_absolute_error(y_test_cv, y_pred)
        mae_scores.append(mae)

    # Return the MAE, rounded to 3 decimal places
    return [round(value, 3) for value in mae_scores]

In [18]:
# Initialize the models

models = {
    "LightGBM": lgb.LGBMRegressor(random_state=5),
    # "CatBoost": CatBoostRegressor(silent=True, random_seed=5),
}

In [19]:
# Evaluate each model using the best subset of features
for name, model in models.items():
    """
    Loops through each model, and evaluates the model using cross-validation. 
    Prints the MAE scores, average MAE, and standard deviation
    for each model.
    """
    #Initialize SFS with the current model
    sfs = SFS(model,
              k_features="best",
              forward=True,
              floating=True,
              scoring="neg_mean_absolute_error",
              cv=3,
              n_jobs=-1)
    
    # Perform SFS on the training data
    sfs = sfs.fit(X_train, y_train)

    # Get the selected features
    selected_features = X_train.columns[list(sfs.k_feature_idx_)]

    #Print the results for the current model
    print(f"Model: {name}")
    print(f"Selected features: {selected_features}")

    # Evaluate the model using cross-validation with the selected features
    mae_scores = evaluate_model(model, X_train[selected_features], y_train)
    mean_mae = np.mean(mae_scores)
    std = np.std(mae_scores)

    # Train the model on the training set
    model.fit(X_train[selected_features], y_train)

    # Predict for the test set
    y_test_pred = model.predict(X_test[selected_features])

    df_test["Age"] = np.round(y_test_pred).astype(int)

    # Save the output DataFrame to a CSV file
    df_test[["id", "Age"]].to_csv(f"submission_{name}.csv", index=False)

    # Print the results for the current model
    print(f"Model: {name}")
    print(f"MAE Scores: {mae_scores}")
    print(f"Average MAE: {mean_mae:.3f}")
    print(f"Std Deviation: {std:.3f}")

    # try:
    #     plt.figure(figsize=(10, 7))
    #     plt.plot(model.feature_importances_, label=name)
    #     plt.xticks(np.arange(X_train.shape[1]), X_train.columns.tolist(), rotation=90)
    #     plt.legend()
    
    # except AttributeError: # Incase the model does not have "feature_importances_"
    #     pass
    
    print()


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

TypeError: 'NoneType' object is not iterable