In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.linear_model as linear_model
import sklearn.ensemble as ensemble
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
import sklearn.preprocessing as preprocessing

import optuna

# ROOT = '/kaggle/input/playground-series-s4e2'
ROOT = 'competition_data'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv(os.path.join(ROOT, 'train.csv'))

In [3]:
X_raw = df_train.drop(columns=["NObeyesdad"])
y = df_train["NObeyesdad"]

def feature_eng(df_X_raw):
    # Set aside ID column for now
    X_id = df_X_raw[["id"]]

    # Convert categorical data to numerical features
    cat_cols = ["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"]
    feature_cats = [["Female", "Male"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["Automobile", "Motorbike", "Public_Transportation", "Bike", "Walking"]]
    enc = preprocessing.OrdinalEncoder(categories=feature_cats)
    X_encoded = enc.fit_transform(df_X_raw[cat_cols])
    X_cat = pd.DataFrame(X_encoded, df_X_raw.index, cat_cols)

    # Normalize numerical features
    norm_cols = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
    # scaler = preprocessing.StandardScaler()
    # X_norm = scaler.fit_transform(df_X_raw[norm_cols])
    # X_norm = pd.DataFrame(X_norm, df_X_raw.index, norm_cols)
    X_norm = df_X_raw[norm_cols]

    X = pd.concat([X_id, X_norm, X_cat], axis=1)

    # New features with some predictive power
    X["BMI"] = X["Weight"] / X["Height"]**2
    X["Veg_and_water"] = X["FCVC"] * X["CH2O"]
    X["Discounted_activity"] = X["FAF"] / (X["TUE"] + 1)
    X["High_cal_snacking"] = X["FAVC"] * X["CAEC"]
    return X

X = feature_eng(X_raw)

Some quick model tests:
- Logistic regression = 67-73% accuracy (with data normalization)
- Linear SVM = too slow
- Basic Decision Tree = 77-85% accuracy
- Random Forest = 89.4-90.4% accuracy
- Histogram-based Gradient Boosting = 82.2-90.8% accuracy (and takes only 10s to evaluate)

In [4]:
# Evaluate logistic regression on this dataset
# model = linear_model.LogisticRegression(max_iter=1000)
# model = svm.SVC(kernel='linear')
# model = tree.DecisionTreeClassifier()
# model = ensemble.RandomForestClassifier()
model = ensemble.HistGradientBoostingClassifier()

scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(scores)

# y_pred = model.predict(X_test)
# accuracy = metrics.accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

[0.87837187 0.89450867 0.9077553  0.89809684 0.83594315]


In [5]:
# Hyperparameter optimization with optuna
def objective(trial):
    # Define the search space
    max_depth = trial.suggest_int('max_depth', 2, 10)
    n_estimators = trial.suggest_int('n_estimators', 10, 300)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)

    # Instantiate the model
    model = ensemble.HistGradientBoostingClassifier(max_depth=max_depth, 
                                                    max_iter=n_estimators,
                                                    learning_rate=learning_rate, 
                                                    max_leaf_nodes=max_leaf_nodes)

    # Evaluate the model
    scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
    accuracy = scores.mean()
    return accuracy

In [6]:
# # Comment out this block when scoring on the test data
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

[I 2024-02-08 15:20:27,929] A new study created in memory with name: no-name-b57071cc-48f4-4ae0-b358-448b5b026ff3
[I 2024-02-08 15:21:32,955] Trial 0 finished with value: 0.8733502477987756 and parameters: {'max_depth': 7, 'n_estimators': 271, 'max_leaf_nodes': 70, 'learning_rate': 0.0004917019689861579}. Best is trial 0 with value: 0.8733502477987756.
[I 2024-02-08 15:22:18,789] Trial 1 finished with value: 0.8952210600876638 and parameters: {'max_depth': 6, 'n_estimators': 235, 'max_leaf_nodes': 71, 'learning_rate': 0.008625953261228283}. Best is trial 1 with value: 0.8952210600876638.
[I 2024-02-08 15:22:42,416] Trial 2 finished with value: 0.8925235416959676 and parameters: {'max_depth': 7, 'n_estimators': 85, 'max_leaf_nodes': 95, 'learning_rate': 0.013150191392593767}. Best is trial 1 with value: 0.8952210600876638.
[I 2024-02-08 15:22:57,360] Trial 3 finished with value: 0.8852960425999447 and parameters: {'max_depth': 10, 'n_estimators': 132, 'max_leaf_nodes': 65, 'learning_rat

In [7]:
# Create the model with the optimal hyperparameters
model = ensemble.HistGradientBoostingClassifier(max_depth=8,
                                                max_iter=220,
                                                learning_rate=0.0088,
                                                max_leaf_nodes=76)

scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(scores)


[0.89258189 0.89210019 0.9026975  0.90002409 0.89086967]


In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model on training set to reduce model bias
# model.fit(X_train, y_train)
model.fit(X, y)
y_pred_test = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred_test)
print(f'Accuracy: {accuracy:.3f}')


Accuracy: 0.932


In [18]:
# # OPTIONAL: Examine misclassified samples from the training set
# y_side_by_side = pd.DataFrame({"y_test": y_test, "y_pred": y_pred_test})
# y_diff = y_side_by_side[y_side_by_side["y_test"] != y_side_by_side["y_pred"]]
# label_cats = [["Insufficient_Weight", "Normal_Weight", "Overweight_Level_I", "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"],
#               ["Insufficient_Weight", "Normal_Weight", "Overweight_Level_I", "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"]]
# enc = preprocessing.OrdinalEncoder(categories=label_cats)
# y_diff = enc.fit_transform(y_diff)
# y_diff = pd.DataFrame(y_diff, columns=["y_test", "y_pred"])

# y_diff["class_diff"] = np.abs(y_diff["y_test"] - y_diff["y_pred"])
# np.count_nonzero(y_diff["class_diff"] > 1)


38

In [9]:
# Make final predictions
df_test = pd.read_csv(os.path.join(ROOT, 'test.csv'))
X_test_final = feature_eng(df_test)

y_pred = model.predict(X_test_final)
df_test["NObeyesdad"] = y_pred

In [10]:
# Export results for submission
df_test[["id", "NObeyesdad"]].to_csv("submission.csv", index=False)