In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.linear_model as linear_model
import sklearn.ensemble as ensemble
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
import sklearn.preprocessing as preprocessing

# ROOT = '/kaggle/input/playground-series-s4e2'
ROOT = 'competition_data'

In [2]:
df_train = pd.read_csv(os.path.join(ROOT, 'train.csv'))

In [3]:
X_raw = df_train.drop(columns=["NObeyesdad"])
y = df_train["NObeyesdad"]

def feature_eng(df_X_raw):
    # Set aside ID column for now
    X_id = df_X_raw[["id"]]

    # Convert categorical data to numerical features
    cat_cols = ["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"]
    feature_cats = [["Female", "Male"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["Automobile", "Motorbike", "Public_Transportation", "Bike", "Walking"]]
    enc = preprocessing.OrdinalEncoder(categories=feature_cats)
    X_encoded = enc.fit_transform(df_X_raw[cat_cols])
    X_cat = pd.DataFrame(X_encoded, df_X_raw.index, cat_cols)

    # Normalize numerical features
    norm_cols = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
    # scaler = preprocessing.StandardScaler()
    # X_norm = scaler.fit_transform(df_X_raw[norm_cols])
    # X_norm = pd.DataFrame(X_norm, df_X_raw.index, norm_cols)
    X_norm = df_X_raw[norm_cols]

    X = pd.concat([X_id, X_norm, X_cat], axis=1)

    # New features with some predictive power
    X["BMI"] = X["Weight"] / X["Height"]**2
    X["Veg_and_water"] = X["FCVC"] * X["CH2O"]
    X["Discounted_activity"] = X["FAF"] / (X["TUE"] + 1)
    return X

X = feature_eng(X_raw)

Some quick model tests:
- Logistic regression = 67-73% accuracy (with data normalization)
- Linear SVM = too slow
- Basic Decision Tree = 77-85% accuracy
- Random Forest = 89.4-90.4% accuracy
- Histogram-based Gradient Boosting = 81.1-90.2% accuracy (and takes only 10s to evaluate)

In [7]:
# Evaluate logistic regression on this dataset
# model = linear_model.LogisticRegression(max_iter=1000)
# model = svm.SVC(kernel='linear')
# model = tree.DecisionTreeClassifier()
# model = ensemble.RandomForestClassifier()
model = ensemble.HistGradientBoostingClassifier()

scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(scores)

# y_pred = model.predict(X_test)
# accuracy = metrics.accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

[0.86319846 0.8966763  0.90004817 0.90243315 0.81112985]


In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model on training set to reduce model bias
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred_test)
print(f'Accuracy: {accuracy:.3f}')


Accuracy: 0.903


In [9]:
# Make final predictions
df_test = pd.read_csv(os.path.join(ROOT, 'test.csv'))
X_test_final = feature_eng(df_test)

y_pred = model.predict(X_test_final)
df_test["NObeyesdad"] = y_pred

In [10]:
# Export results for submission
df_test[["id", "NObeyesdad"]].to_csv("submission.csv", index=False)