In [13]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.linear_model as linear_model
import sklearn.ensemble as ensemble
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
import sklearn.preprocessing as preprocessing

# ROOT = '/kaggle/input/playground-series-s4e2'
ROOT = 'competition_data'

In [14]:
df_train = pd.read_csv(os.path.join(ROOT, 'train.csv'))

In [15]:
X_raw = df_train.drop(columns=["NObeyesdad"])
y = df_train["NObeyesdad"]

def feature_eng(df_X_raw):
    # Convert categorical data to numerical features
    X_id = df_X_raw[["id"]]

    cat_cols = ["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"]
    feature_cats = [["Female", "Male"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["yes", "no"],
                    ["yes", "no"],
                    ["no", "Sometimes", "Frequently", "Always"],
                    ["Automobile", "Motorbike", "Public_Transportation", "Bike", "Walking"]]
    enc = preprocessing.OrdinalEncoder(categories=feature_cats)
    X_encoded = enc.fit_transform(df_X_raw[cat_cols])
    X_cat = pd.DataFrame(X_encoded, df_X_raw.index, cat_cols)

    norm_cols = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
    scaler = preprocessing.StandardScaler()
    X_norm = scaler.fit_transform(df_X_raw[norm_cols])
    X_norm = pd.DataFrame(X_norm, df_X_raw.index, norm_cols)

    X = pd.concat([X_id, X_norm, X_cat], axis=1)
    return X

X = feature_eng(X_raw)

Some quick model tests:
- Logistic regression = 67-73% accuracy
- Linear SVM = too slow
- Basic Decision Tree = 77-85% accuracy
- Random Forest = 88.7-90.5% accuracy
- Histogram-based Gradient Boosting = 89.0-90.5% accuracy (and takes only 10s to evaluate)

In [16]:
# Evaluate logistic regression on this dataset
# model = linear_model.LogisticRegression(max_iter=1000)
# model = svm.SVC(kernel='linear')
# model = tree.DecisionTreeClassifier()
# model = ensemble.RandomForestClassifier()
# model = ensemble.HistGradientBoostingClassifier()

# scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
# print(scores)

# y_pred = model.predict(X_test)
# accuracy = metrics.accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

[0.88342967 0.89402697 0.90582852 0.90146953 0.81498434]


In [17]:
# Fit model on all training data and predict on test data
model.fit(X, y)

df_test = pd.read_csv(os.path.join(ROOT, 'test.csv'))
X_test = feature_eng(df_test)

y_pred = model.predict(X_test)
df_test["NObeyesdad"] = y_pred

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation,Obesity_Type_II
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation,Obesity_Type_III
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation,Obesity_Type_I
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation,Obesity_Type_III


In [18]:
# Export results for submission
df_test[["id", "NObeyesdad"]].to_csv("submission.csv", index=False)