In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, cross_val_score

In [2]:
csv_file_path = "data_with_junk_food_solid.csv"
df = pd.read_csv(csv_file_path)

df_numeric = df.drop(columns=["Food code", "Main food description", "WWEIA Category description"])

X = df_numeric.drop(columns=["Junk Food"])
y = df_numeric["Junk Food"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
k_5_fold = KFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=k_5_fold, scoring='accuracy')
#     results.append({"Model": name, "Accuracy": accuracy, "F1 Score": f1})
    results.append({"Model": name, "Accuracy": accuracy, "F1 Score": f1, "Mean Accuracy":cv_scores.mean()
                    , "Std":cv_scores.std()})

results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print(results_df)

                    Model  Accuracy  F1 Score  Mean Accuracy       Std
1       Gradient Boosting  0.998883  0.998870       0.998322  0.002712
0           Random Forest  0.987709  0.985700       0.985463  0.004295
4     K-Nearest Neighbors  0.978771  0.974398       0.978755  0.004446
3  Support Vector Machine  0.976536  0.965964       0.974840  0.002330
2     Logistic Regression  0.972067  0.964962       0.969528  0.005687
