In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import wandb
import plotly.express as px
import plotly.graph_objects as go

import pandas as pd
from feature_engineering import create_features

# Loading the data
obesity_data = pd.read_csv("/work/merged_dataset.csv")
obesity_data = create_features(obesity_data)
obesity_data = obesity_data.fillna(obesity_data.median(numeric_only=True))  # Replacing missing values with median for numeric columns only

le = LabelEncoder()
# Apply LabelEncoder to each column with categorical data separately
for col in obesity_data.columns:
    if obesity_data[col].dtype == 'object' or obesity_data[col].dtype.name == 'category':
        obesity_data[col] = le.fit_transform(obesity_data[col])

# Splitting data into features and target variable
X = obesity_data.drop(["NObeyesdad", "id"], axis=1)
y = obesity_data["NObeyesdad"]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

wandb.init(project="obesity_prediction_model", entity="herczeg-gyrgy")

# Decision Tree Classifier
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
# Evaluating performance
print("Decision Tree - F1 score:", f1_score(y_test, y_pred, average="weighted"))
print("Decision Tree - Accuracy:", accuracy_score(y_test, y_pred))
print("Decision Tree - Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Decision Tree - Recall:", recall_score(y_test, y_pred, average="weighted"))

# Other classifiers implementations
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=5000, n_jobs=-1),
    "K-Nearest Neighbors": KNeighborsClassifier(n_jobs=-1),
    "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Neural Network": MLPClassifier(random_state=42, max_iter=5000),
    "Ridge Classifier": RidgeClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42, n_jobs=-1),
}

metrics = []

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1 = f1_score(y_test, y_pred, average="weighted")
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    print(f"\n{name} - F1 score:", f1)
    print(f"{name} - Accuracy:", accuracy)
    print(f"{name} - Precision:", precision)
    print(f"{name} - Recall:", recall)
    metrics.append({
        "model": name,
        "F1 score": f1,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall
    })

wandb.log({"metrics": metrics})

for metric in metrics:
    # Feature importance for tree-based classifiers
    if hasattr(classifiers[metric["model"]], "feature_importances_"):
        feature_importance = classifiers[metric["model"]].feature_importances_
        feature_data = pd.DataFrame({
            "Feature": X_train.columns,
            "Importance": feature_importance
        })
        wandb_table = wandb.Table(dataframe=feature_data)
        wandb.log({f"{metric['model']} Feature Importance": wandb.plot.bar(wandb_table, "Feature", "Importance", title=f"{metric['model']} Feature Importance")})

Decision Tree - F1 score: 0.8502434520163922
Decision Tree - Accuracy: 0.84980323567993
Decision Tree - Precision: 0.8509181743895884
Decision Tree - Recall: 0.84980323567993
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Logistic Regression - F1 score: 0.8521645380752952
Logistic Regression - Accuracy: 0.8533012680367293
Logistic Regression - Precision: 0.8516649925983462
Logistic Regression - Recall: 0.8533012680367293

K-Nearest Neighbors - F1 score: 0.8352647634436812
K-Nearest Neighbors - Accuracy: 0.8358111062527328
K-Nearest Neighbors - Precision: 0.8352186253433749
K-Nearest Neighbors - Recall: 0.8358111062527328

Random Forest - F1 score: 0.9018746642941