In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

## Assessing the Dataset

In [None]:
# Path of the source CSV, given relative to the notebook's working directory.
DATA_PATH = "obesity_data.csv"
df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check of the load.
df.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

## Visualizing the data with histograms

In [None]:
# Split the columns by dtype; both lists are reused by later cells.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Two-row grid of histograms (with KDE overlay), one panel per numerical column.
# The column count is rounded UP so an odd number of features still fits, and
# squeeze=False keeps `ax` two-dimensional even when the grid has one column.
n_cols = max(1, (len(numerical_features) + 1) // 2)
fig, ax = plt.subplots(2, n_cols, figsize=(20, 8), squeeze=False)
for i, feature in enumerate(numerical_features):
  row, col = divmod(i, n_cols)
  sns.histplot(df[feature], ax=ax[row, col], kde=True)
  ax[row, col].set_title(feature)

# Hide any trailing panels that received no feature.
for unused_ax in ax.flat[len(numerical_features):]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Split the columns by dtype; both lists are reused by later cells.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Two-row grid of horizontal box plots, one panel per numerical column.
# The panel position must be derived from the NUMERICAL feature count (the grid
# is sized from it); using the categorical count only works when both counts
# happen to give the same number of columns.
# The column count is rounded up so an odd number of features still fits, and
# squeeze=False keeps `ax` two-dimensional even when the grid has one column.
n_cols = max(1, (len(numerical_features) + 1) // 2)
fig, ax = plt.subplots(2, n_cols, figsize=(20, 8), squeeze=False)
for i, feature in enumerate(numerical_features):
  row, col = divmod(i, n_cols)
  sns.boxplot(df[feature], ax=ax[row, col], orient='h')
  ax[row, col].set_title(feature)

# Hide any trailing panels that received no feature.
for unused_ax in ax.flat[len(numerical_features):]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

### NCP: How many main meals do you have daily?

In [None]:
# Round NCP to the nearest whole number (no dtype cast is applied here).
df['NCP'] = df['NCP'].round()

In [None]:
# Three-row grid of count plots, one panel per categorical column.
# The column count is rounded UP so a feature count that is not a multiple of
# three still fits, and squeeze=False keeps `ax` two-dimensional even when the
# grid collapses to a single column.
n_cols = max(1, (len(categorical_features) + 2) // 3)
fig, ax = plt.subplots(3, n_cols, figsize=(20, 20), squeeze=False)

for i, feature in enumerate(categorical_features):
    row, col = divmod(i, n_cols)
    # Human-readable form of the column name, used for title and axis label.
    pretty_name = feature.replace('_', ' ').title()
    sns.countplot(data=df, x=feature, hue=feature, ax=ax[row, col])
    ax[row, col].set_title(f"Countplot for {pretty_name}")
    ax[row, col].tick_params(axis='x', rotation=90)
    ax[row, col].set_xlabel(pretty_name)

# Hide any trailing panels that received no feature.
for unused_ax in ax.flat[len(categorical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Integer-encode every categorical column.
# A separate LabelEncoder is fitted per column: re-fitting one shared encoder
# on each column in turn leaves it holding only the classes of whichever
# column was processed last, which makes `classes_` / `inverse_transform`
# unusable for all the others.
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    df[feature] = label_encoders[feature].fit_transform(df[feature])

# `encoder` is read later for the confusion-matrix tick labels, so bind it
# explicitly to the target column's encoder instead of relying on column order.
if 'NObeyesdad' in label_encoders:
    encoder = label_encoders['NObeyesdad']
elif label_encoders:
    # Target was not among the categorical columns: fall back to the previous
    # behaviour (the encoder of the last processed column).
    encoder = label_encoders[categorical_features[-1]]
else:
    encoder = LabelEncoder()

df.head()

In [None]:
# Re-inspect the frame after the encoding cell to check the resulting
# column dtypes and non-null counts.
df.info()

In [None]:
# Pairwise correlation between every column of the frame; `corr` is reused
# by the target-correlation cell below.
corr = df.corr()

plt.figure(figsize=(14, 14))
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
# sns.heatmap returns the Axes it drew on, so the title is set on it directly.
heatmap_axes = sns.heatmap(corr, **heatmap_options)
heatmap_axes.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Correlation of every column with the target, strongest positive first.
# Note the series still contains the target's correlation with itself.
target_corr = corr['NObeyesdad'].sort_values(ascending=False)

labels = target_corr.index

plt.figure(figsize=(10, 6))
sns.barplot(x=target_corr.values, y=labels, palette='viridis', hue=target_corr.values)
# Reference line separating positive from negative correlations.
plt.axvline(x=0, color='red', linestyle='--')
for i, v in enumerate(target_corr.values):
    # Put the value label just past the END of the bar: to the right for
    # positive bars, to the left for negative ones. A fixed `v + 0.01` would
    # draw the label on top of every negative bar.
    if v >= 0:
        offset, align = 0.01, 'left'
    else:
        offset, align = -0.01, 'right'
    plt.text(v + offset, i, f"{v:.3f}", color='red', va='center', ha=align, fontsize=10)
plt.legend(title="Corr Coefficient", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title("Correlation with Target Variable")
plt.xlabel("Correlation Coefficient")
plt.ylabel("Features")
plt.show()

In [None]:
# Feature selection with SelectKBest scored by mutual information, which can
# handle both categorical and continuous features.
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

# Hold out 20% of the rows; the selector is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# For a dense matrix mutual_info_classif treats every column as continuous
# unless `discrete_features` says otherwise, so flag the label-encoded
# categorical columns explicitly. It also adds small random noise to the
# continuous columns, so pin `random_state` to make the scores (and therefore
# the selected feature set) reproducible across runs.
discrete_mask = X.columns.isin(categorical_features)
mi_score = partial(mutual_info_classif, discrete_features=discrete_mask, random_state=42)

# Feature selection using SelectKBest (select top 10 features)
selector = SelectKBest(score_func=mi_score, k=10)
selector.fit(X_train, y_train)
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)
print("Dropped Features:", X.columns[~selector.get_support()])

In [None]:
# Selector score of every candidate feature, highest first.
scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
# sns.barplot returns the Axes it drew on; the rest of the cell works on it.
score_axes = sns.barplot(x=scores.values, y=scores.index, palette="viridis", hue=scores.values)
# Write each score just to the right of its bar.
for position, score in enumerate(scores.values):
    score_axes.text(score + 0.01, position, f"{score:.3f}", color='red', va='center', fontsize=10)
score_axes.set_title("Feature Importance Scores (mutual_info_classif)")
score_axes.set_xlabel("Score")
score_axes.set_ylabel("Feature")
plt.tight_layout()
plt.show()

In [None]:
# Standardise the selected features. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()

# The scaler returns plain arrays, so rebuild DataFrames with the original
# column names AND the original row index. Without `index=` the frames would
# get a fresh 0..n-1 index and no longer align with y_train / y_test in any
# index-based pandas operation.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[selected_features]),
    columns=selected_features,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[selected_features]),
    columns=selected_features,
    index=X_test.index,
)



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV


# Base estimator and the hyper-parameter grid to search exhaustively.
# The grid spans 3 * 4 * 3 * 3 * 3 * 2 * 2 = 1296 combinations, each of which
# is fitted once per cross-validation fold.
rf = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
    class_weight=[None, 'balanced'],
)

# 3-fold cross-validated search on the scaled training split, using all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best estimator found by the search on the held-out test split.
y_pred = best_rf.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix heatmap. The tick labels come from the `encoder` fitted in
# the encoding cell, so they are only meaningful if its `classes_` correspond
# to the target column.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
class_names = encoder.classes_
cm_axes = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("True")
plt.show()