In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder

# Load the raw obesity dataset; later cells still refer to it as `data`.
data = pd.read_csv("obesity_raw.csv")

# Working frame used by the encoding and modelling steps that follow.
df = pd.DataFrame(data)

# Preview the first rows, then the per-column summary from DataFrame.info().
print("DataFrame head:", data.head(), sep="\n")
print("\nDataFrame info:")
df.info()

# Encode the 'Gender' and 'Label' columns as integer codes.
# Each column gets its own LabelEncoder: refitting one shared instance on
# 'Label' would overwrite the mapping learned for 'Gender', leaving no way to
# translate the gender codes back to their original categories.
gender_encoder = LabelEncoder()
label_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df['Label'] = label_encoder.fit_transform(df['Label'])
# NOTE(review): LabelEncoder numbers classes in sorted order of their values,
# so the 'Label' codes are only meaningful as a numeric regression target if
# that order matches the intended obesity scale - confirm against the data.

# Define independent (X) and dependent (y) variables
X = df[['Age', 'Gender']]
y = df['Label']

# Least-squares fit of y on X; fit() hands back the fitted estimator itself.
model = LinearRegression().fit(X, y)

# Predictions for the same rows the model was fitted on
y_pred = model.predict(X)

# Error and goodness-of-fit of those predictions
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

# Report the coefficients (one per column of X, in column order) and metrics
age_coef, gender_coef = model.coef_
print("Regression Coefficients:")
print(f"Age: {age_coef}, Gender_encoded: {gender_coef}")
print(f"Intercept: {model.intercept_}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Side-by-side table of observed and fitted values
results_table = pd.DataFrame(
    {"Actual Obesity Level": y, "Predicted Obesity Level": y_pred}
)
print("\nResults Table:")
print(results_table)

# Actual vs. predicted obesity levels, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y, y=y_pred, color="blue", label="Predictions", ax=ax)

# Dashed diagonal spanning the observed range: points on it are exact predictions
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], color="red", linestyle="--", label="Ideal Fit")

ax.set_xlabel("Actual Obesity Level")
ax.set_ylabel("Predicted Obesity Level")
ax.set_title("Actual vs Predicted Obesity Levels")
ax.legend()
ax.grid(True)
plt.show()

# statsmodels does not add an intercept on its own, so prepend a constant column
X_const = sm.add_constant(X)

# Specify the OLS model (y as response, X_const as design matrix), then fit it
ols_spec = sm.OLS(endog=y, exog=X_const)
model_sm = ols_spec.fit()

# Full statistical summary of the fitted regression
print(model_sm.summary())
'''
The code performs a linear regression analysis to predict obesity levels based on age and gender. Initially, the data is organized into a
Pandas DataFrame. The `Gender` and `Label` columns are encoded into numerical values using `LabelEncoder`. The independent variables (`Age` and encoded `Gender`) and the dependent
variable (the encoded `Label` column, i.e. the obesity level) are defined. A linear regression model is fitted using the `LinearRegression` class from `sklearn`. Predictions are made, and the model is evaluated
using metrics like Mean Squared Error (MSE) and R-squared (R²). The regression coefficients, intercept, and evaluation metrics are printed. A results table is created to compare
actual and predicted obesity levels. Additionally, a scatter plot is generated to visualize the relationship between actual and predicted values, with a red line indicating the
ideal fit. Finally, the `statsmodels` library is used to fit the regression model and provide a detailed statistical summary of the results.
'''

In [None]:
from sklearn.model_selection import train_test_split

# Build the features and target from `df`, the frame whose 'Gender' and
# 'Label' columns were label-encoded earlier in the notebook. The raw `data`
# frame is not a safe source: whether it reflects the encoding written to
# `df` depends on how the installed pandas version shares data between the
# two frames, and if 'Gender' is still text there the classifiers trained on
# this split cannot be fitted.

# Define independent variables (features) X
X = df[['Age', 'Gender']]

# Define dependent variable (target) y
y = df['Label']

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Build and train the decision tree in one step (fit() returns the estimator);
# the fixed random_state keeps the fitted tree reproducible.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Report test-set accuracy
print(f"Decision Tree Classifier Accuracy: {accuracy_dt:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build and train the random forest in one step (fit() returns the estimator);
# the fixed random_state keeps the fitted forest reproducible.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report test-set accuracy
print(f"Random Forest Classifier Accuracy: {accuracy_rf:.4f}")

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Instantiate the Neural Network model.
# MLPs are sensitive to feature scale, so the inputs are standardised first.
# Keeping the scaler inside a pipeline means it is fitted on the training
# split only and the same transform is re-applied to the test split when
# predicting.
nn_model = make_pipeline(
    StandardScaler(),
    MLPClassifier(random_state=42, max_iter=1000),  # max_iter raised above the default to give the optimiser room to converge
)

# Train the model (fits the scaler, then the network on the scaled features)
nn_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_nn = nn_model.predict(X_test)

# Calculate accuracy
accuracy_nn = accuracy_score(y_test, y_pred_nn)

# Print the accuracy score
print(f"Neural Network Model Accuracy: {accuracy_nn:.4f}")

In [None]:
# Column names of the training features and the importances stored on the fitted forest
feature_names = X_train.columns
feature_importances = rf_model.feature_importances_

# Tabulate the pairs and rank them, most important feature first
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Bar chart of the ranked importances on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='Feature', y='Importance', data=importance_df, ax=ax)
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.grid(True)
plt.show()