In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Ask kagglehub for the fitness-tracker dataset and keep the value it returns
# so that later cells can refer to it.
nadeemajeedch_fitness_tracker_dataset_path = kagglehub.dataset_download(
    'nadeemajeedch/fitness-tracker-dataset'
)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for input_file_path in (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Load Dataset and EDA**

In [None]:
# Locate the CSV before reading it. The file only sits under /kaggle/input
# inside a Kaggle session; elsewhere it lives in the directory returned by
# kagglehub.dataset_download() in the first cell. /kaggle/input is searched
# first, so a file at the original location is still the one that gets loaded.
CSV_NAME = 'gym_members_exercise_tracking_synthetic_data.csv'

search_roots = ['/kaggle/input']
# The download cell is optional (its own comment says it may be deleted), so
# only use its result when the variable actually exists.
downloaded_dir = globals().get('nadeemajeedch_fitness_tracker_dataset_path')
if downloaded_dir:
    search_roots.append(downloaded_dir)

csv_path = next(
    (
        os.path.join(folder, CSV_NAME)
        for search_root in search_roots
        for folder, _dirs, file_names in os.walk(search_root)
        if CSV_NAME in file_names
    ),
    None,
)
if csv_path is None:
    raise FileNotFoundError(
        f"Could not find {CSV_NAME!r} under any of: {search_roots}"
    )

data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Dataset information (columns, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()

In [None]:
# Descriptive statistics.
# Left as the cell's last expression so the notebook renders the summary as a
# formatted table rather than as plain printed text.
data.describe()


**Age: Median age is around 33 years. Most participants are young adults aged 20–40. BMI: Median BMI is approximately 19.96, indicating a healthy average range. Calories_Burned: Median is ~1,034, with a broad range from 303 to 1,783.**


In [None]:
# Count the null entries in each column and print the per-column totals.
null_counts = data.isna().sum()
print(null_counts)

In [None]:
# Handle missing values.

# Convert 'Max_BPM' to numeric FIRST; entries that cannot be parsed become NaN.
# If the conversion ran after the imputations below, a text-typed 'Max_BPM'
# would be treated as categorical and have its gaps filled with the mode
# instead of the median used for every other numerical column.
data['Max_BPM'] = pd.to_numeric(data['Max_BPM'], errors='coerce')

# Numerical columns: fill missing values with the column median.
num_cols = data.select_dtypes(include=['float64']).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Categorical columns: fill missing values with the column mode
# (.iloc[0] picks the first mode when several values tie).
cat_cols = data.select_dtypes(include=['object']).columns
data[cat_cols] = data[cat_cols].fillna(data[cat_cols].mode().iloc[0])

# 'Max_BPM' may have been coerced to an integer dtype (no NaN at that point) or
# be entirely NaN-free already; this final fill is a cheap safeguard that keeps
# the column free of nulls in either case.
data['Max_BPM'] = data['Max_BPM'].fillna(data['Max_BPM'].median())

In [None]:
# Re-count nulls per column now that the cleaning cell has run.
missing_values = data.isna().sum()
print("Missing Values after Cleaning:", missing_values, sep="\n")

**After cleaning, all missing values were handled. Numerical columns had missing values filled with their medians, and categorical columns with their modes.**

In [None]:
# Histograms (with a KDE overlay) for the main numerical features.
sns.set(style="whitegrid", palette="muted")

# Columns to plot; the boxplot cell further down reuses this list.
num_features = [
    "Age",
    "Weight (kg)",
    "Height (m)",
    "BMI",
    "Calories_Burned",
    "Session_Duration (hours)",
    "Fat_Percentage",
    "Water_Intake (liters)",
]

# A 4 x 2 grid gives exactly one panel per entry in num_features.
fig, axes = plt.subplots(4, 2, figsize=(14, 10))
for ax, feature in zip(axes.ravel(), num_features):
    sns.histplot(data[feature], kde=True, bins=30, color="skyblue", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")

fig.tight_layout()
plt.show()



**Age, Weight, and BMI show right-skewed distributions, indicating a concentration in the lower ranges. Calories_Burned: A wide range with a peak near 1,000 calories. Session_Duration: Most users work out between 1–1.5 hours per session.**


In [None]:
# One boxplot per numerical feature, to spot outliers.
# Uses the num_features list defined in the histogram cell.
fig, axes = plt.subplots(4, 2, figsize=(14, 10))

for ax, feature in zip(axes.ravel(), num_features):
    sns.boxplot(data=data[feature], color="orange", ax=ax)
    ax.set_title(f"Boxplot of {feature}")
    ax.set_xlabel(feature)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an
# annotated heatmap.
numerical_data = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,      # write each coefficient inside its cell
    fmt=".2f",       # two decimals per coefficient
    cmap="coolwarm",
    cbar=True,
    ax=ax,
)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()


**Calories_Burned strongly correlates with Session_Duration and Avg_BPM, which is expected since longer and more intense workouts burn more calories. BMI correlates moderately with Fat_Percentage and Weight. Weak correlations exist between Age and most other variables.**


In [None]:
# Side-by-side count plots for the two categorical columns.
categorical_features = ["Gender", "Workout_Type"]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, feature in zip(axes, categorical_features):
    sns.countplot(data=data, x=feature, palette="Set2", ax=ax)
    ax.set_title(f"Countplot of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()


**Gender Distribution: Fairly balanced between male and female participants.**



**Workout_Type: Most participants engage in either strength training or cardio.**


# **Decision Tree Model**

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Target column and feature matrix.
target = "Calories_Burned"

# The feature matrix is every column except the target and the two
# categorical columns.
excluded_columns = [target, "Gender", "Workout_Type"]
features = data.drop(columns=excluded_columns)

X, y = features, data[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model training
# Build a decision tree regressor with a fixed seed.
dt_model = DecisionTreeRegressor(random_state=38)

# Fit the tree on the training split.
dt_model.fit(X=X_train, y=y_train)

In [None]:
# Model evaluation on the held-out test split.
y_pred = dt_model.predict(X_test)

# Error metrics: MSE, its square root (RMSE), and the R² score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"Mean Squared Error: {mse:.2f}",
    f"Root Mean Squared Error: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

In [None]:
# Visualization: actual vs predicted values on the test split.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, color="blue", ax=ax)

# Dashed y = x reference line spanning the range of the actual values.
actual_min, actual_max = y_test.min(), y_test.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], color="red", linestyle="--")

ax.set_title("Actual vs Predicted Calories Burned")
ax.set_xlabel("Actual Calories Burned")
ax.set_ylabel("Predicted Calories Burned")
plt.show()


In [None]:
# Feature importance plot for the fitted decision tree.
# Pair each importance score with its column name and sort in descending order
# so the highest-scoring feature is drawn first.
feature_importances = pd.Series(dt_model.feature_importances_, index=X.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_importances.index, palette="viridis")
# dt_model is a DecisionTreeRegressor, so the title names that model
# (it previously said "Random Forest").
plt.title("Feature Importance in Decision Tree Model")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.show()