In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw gym-member exercise log; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='gym_members_exercise_tracking.csv')

# Show the first five rows as a quick check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [None]:
# Enhanced preprocessing script for the gym members dataset.
#
# Pipeline, top to bottom:
#   1. load the raw CSV
#   2. impute missing values (median for numeric, mode for categorical columns)
#   3. one-hot encode 'Gender' and 'Workout_Type'
#   4. standardize the listed numeric features
#   5. derive Heart_Rate_Zone, BMI_Category and Activity_Score
#   6. write the result to 'enhanced_processed_gym_data.csv' and print a preview

import pandas as pd
from sklearn.preprocessing import StandardScaler
# Imported up front (it is only used for the preview at the end) so that a
# missing package fails before any processing or file writing happens.
from tabulate import tabulate

# Load dataset
df = pd.read_csv('gym_members_exercise_tracking.csv')  # Load the raw dataset

# Check for missing values
# Per-column count of missing cells; not used below, kept for interactive inspection.
missing_values = df.isnull().sum()

# Handle missing values
# Identify numeric columns
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Fill missing numeric values with each column's median
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
# Fill missing categorical values with each column's mode
# (.mode() can return several rows on ties; .iloc[0] takes the first one)
df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])


# Encode categorical variables
# drop_first=True drops one indicator per feature; the dropped level is the
# implicit baseline, represented by all remaining indicators being 0/False.
df_encoded = pd.get_dummies(df, columns=['Gender', 'Workout_Type'], drop_first=True)

# Normalize numerical features
# 'Calories_Burned' and 'Experience_Level' are not in this list and stay unscaled.
# NOTE(review): the scaler is fit on every row. If a train/test split happens
# later, test-set statistics leak into the features; consider fitting on the
# training split only.
scaler = StandardScaler()
numerical_features = ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
                      'Session_Duration (hours)', 'Fat_Percentage', 'Water_Intake (liters)',
                      'Workout_Frequency (days/week)', 'BMI']
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

# Feature Engineering
# The derived features read from `df` (imputed but unscaled), because the bin
# edges below are expressed in the original units, not in z-scores.
#
# The top bin is open-ended. With a finite upper edge, pd.cut returns NaN for
# any value above it, and get_dummies(..., drop_first=True) then encodes that
# NaN as all-zero indicators, which is identical to the dropped baseline
# category ('Fat-Burning' / 'Underweight'). An out-of-range high value would be
# silently mislabelled as the lowest category.
# Intervals are right-closed (the pd.cut default), e.g. BMI == 25 is 'Normal'.
upper_open = float('inf')
df_encoded['Heart_Rate_Zone'] = pd.cut(
    df['Max_BPM'],
    bins=[0, 120, 160, upper_open],
    labels=['Fat-Burning', 'Cardio', 'Peak'],
)  # Add heart rate zone feature
df_encoded['BMI_Category'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 25, 30, upper_open],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)  # Add BMI category feature
# Days per week times hours per session. Built from the unscaled columns, so
# unlike the features standardized above this column is not standardized.
df_encoded['Activity_Score'] = df['Workout_Frequency (days/week)'] * df['Session_Duration (hours)']

# Encode new categorical features
df_encoded = pd.get_dummies(df_encoded, columns=['Heart_Rate_Zone', 'BMI_Category'], drop_first=True)

# Save the enhanced processed dataset
processed_file_path = 'enhanced_processed_gym_data.csv'  # Filepath for the processed dataset
df_encoded.to_csv(processed_file_path, index=False)  # index=False: do not write the row index

# Display a sample of the processed data
print(tabulate(df_encoded.head(), headers='keys', tablefmt='psql'))  # First few rows for verification




+----+------------+---------------+--------------+------------+-----------+---------------+----------------------------+-------------------+------------------+-------------------------+---------------------------------+--------------------+------------+---------------+---------------------+-------------------------+---------------------+------------------+--------------------------+------------------------+-----------------------+---------------------------+----------------------+
|    |        Age |   Weight (kg) |   Height (m) |    Max_BPM |   Avg_BPM |   Resting_BPM |   Session_Duration (hours) |   Calories_Burned |   Fat_Percentage |   Water_Intake (liters) |   Workout_Frequency (days/week) |   Experience_Level |        BMI | Gender_Male   | Workout_Type_HIIT   | Workout_Type_Strength   | Workout_Type_Yoga   |   Activity_Score | Heart_Rate_Zone_Cardio   | Heart_Rate_Zone_Peak   | BMI_Category_Normal   | BMI_Category_Overweight   | BMI_Category_Obese   |
|----+------------+---------

In [None]:
# Column labels of `df` with 'Calories_Burned' removed (presumably the
# prediction target, to be confirmed against the modelling cells).
# NOTE(review): this reads from the raw `df`, so it still contains the string
# columns 'Gender' and 'Workout_Type' and none of the engineered or one-hot
# columns in `df_encoded`. Confirm that this is the frame the model should use.
feature_columns = df.columns.drop(['Calories_Burned'])

