In [2]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [3]:
df=pd.read_csv('data.csv')

In [4]:
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [21]:
df['Workout_Type'].unique

<bound method Series.unique of 0          Yoga
1          HIIT
2        Cardio
3      Strength
4      Strength
         ...   
968    Strength
969    Strength
970      Cardio
971        HIIT
972    Strength
Name: Workout_Type, Length: 973, dtype: object>

In [5]:
X=df.drop(labels=['BMI'],axis=1)
y=df['BMI']

In [6]:
numerical_cols=X.select_dtypes(exclude='object').columns
categorical_cols=X.select_dtypes(include='object').columns

In [7]:
numerical_cols

Index(['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
       'Session_Duration (hours)', 'Calories_Burned', 'Fat_Percentage',
       'Water_Intake (liters)', 'Workout_Frequency (days/week)',
       'Experience_Level'],
      dtype='object')

In [8]:
categorical_cols

Index(['Gender', 'Workout_Type'], dtype='object')

In [9]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scalar',StandardScaler())
    ]
)
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoding',OneHotEncoder(handle_unknown='ignore'))
    ]
)
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])


In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.32)

In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train

Unnamed: 0,num_pipeline__Age,num_pipeline__Weight (kg),num_pipeline__Height (m),num_pipeline__Max_BPM,num_pipeline__Avg_BPM,num_pipeline__Resting_BPM,num_pipeline__Session_Duration (hours),num_pipeline__Calories_Burned,num_pipeline__Fat_Percentage,num_pipeline__Water_Intake (liters),num_pipeline__Workout_Frequency (days/week),num_pipeline__Experience_Level,cat_pipeline__Gender_Female,cat_pipeline__Gender_Male,cat_pipeline__Workout_Type_Cardio,cat_pipeline__Workout_Type_HIIT,cat_pipeline__Workout_Type_Strength,cat_pipeline__Workout_Type_Yoga
0,0.608319,1.423011,0.360370,1.215082,0.308808,-1.102494,-0.215709,-0.145697,-0.661295,0.435806,-1.462773,-1.105856,0.0,1.0,0.0,0.0,0.0,1.0
1,1.013090,-0.415417,-0.967763,-0.950495,-1.300104,-0.968212,0.696779,-0.241921,0.993096,-0.235290,0.732217,0.255356,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.848855,1.639851,0.516621,0.262228,-1.649868,0.777454,0.225817,-0.093884,-0.028734,-0.906385,0.732217,0.255356,0.0,1.0,0.0,1.0,0.0,0.0
3,0.041641,-0.443700,-0.264634,0.868590,0.308808,-0.565366,-0.009663,0.080060,0.311876,-1.241933,0.732217,0.255356,1.0,0.0,0.0,1.0,0.0,0.0
4,-1.253625,-0.401275,-0.811512,1.041836,0.868430,-0.431084,0.343558,0.612994,0.993096,-0.235290,-0.365278,0.255356,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656,-0.848855,-0.533264,-0.967763,-0.170887,-1.020294,-0.028238,-0.451190,-0.715640,1.479681,-1.577481,-0.365278,0.255356,1.0,0.0,0.0,0.0,1.0,0.0
657,0.770227,-1.009370,-0.499010,-0.517380,-0.950341,1.180300,-1.510853,-1.740798,1.301267,-1.409707,-0.365278,-1.105856,1.0,0.0,0.0,0.0,0.0,1.0
658,-0.282176,1.357016,1.688504,0.262228,1.008335,0.240326,-1.216502,-0.641622,-0.628856,-0.403063,-1.462773,-1.105856,0.0,1.0,1.0,0.0,0.0,0.0
659,-1.658396,-1.084793,-1.514642,0.088982,0.238855,-1.371058,0.108077,0.165181,1.301267,-0.906385,-0.365278,0.255356,1.0,0.0,0.0,0.0,0.0,1.0


In [13]:
X_test

Unnamed: 0,num_pipeline__Age,num_pipeline__Weight (kg),num_pipeline__Height (m),num_pipeline__Max_BPM,num_pipeline__Avg_BPM,num_pipeline__Resting_BPM,num_pipeline__Session_Duration (hours),num_pipeline__Calories_Burned,num_pipeline__Fat_Percentage,num_pipeline__Water_Intake (liters),num_pipeline__Workout_Frequency (days/week),num_pipeline__Experience_Level,cat_pipeline__Gender_Female,cat_pipeline__Gender_Male,cat_pipeline__Workout_Type_Cardio,cat_pipeline__Workout_Type_HIIT,cat_pipeline__Workout_Type_Strength,cat_pipeline__Workout_Type_Yoga
0,1.255952,2.049962,-0.811512,-0.690626,-0.040956,-0.833930,-0.627800,-0.619416,-0.304466,-0.570837,0.732217,0.255356,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.496487,0.437802,0.594747,-1.123741,1.148240,1.448864,-1.363677,-0.771154,0.749803,0.435806,-1.462773,-1.105856,0.0,1.0,0.0,0.0,0.0,1.0
2,1.255952,-0.434272,-1.514642,-0.777249,-1.649868,-0.565366,1.432656,0.135574,-1.261417,0.100258,0.732217,1.616568,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.444084,-0.693538,-0.420885,-0.430757,-1.370057,-0.968212,-1.039891,-1.278182,1.414803,-1.241933,-0.365278,-1.105856,1.0,0.0,1.0,0.0,0.0,0.0
4,0.446411,-1.457192,0.438496,-1.123741,1.078288,0.240326,0.696779,0.601891,0.052364,-0.067516,-0.365278,0.255356,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,0.932136,0.612217,-0.733387,-1.643480,-0.040956,-0.565366,1.314916,1.108919,-2.088613,1.442449,0.732217,1.616568,0.0,1.0,1.0,0.0,0.0,0.0
308,0.041641,0.376521,0.204119,-1.296988,1.498004,-0.968212,0.196382,1.090414,0.668705,0.268032,0.732217,0.255356,0.0,1.0,1.0,0.0,0.0,0.0
309,0.932136,-1.065937,2.001006,-0.777249,-1.230152,-1.371058,-0.981021,-1.218967,0.490291,-0.403063,-1.462773,-1.105856,0.0,1.0,0.0,0.0,0.0,1.0
310,0.041641,-1.471334,0.204119,0.088982,-0.880388,-1.371058,-0.362884,-0.604612,0.198339,-0.906385,-1.462773,-1.105856,1.0,0.0,0.0,0.0,1.0,0.0


In [14]:
def evaluate_models(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mse,mae,rmse,r2_square

In [16]:
from sklearn.metrics import r2_score

# Dictionaries to hold models and accuracies
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
accuracy_list = []

for model_name,model in models.items():
    model.fit(X_train,y_train)
    train_accuracy=model.score(X_train,y_train)
    test_accuracy=model.score(X_test,y_test)
    print(f"Model: {model_name}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    accuracy_list.append(test_accuracy)
    model_list.append(model_name)
    print('=='*50)



Model: Linear Regression
Training Accuracy: 0.9856
Test Accuracy: 0.9836
Model: Lasso
Training Accuracy: 0.9126
Test Accuracy: 0.9168
Model: Ridge
Training Accuracy: 0.9856
Test Accuracy: 0.9837
Model: K-Neighbors Regressor
Training Accuracy: 0.8218
Test Accuracy: 0.7732
Model: Decision Tree
Training Accuracy: 1.0000
Test Accuracy: 0.9647
Model: Random Forest Regressor
Training Accuracy: 0.9988
Test Accuracy: 0.9908
Model: XGBRegressor
Training Accuracy: 1.0000
Test Accuracy: 0.9915
Model: CatBoosting Regressor
Training Accuracy: 0.9999
Test Accuracy: 0.9976
Model: AdaBoost Regressor
Training Accuracy: 0.9594
Test Accuracy: 0.9476


In [17]:
# Results
pd.DataFrame(list(zip(model_list, accuracy_list)),columns=['Model Name','Accuracy Score']).sort_values(
    by=["Accuracy Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy Score
7,CatBoosting Regressor,0.997596
6,XGBRegressor,0.991496
5,Random Forest Regressor,0.990768
2,Ridge,0.9837
0,Linear Regression,0.983646
4,Decision Tree,0.964652
8,AdaBoost Regressor,0.947586
1,Lasso,0.916831
3,K-Neighbors Regressor,0.773163


In [19]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create the CatBoost Regressor model
catboost_model = CatBoostRegressor(verbose=0)

# Fit the model to the training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = catboost_model.predict(X_test)

# Calculate the performance metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")


Root Mean Squared Error: 0.3263
Mean Absolute Error: 0.2227
R2 Score: 0.9976


