In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

Import DataSet

In [4]:
fitness=pd.read_csv("C:\\UOC pdf\\3rd Year\\MachineLearning-01\\final_project\\healthFitnessDataset.csv")

In [6]:
X=fitness.drop(columns=['fitness_level'])
Y=fitness['fitness_level']

Split Data Training and Testing

In [9]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=42)
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
Y_train = Y_train.reset_index(drop = True)
Y_test = Y_test.reset_index(drop = True)

Scale Numerical Variables

In [12]:
scaler=StandardScaler()

num_cols=X_train.select_dtypes(include='number').columns
X_train[num_cols]=scaler.fit_transform(X_train[num_cols])

In [14]:
##categorical Data Encoding using Ordinal and One-Hot Encoding

##Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_train['intensity']= ordinal_encoder.fit_transform(X_train[['intensity']])

In [16]:
##Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_train[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_train = X_train.drop(['gender','activity_type','smoking_status'], axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [18]:
##Scaling,Endcoding Testing set

##Scaling
X_test[num_cols]=scaler.fit_transform(X_test[num_cols])

In [20]:
##Endcoding categorical variables

###Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_test[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_test = X_test.drop(['gender','activity_type','smoking_status'], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [22]:
###Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_test['intensity']= ordinal_encoder.fit_transform(X_test[['intensity']])

Fit Gradient Boosting Regressor

In [25]:
gbr=GradientBoostingRegressor(random_state=42)

In [27]:
gbr.fit(X_train,Y_train)

In [29]:
Y_pred_gbr=gbr.predict(X_test)
Y_pred_train_gbr=gbr.predict(X_train)

In [31]:
mae_gbr_test=mean_absolute_error(Y_test,Y_pred_gbr)
mae_gbr_train=mean_absolute_error(Y_train,Y_pred_train_gbr)

print("Testing MAE:",mae_gbr_test)
print("Training MAE:",mae_gbr_train)

Testing MAE: 1.9236754477328468
Training MAE: 1.8422880123082546


In [33]:
mse_test_gbr=mean_squared_error(Y_test,Y_pred_gbr)
mse_train_gbr=mean_squared_error(Y_train,Y_pred_train_gbr)

print("Testing MSE:",mse_test_gbr)
print("Training MSE:",mse_train_gbr)

Testing MSE: 6.0229217382367555
Training MSE: 5.5586858029064325


In [35]:
print("R2 score:",r2_score(Y_test,Y_pred_gbr))

R2 score: 0.803966182409201


Fit Gradient Boosting Model with Hyperparameter Tuninng(GridSerach Method)

In [63]:
param_grid_gbr={
    'n_estimators': [300, 500,800,1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 4],
    'min_samples_split': [2, 5, 10],
}

gbr_cv=GridSearchCV(estimator=gbr,param_grid=param_grid_gbr,cv=5,n_jobs=-1)

In [65]:
gbr_cv.fit(X_train,Y_train)

In [83]:
gbr_cv.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'min_samples_split': 5,
 'n_estimators': 1000}

In [67]:
Y_pred_gbr_cv=gbr_cv.predict(X_test)
Y_pred_train_gbr_cv=gbr_cv.predict(X_train)

In [75]:
test_mae_gbr_cv=mean_absolute_error(Y_pred_gbr_cv,Y_test)
train_mae_gbr_cv=mean_absolute_error(Y_pred_train_gbr_cv,Y_train)

print("Testing MAE(HyperParameter Tuninng):",test_mae_gbr_cv)
print("Training MAE(Hyperparameter Tuninng):",train_mae_gbr_cv)

Testing MAE(HyperParameter Tuninng): 1.77756056157708
Training MAE(Hyperparameter Tuninng): 0.8943332052180264


In [77]:
mse_test_gbr_cv=mean_squared_error(Y_pred_gbr_cv,Y_test)
mse_train_gbr_cv=mean_squared_error(Y_pred_train_gbr_cv,Y_train)

print("Testing Mean Square Error(Hyper Parameter Tuninng):",mse_test_gbr_cv)
print("Training Mean Sqaure Error(Hyper Parameter Tuninng):",mse_train_gbr_cv)

Testing Mean Square Error(Hyper Parameter Tuninng): 5.248081107708382
Training Mean Sqaure Error(Hyper Parameter Tuninng): 1.4252153472913112


In [79]:
r_2Score_cv=r2_score(Y_test,Y_pred_gbr_cv)
print("R2 score(HyperParameter Tuninng):",r_2Score_cv)

R2 score(HyperParameter Tuninng): 0.8291856644859193
