In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

Import DataSet

In [3]:
fitness=pd.read_csv("C:\\UOC pdf\\3rd Year\\MachineLearning-01\\final_project\\healthFitnessDataset.csv")

In [4]:
X=fitness.drop(columns=['fitness_level'])
Y=fitness['fitness_level']

Split Data Training and Testing

In [6]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=42)
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
Y_train = Y_train.reset_index(drop = True)
Y_test = Y_test.reset_index(drop = True)

Scale Numerical Variables

In [8]:
scaler=StandardScaler()

num_cols=X_train.select_dtypes(include='number').columns
X_train[num_cols]=scaler.fit_transform(X_train[num_cols])

In [9]:
##categorical Data Encoding using Ordinal and One-Hot Encoding

##Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_train['intensity']= ordinal_encoder.fit_transform(X_train[['intensity']])

In [10]:
##Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_train[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_train = X_train.drop(['gender','activity_type','smoking_status'], axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [11]:
##Scaling,Endcoding Testing set

##Scaling
X_test[num_cols]=scaler.fit_transform(X_test[num_cols])

In [12]:
##Endcoding categorical variables

###Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_test[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_test = X_test.drop(['gender','activity_type','smoking_status'], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [13]:
###Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_test['intensity']= ordinal_encoder.fit_transform(X_test[['intensity']])

Fit Random Forest Regressor Model

In [15]:
rfr=RandomForestRegressor(random_state=42)

In [16]:
rfr.fit(X_train,Y_train)

In [17]:
Y_pred_rfr=rfr.predict(X_test)
Y_train_Pred_rfr=rfr.predict(X_train)

In [18]:
mae_test_rfr=mean_absolute_error(Y_test,Y_pred_rfr)
mae_train_rfr=mean_absolute_error(Y_train,Y_train_Pred_rfr)

print("Testing MAE:",mae_test_rfr)
print("Training MAE:",mae_train_rfr)

Testing MAE: 1.9467176400000001
Training MAE: 0.70505188


In [19]:
mse_test_rfr=mean_squared_error(Y_test,Y_pred_rfr)
mse_train_rfr=mean_squared_error(Y_test,Y_pred_rfr)

print("Testing MSE:",mse_test_rfr)
print("Training MSE:",mse_train_rfr)

Testing MSE: 6.247285719902666
Training MSE: 6.247285719902666


In [20]:
print("R2 score:",r2_score(Y_pred_rfr,Y_test))

R2 score: 0.7435736922302225


Fit Random Forest Model with Hyperparameter Tuninng(GridSerach Method)

In [22]:
param_grdi_rfr={
    'n_estimators': [100, 300, 500,1000],
    'max_depth': [5, 10, 20, 30],
}

rfr_cv=GridSearchCV(estimator=rfr,param_grid=param_grdi_rfr,cv=5,n_jobs=-1)

In [23]:
rfr_cv.fit(X_train,Y_train)

In [59]:
rfr_cv.best_params_

{'max_depth': 10, 'n_estimators': 1000}

In [51]:
Y_pred_rfr_cv=rfr_cv.predict(X_test)
Y_pred_train_rfr_cv=rfr_cv.predict(X_train)

In [52]:
test_mae_rfr_cv=mean_absolute_error(Y_pred_rfr_cv,Y_test)
train_mae_rfr_cv=mean_absolute_error(Y_pred_train_rfr_cv,Y_train)

print("Testing MAE(HyperParameter Tuninng):",test_mse_rfr_cv)
print("Training MAE(Hyperparameter Tuninng):",train_mse_rfr_cv)

Testing MSE(HyperParameter Tuninng): 1.929102730515228
Training MSE(Hyperparameter Tuninng): 1.5642741855209743


In [55]:
mse_test_rfr_cv=mean_squared_error(Y_pred_rfr_cv,Y_test)
mse_train_rfr_cv=mean_squared_error(Y_pred_train_rfr_cv,Y_train)

print("Mean Square Error(Hyper Parameter Tuninng):",mse_test_rfr_cv)
print("Mean Sqaure Error(Hyper Parameter Tuninng):",mse_train_rfr_cv)

Mean Square Error(Hyper Parameter Tuninng): 6.126006166903017
Mean Sqaure Error(Hyper Parameter Tuninng): 3.922764717261359


In [57]:
r_2Score_cv=r2_score(Y_test,Y_pred_rfr_cv)
print("R2 score(HyperParameter Tuninng):",r_2Score_cv)

R2 score(HyperParameter Tuninng): 0.8006109945180282
