In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler

Import DataSet

In [4]:
fitness=pd.read_csv("C:\\UOC pdf\\3rd Year\\MachineLearning-01\\final_project\\healthFitnessDataset.csv")

In [6]:
X=fitness.drop(columns=['fitness_level'])
Y=fitness['fitness_level']

Split Data Training and Testing

In [9]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.7,random_state=42)
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
Y_train = Y_train.reset_index(drop = True)
Y_test = Y_test.reset_index(drop = True)

Scale Numerical Variables

In [12]:
scaler=StandardScaler()

num_cols=X_train.select_dtypes(include='number').columns
X_train[num_cols]=scaler.fit_transform(X_train[num_cols])

In [14]:
##categorical Data Encoding using Ordinal and One-Hot Encoding

##Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_train['intensity']= ordinal_encoder.fit_transform(X_train[['intensity']])

In [16]:
##Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_train[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_train = X_train.drop(['gender','activity_type','smoking_status'], axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [18]:
##Scaling,Endcoding Testing set

##Scaling
X_test[num_cols]=scaler.fit_transform(X_test[num_cols])

In [20]:
##Endcoding categorical variables

###Nominal Endcoding
# Create encoder
onehot_encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_data = onehot_encoder.fit_transform(X_test[['gender','activity_type','smoking_status']])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(['gender', 'activity_type', 'smoking_status']))

# Combine with the original dataset
X_test = X_test.drop(['gender','activity_type','smoking_status'], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [22]:
###Ordinal Endcoding
# Define the order manually
intensity_order = ['Low','Medium','High']
# Create encoder
ordinal_encoder = OrdinalEncoder(categories=[intensity_order])
# Fit and transform
X_test['intensity']= ordinal_encoder.fit_transform(X_test[['intensity']])

Fit Gradient DecisionTree Regressor

In [33]:
dtr=DecisionTreeRegressor(random_state=42)

In [35]:
dtr.fit(X_train,Y_train)

In [37]:
Y_pred_dtr=dtr.predict(X_test)
Y_pred_train_dtr=dtr.predict(X_train)

In [44]:
mae_dtr_test=mean_absolute_error(Y_test,Y_pred_dtr)
mae_dtr_train=mean_absolute_error(Y_train,Y_pred_train_dtr)

print("Testing MAE:",mae_dtr_test)
print("Training MAE:",mae_dtr_train)

Testing MAE: 2.721806666666667
Training MAE: 0.0


In [46]:
mse_test_dtr=mean_squared_error(Y_test,Y_pred_dtr)
mse_train_dtr=mean_squared_error(Y_train,Y_pred_train_dtr)

print("Testing MSE:",mse_test_dtr)
print("Training MSE:",mse_train_dtr)

Testing MSE: 12.580727853333332
Training MSE: 0.0


In [50]:
print("R2 score:",r2_score(Y_test,Y_pred_dtr))

R2 score: 0.5905229693584166


Fit DecisionTree Model with Hyperparameter Tuninng(GridSerach Method)

In [74]:
param_grid_dtr = {
    'max_depth': [3, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}
dtr_cv=GridSearchCV(estimator=dtr,param_grid=param_grid_dtr,cv=5,n_jobs=-1)

In [76]:
dtr_cv.fit(X_train,Y_train)

In [78]:
Y_pred_dtr_cv=dtr_cv.predict(X_test)
Y_pred_train_dtr_cv=dtr_cv.predict(X_train)

In [80]:
test_mae_dtr_cv=mean_absolute_error(Y_pred_dtr_cv,Y_test)
train_mae_dtr_cv=mean_absolute_error(Y_pred_train_dtr_cv,Y_train)

print("Testing MAE(HyperParameter Tuninng):",test_mae_dtr_cv)
print("Training MAE(Hyperparameter Tuninng):",train_mae_dtr_cv)

Testing MAE(HyperParameter Tuninng): 1.9893575869273616
Training MAE(Hyperparameter Tuninng): 1.9264854178863997


In [82]:
mse_test_dtr_cv=mean_squared_error(Y_pred_dtr_cv,Y_test)
mse_train_dtr_cv=mean_squared_error(Y_pred_train_dtr_cv,Y_train)

print("Testing Mean Square Error(Hyper Parameter Tuninng):",mse_test_dtr_cv)
print("Training Mean Sqaure Error(Hyper Parameter Tuninng):",mse_train_dtr_cv)

Testing Mean Square Error(Hyper Parameter Tuninng): 6.494926097722659
Training Mean Sqaure Error(Hyper Parameter Tuninng): 6.11992904225232


In [84]:
r_2Score_cv=r2_score(Y_test,Y_pred_dtr_cv)
print("R2 score(HyperParameter Tuninng):",r_2Score_cv)

R2 score(HyperParameter Tuninng): 0.7886034032579963
