In [16]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [17]:
# Load the HR data 

exercise_data = pd.read_csv(r"exercise.csv", header=0)

# Copy to back-up file

exercise_data_bk = exercise_data.copy()

# Display first 5 records

exercise_data.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [18]:
calories = pd.read_csv(r"calories.csv", header=0)
calories.head()

Unnamed: 0,User_ID,Calories
0,14733363,231.0
1,14861698,66.0
2,11179863,26.0
3,16180408,71.0
4,17771927,35.0


In [19]:
hrdata = pd.concat([exercise_data, calories['Calories']], axis=1)
hrdata.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


In [20]:
# Display the dataset information

hrdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Gender      15000 non-null  object 
 2   Age         15000 non-null  int64  
 3   Height      15000 non-null  float64
 4   Weight      15000 non-null  float64
 5   Duration    15000 non-null  float64
 6   Heart_Rate  15000 non-null  float64
 7   Body_Temp   15000 non-null  float64
 8   Calories    15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB


In [21]:
hrdata.isnull().sum()

User_ID       0
Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [22]:
hrdata['Gender'].value_counts()

female    7553
male      7447
Name: Gender, dtype: int64

In [23]:
hrdata['Gender'] = hrdata['Gender'].str.replace('female', '0')
hrdata['Gender'] = hrdata['Gender'].str.replace('male', '1')
hrdata['Gender'] = hrdata['Gender'].astype(int)

In [24]:
hrdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Gender      15000 non-null  int32  
 2   Age         15000 non-null  int64  
 3   Height      15000 non-null  float64
 4   Weight      15000 non-null  float64
 5   Duration    15000 non-null  float64
 6   Heart_Rate  15000 non-null  float64
 7   Body_Temp   15000 non-null  float64
 8   Calories    15000 non-null  float64
dtypes: float64(6), int32(1), int64(2)
memory usage: 996.2 KB


In [25]:
# Displaying Duplicate values with in Loan ataset, if avialble

hrdata_dup = hrdata[hrdata.duplicated(keep='last')]
hrdata_dup

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories


In [26]:
del hrdata['User_ID']

In [27]:
# Identify the Independent and Target variables

IndepVar = []
for col in hrdata.columns:
    if col != 'Calories':
        IndepVar.append(col)

TargetVar = 'Calories'

x = hrdata[IndepVar]
y = hrdata[TargetVar]

In [28]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((10500, 7), (4500, 7), (10500,), (4500,))

In [41]:
# Load the result dataset

RGRResults = pd.read_csv(r"C:\Users\bvst2\Downloads\CaloriesRes.csv", header=0)
RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score,Mean_Squared_Error_MSE.1,Root_Mean_Squared_Log_Error_RMSLE.1,R2_score.1


In [42]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelSVR = SVR()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()

modelGBR = GradientBoostingRegressor()

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelrfr, modelSVR, modelXGR, modelKNN, modelETR, modelGBR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 8.452
Mean Squared Error (MSE): 132.919
Root Mean Squared Error (RMSE): 11.529
R2_score: 0.966328
Root Mean Squared Log Error (RMSLE): 2.445
Mean Absolute Percentage Error (MAPE): 28.8 %
Adj R Square:  0.966312
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 3.488
Mean Squared Error (MSE): 29.176
Root Mean Squared Error (RMSE): 5.401
R2_score: 0.992609
Root Mean Squared Log Error (RMSLE): 1.687
Mean Absolute Percentage Error (MAPE): 4.8 %
Adj R Square:  0.992606
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 1.836
Mean Squared Error (MSE): 8.793
Root Mean Squared Error (RMSE): 2.965
R2_score: 0.997773
Root Mean Squared Log Error (RMSLE): 1.087
Mean Absolute Percentage Error (MAPE):

In [43]:
# Results with comparing the all the algorithms 

RGRResults.to_csv("C:\\Users\\bvst2\\Downloads\\CaloriesRes.csv")

RGRResults.head(15)

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score,Mean_Squared_Error_MSE.1,Root_Mean_Squared_Log_Error_RMSLE.1,R2_score.1
0,LinearRegression(),8.451946,0.966312,11.529055,28.795999,132.919116,2.44487,0.966328,,,
1,DecisionTreeRegressor(),3.488222,0.992606,5.401461,4.796778,29.175778,1.686669,0.992609,,,
2,"(DecisionTreeRegressor(max_features='auto', ra...",1.83572,0.997772,2.965261,2.813255,8.792775,1.086965,0.997773,,,
3,SVR(),11.3907,0.929416,16.688156,27.316621,278.494544,2.814699,0.929449,,,
4,"XGBRegressor(base_score=None, booster=None, ca...",1.544901,0.998719,2.24763,2.604727,5.051839,0.809876,0.99872,,,
5,KNeighborsRegressor(),5.193822,0.986349,7.339204,12.234209,53.86392,1.99323,0.986355,,,
6,"(ExtraTreeRegressor(random_state=704224597), E...",1.519202,0.99852,2.416269,2.365131,5.838358,0.882225,0.998521,,,
7,([DecisionTreeRegressor(criterion='friedman_ms...,2.706818,0.996424,3.755813,6.361846,14.106131,1.323305,0.996426,,,


In [44]:
import pickle
pickle.dump(modelGBR,open('gb_model.pkl','wb'))