In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Read the weight-change dataset (path is relative to the notebook's directory)
# and show the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/weight_change_dataset.csv')
df

Unnamed: 0,Participant ID,Age,Gender,Current Weight (lbs),BMR (Calories),Daily Calories Consumed,Daily Caloric Surplus/Deficit,Weight Change (lbs),Duration (weeks),Physical Activity Level,Sleep Quality,Stress Level,Final Weight (lbs)
0,1,56,M,228.4,3102.3,3916.0,813.7,0.20000,1,Sedentary,Excellent,6,228.6
1,2,46,F,165.4,2275.5,3823.0,1547.5,2.40000,6,Very Active,Excellent,6,167.8
2,3,32,F,142.8,2119.4,2785.4,666.0,1.40000,7,Sedentary,Good,3,144.2
3,4,25,F,145.5,2181.3,2587.3,406.0,0.80000,8,Sedentary,Fair,2,146.3
4,5,38,M,155.5,2463.8,3312.8,849.0,2.00000,10,Lightly Active,Good,1,157.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,59,M,181.9,2622.8,3656.2,1033.4,0.60000,2,Lightly Active,Excellent,3,182.5
96,97,56,F,116.5,1736.5,2756.5,1020.0,-8.92309,12,Moderately Active,Good,9,107.6
97,98,58,F,162.7,2188.6,3344.3,1155.8,3.60000,12,Lightly Active,Fair,7,166.3
98,99,45,M,179.8,2671.9,4000.0,1328.1,2.00000,5,Moderately Active,Fair,5,181.8


In [3]:
# Frequency of each 'Sleep Quality' label, most common first.
df.loc[:, 'Sleep Quality'].value_counts()

Sleep Quality
Poor         38
Fair         24
Good         22
Excellent    16
Name: count, dtype: int64

In [4]:
# Number of missing values in each column.
df.isnull().sum()

Participant ID                   0
Age                              0
Gender                           0
Current Weight (lbs)             0
BMR (Calories)                   0
Daily Calories Consumed          0
Daily Caloric Surplus/Deficit    0
Weight Change (lbs)              0
Duration (weeks)                 0
Physical Activity Level          0
Sleep Quality                    0
Stress Level                     0
Final Weight (lbs)               0
dtype: int64

In [5]:
# Count rows that are exact repeats of an earlier row.
df.duplicated(keep='first').sum()

0

In [6]:
# Rescale both weight columns by 1/2.2 (presumably pounds -> kilograms) and keep
# two decimals.
# NOTE(review): the column labels still say "(lbs)" after this cell, and running
# the cell a second time divides the values again — run it exactly once.
df['Current Weight (lbs)'] = df['Current Weight (lbs)'].div(2.2).round(2)
df['Final Weight (lbs)'] = df['Final Weight (lbs)'].div(2.2).round(2)
df

Unnamed: 0,Participant ID,Age,Gender,Current Weight (lbs),BMR (Calories),Daily Calories Consumed,Daily Caloric Surplus/Deficit,Weight Change (lbs),Duration (weeks),Physical Activity Level,Sleep Quality,Stress Level,Final Weight (lbs)
0,1,56,M,103.82,3102.3,3916.0,813.7,0.20000,1,Sedentary,Excellent,6,103.91
1,2,46,F,75.18,2275.5,3823.0,1547.5,2.40000,6,Very Active,Excellent,6,76.27
2,3,32,F,64.91,2119.4,2785.4,666.0,1.40000,7,Sedentary,Good,3,65.55
3,4,25,F,66.14,2181.3,2587.3,406.0,0.80000,8,Sedentary,Fair,2,66.50
4,5,38,M,70.68,2463.8,3312.8,849.0,2.00000,10,Lightly Active,Good,1,71.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,59,M,82.68,2622.8,3656.2,1033.4,0.60000,2,Lightly Active,Excellent,3,82.95
96,97,56,F,52.95,1736.5,2756.5,1020.0,-8.92309,12,Moderately Active,Good,9,48.91
97,98,58,F,73.95,2188.6,3344.3,1155.8,3.60000,12,Lightly Active,Fair,7,75.59
98,99,45,M,81.73,2671.9,4000.0,1328.1,2.00000,5,Moderately Active,Fair,5,82.64


In [15]:
# Target is the final weight; the features are every remaining column except the
# participant identifier and the weight-change column.
y = df['Final Weight (lbs)']
X = df.drop(
    ['Participant ID', 'Weight Change (lbs)', 'Final Weight (lbs)'],
    axis=1,
)

# Shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((80, 10), (20, 10), (80,), (20,))

In [16]:
# Candidate regressors to compare, keyed by the label shown in the results table.
# The tree-based estimators are seeded (same seed as the train/test split) so the
# comparison is reproducible across kernel restarts; unseeded, Random Forest and
# Decision Tree scores change from run to run.
models = {
    # fit_intercept=False: no separate intercept term is fitted.
    "Linear Regression": LinearRegression(fit_intercept=False),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "XGB": XGBRegressor(random_state=42),
}

In [17]:
# Split the feature names by dtype: int64/float64 columns are treated as numeric,
# object-dtype columns as categorical.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

print(num_cols,cat_cols)

Index(['Age', 'Current Weight (lbs)', 'BMR (Calories)',
       'Daily Calories Consumed', 'Daily Caloric Surplus/Deficit',
       'Duration (weeks)', 'Stress Level'],
      dtype='object') Index(['Gender', 'Physical Activity Level', 'Sleep Quality'], dtype='object')


In [18]:
# Fit every candidate regressor on the training split behind the same
# preprocessing and collect its hold-out metrics for a side-by-side comparison.
name_list = []
r2 = []
mae_score = []
mse_score = []

for name, model in models.items():
    # Preprocessing is rebuilt for each model so no fitted state is shared.
    transformer = ColumnTransformer([
        # Standardise numeric features.
        ('num', StandardScaler(), num_cols),
        # One-hot encode categoricals. handle_unknown='ignore' encodes a category
        # that appears only in the test split as all zeros instead of raising at
        # predict time, which matters with a hold-out set this small.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ])

    pipeline = Pipeline([
        ("trf", transformer),
        ("regressor", model),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    name_list.append(name)
    r2.append(r2_score(y_test, y_pred))
    mae_score.append(mean_absolute_error(y_test, y_pred))
    mse_score.append(mean_squared_error(y_test, y_pred))

# One row per model: R2 (higher is better), MAE and MSE (lower is better).
selection = pd.DataFrame({
    "Model": name_list,
    "R2": r2,
    "MAE": mae_score,
    "MSE": mse_score,
})

selection

Unnamed: 0,Model,R2,MAE,MSE
0,Linear Regression,0.982512,1.75028,4.480537
1,Random Forest,0.95998,2.632775,10.253578
2,SVM,0.202677,11.374511,204.282392
3,Decision Tree,0.899839,3.731,25.66228
4,XGB,0.975558,1.974206,6.262297
