In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the cleaned, model-ready CSV.
# The default is the original machine-specific path; set the FYP_DATA_PATH
# environment variable to point the notebook at the file on another machine.
import os

f_path = os.environ.get('FYP_DATA_PATH') or 'C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\Reg_Model/clean_data_model_building_ready.csv'

In [3]:
# Read the model-ready CSV into a DataFrame and preview it.
df = pd.read_csv(filepath_or_buffer=f_path)
df

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg
0,6125174,1.58,2,1,2,10.0
1,6268158,13.00,5,1,0,68.8
2,6836653,18.00,5,1,6,55.0
3,7106038,13.00,5,1,0,68.8
4,7490493,12.00,4,1,2,31.6
...,...,...,...,...,...,...
20025,24380075,16.00,5,0,4,62.0
20026,24380097,12.00,4,0,1,39.0
20027,24380264,15.00,5,1,2,67.7
20028,24380424,10.00,4,1,4,28.0


## Building the Model

### Normalizing age values

In [4]:
# Rescale age_yrs into the [0, 1] range and store it as a new age_norm column.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so held-out rows contribute to the min/max.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[['age_yrs']])

# transform() returns an (n_rows, 1) array; flatten it into a single column.
df['age_norm'] = scaler.transform(df[['age_yrs']]).ravel()
df

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg,age_norm
0,6125174,1.58,2,1,2,10.0,0.087271
1,6268158,13.00,5,1,0,68.8,0.722068
2,6836653,18.00,5,1,6,55.0,1.000000
3,7106038,13.00,5,1,0,68.8,0.722068
4,7490493,12.00,4,1,2,31.6,0.666481
...,...,...,...,...,...,...,...
20025,24380075,16.00,5,0,4,62.0,0.888827
20026,24380097,12.00,4,0,1,39.0,0.666481
20027,24380264,15.00,5,1,2,67.7,0.833241
20028,24380424,10.00,4,1,4,28.0,0.555309


In [5]:
# Feature matrix (X) and regression target (y).
feature_cols = ['age_norm', 'age_bin_enc', 'gender_enc', 'origin_country_enc']
target_col = 'wt_kg'

X = df[feature_cols]
y = df[target_col]

# Sanity check: both must have the same number of rows.
X.shape, y.shape

((20030, 4), (20030,))

In [6]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# partition; the global NumPy seed is only set in a later cell, after this
# split has already been drawn, so it cannot make the split deterministic.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14021, 4), (6009, 4), (14021,), (6009,))

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Global seed kept so any later code drawing from NumPy's global RNG is unaffected.
np.random.seed(7)

# Gradient-boosted regressor: 200 trees of depth 5 with a 0.05 learning rate.
# random_state is passed explicitly so the fit is reproducible on its own,
# independent of whatever has consumed the global RNG before this cell runs.
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    random_state=7,
)
gbr.fit(X_train, y_train)

### Predictions and Evaluation

In [8]:
# Predict on the held-out rows and show actuals next to predictions,
# together with the absolute error of each prediction.
y_preds = gbr.predict(X_test)
abs_err = (y_test - y_preds).abs()
pd.DataFrame({'Actuals': y_test, 'Predictions': y_preds, 'Abs_Diff': abs_err})

Unnamed: 0,Actuals,Predictions,Abs_Diff
11529,46.000,57.442234,11.442234
18326,48.000,53.841003,5.841003
15095,20.000,34.328278,14.328278
4725,23.900,26.927103,3.027103
19343,25.714,37.779436,12.065436
...,...,...,...
19145,20.500,27.725231,7.225231
5052,9.000,10.913872,1.913872
12250,16.330,13.055354,3.274646
12718,54.000,58.257231,4.257231


In [9]:
# Score the held-out predictions with four regression metrics.
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    root_mean_squared_error,
)

# Evaluate each metric on the same (actual, predicted) pair.
r2, rmse, mae, mape = (
    metric(y_test, y_preds)
    for metric in (
        r2_score,
        root_mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
    )
)

# One-row summary table. Note: scikit-learn reports MAPE as a fraction, not a percentage.
pd.DataFrame([{'R2_Score': r2, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape}])

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.858918,7.393366,5.424022,0.149452
