In [25]:
import pandas as pd
import numpy as np

In [26]:
f_path = 'C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\Reg_Model_Training/model_training_data.csv'

In [27]:
df = pd.read_csv(f_path)
df

Unnamed: 0,caseid,age_yrs,age_norm,age_bin_enc,gender_enc,origin_country_enc,wt_kg
0,18690315,9.00,0.499722,4,1,0,23.00
1,18690334,18.00,1.000000,5,0,1,61.80
2,18690432,14.00,0.777654,5,0,3,57.00
3,18690568,1.00,0.055031,2,1,1,10.00
4,18690589,0.25,0.013341,1,0,1,11.00
...,...,...,...,...,...,...,...
39650,24795513,16.00,0.888827,5,0,1,65.00
39651,24798894,12.00,0.666481,4,1,1,41.73
39652,24799324,16.00,0.888827,5,1,1,78.93
39653,24799665,0.17,0.008894,1,0,1,4.95


## Building the Model

### With all features

In [28]:
#Let's define features and labels
X = df[['age_norm', 'age_bin_enc', 'gender_enc', 'origin_country_enc']]
y = df['wt_kg']

X.shape, y.shape

((39655, 4), (39655,))

In [29]:
#Splitting training and testing splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((27758, 4), (11897, 4), (27758,), (11897,))

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

np.random.seed(7)
gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5)
gbr.fit(X_train, y_train)

### Predictions and Evaluation

In [31]:
y_preds = gbr.predict(X_test)
pd.DataFrame({'Actuals': y_test, 'Predictions': y_preds, 'Abs_Diff': abs(y_preds - y_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
6721,24.00,20.473079,3.526921
25248,25.00,29.299713,4.299713
32212,49.00,59.298586,10.298586
32106,19.00,17.348846,1.651154
678,46.26,60.984443,14.724443
...,...,...,...
11960,34.00,31.812250,2.187750
10265,56.00,55.803413,0.196587
31819,70.00,68.443263,1.556737
30349,35.50,35.915466,0.415466


In [32]:
#Evaluating model performance
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

r2 = r2_score(y_test, y_preds)
rmse = root_mean_squared_error(y_test, y_preds)
mae = mean_absolute_error(y_test, y_preds)
mape = mean_absolute_percentage_error(y_test, y_preds)

#Displaying in a dataframe
all_fea = pd.DataFrame({'R2_Score': [r2], 'RMSE': [rmse], 'MAE': [mae], 'MAPE': [mape]})
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.860855,7.483023,5.54169,0.151341


### Training with only positively correlating features

In [9]:
#Let's define features and labels
X1 = df[['age_norm', 'age_bin_enc', 'gender_enc']]
y1 = df['wt_kg']

X1.shape, y1.shape

((39655, 3), (39655,))

In [10]:
#Splitting training and testing splits

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.3)

X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((27758, 3), (11897, 3), (27758,), (11897,))

In [11]:
np.random.seed(67)
gbr_1 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5)
gbr_1.fit(X1_train, y1_train)

In [13]:
y1_preds = gbr_1.predict(X1_test)
pd.DataFrame({'Actuals': y1_test, 'Predictions': y1_preds, 'Abs_Diff': abs(y1_preds - y1_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
23383,31.05,28.899434,2.150566
27389,51.00,57.170874,6.170874
5631,38.00,33.426611,4.573389
1544,58.96,56.113777,2.846223
5992,59.00,62.257855,3.257855
...,...,...,...
15769,12.00,10.944047,1.055953
20805,16.70,16.822361,0.122361
30299,53.60,54.688136,1.088136
36128,50.00,62.257855,12.257855


In [14]:
#Evaluating model performance

r2_1 = r2_score(y1_test, y1_preds)
rmse_1 = root_mean_squared_error(y1_test, y1_preds)
mae_1 = mean_absolute_error(y1_test, y1_preds)
mape_1 = mean_absolute_percentage_error(y1_test, y1_preds)

#Displaying in a dataframe
corr_fea = pd.DataFrame({'R2_Score': [r2_1], 'RMSE': [rmse_1], 'MAE': [mae_1], 'MAPE': [mape_1]})
corr_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.860719,7.485856,5.51119,0.151164


In [15]:
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.860894,7.492715,5.524692,0.150428


In [20]:
cols = corr_fea.columns

for i in cols:
    if i == 'R2_Score':
        if(all_fea[i] < corr_fea[i]).all():
            print('Correlated R1 is higher (Great)')
        else:
            print('Correlated R1 is lower (Not Great)')
    else:
        if (all_fea[i] > corr_fea[i]).all():
            print(f'Correlated {i} is lower (Great)')
        else:
            print(f'Correlated {i} is higher (Not Great)')

Correlated R1 is lower (Not Great)
Correlated RMSE is lower (Great)
Correlated MAE is lower (Great)
Correlated MAPE is higher (Not Great)


In [21]:
#Exporting each dataframe with evaluation to compare between each model's
all_fea.to_csv('Eve_Metrics/gbr_all_fea.csv', index=False)
corr_fea.to_csv('Eve_Metrics/gbr_corr_fea.csv', index=False)

(Generated using ChatGPT for guidance and proof)

### Looking at the results:

* Scene 1 (All Features) has a slightly higher R² (0.860894 vs. 0.860719) → better model fit.
* Scene 2 (Correlated Features) has lower RMSE and MAE → better error performance.
* Scene 1 has lower MAPE → slightly better at percentage-based error.

### Decision:
Scene 2 (Correlated Features) is the better choice because:

* RMSE and MAE are more important than a tiny R² drop in real-world predictions.
* Dropping one feature simplifies the model and may reduce overfitting.
* The MAPE increase is minimal (0.150428 → 0.151164), so not a major concern.

In [23]:
#Saving the model
import pickle

with open("C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Scripts\\Demographic Scripts\\faers_transforms\\demog_wt_model/gbr_model.pkl", "wb") as f:
    pickle.dump(gbr_1, f)

In [34]:
import pickle

with open("C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Scripts\\Demographic Scripts\\demog_wt_model/gbr_model_1.pkl", "wb") as f:
    pickle.dump(gbr, f)