In [1]:
import pandas as pd
import numpy as np

In [2]:
f_path = 'C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\Reg_Model_Training/model_training_data.csv'

In [3]:
df = pd.read_csv(f_path)
df

Unnamed: 0,caseid,age_yrs,age_norm,age_bin_enc,gender_enc,origin_country_enc,wt_kg
0,18690315,9.00,0.499722,4,1,0,23.00
1,18690334,18.00,1.000000,5,0,1,61.80
2,18690432,14.00,0.777654,5,0,3,57.00
3,18690568,1.00,0.055031,2,1,1,10.00
4,18690589,0.25,0.013341,1,0,1,11.00
...,...,...,...,...,...,...,...
39650,24795513,16.00,0.888827,5,0,1,65.00
39651,24798894,12.00,0.666481,4,1,1,41.73
39652,24799324,16.00,0.888827,5,1,1,78.93
39653,24799665,0.17,0.008894,1,0,1,4.95


## Building the Model

### With all features

In [4]:
#Let's define features and labels
X = df[['age_norm', 'age_bin_enc', 'gender_enc', 'origin_country_enc']]
y = df['wt_kg']

X.shape, y.shape

((39655, 4), (39655,))

In [5]:
#Splitting training and testing splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((27758, 4), (11897, 4), (27758,), (11897,))

In [6]:
from xgboost import XGBRegressor

np.random.seed(7)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
xgb.fit(X_train, y_train)

### Predictions and Evaluation

In [7]:
y_pred_xgb = xgb.predict(X_test)
pd.DataFrame({'Actuals': y_test, 'Predictions': y_pred_xgb, 'Abs_Diff': abs(y_pred_xgb - y_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
38653,23.00,36.592236,13.592236
11720,52.00,53.448780,1.448780
22354,9.00,11.362504,2.362504
29441,68.00,55.251205,12.748795
3992,23.00,33.732407,10.732407
...,...,...,...
17981,53.00,54.456486,1.456486
37998,9.00,7.993308,1.006692
21868,40.82,35.702301,5.117699
26104,13.30,14.104591,0.804591


In [8]:
#Evaluating model performance
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

r2 = r2_score(y_test, y_pred_xgb)
rmse = root_mean_squared_error(y_test, y_pred_xgb)
mae = mean_absolute_error(y_test, y_pred_xgb)
mape = mean_absolute_percentage_error(y_test, y_pred_xgb)

#Displaying in a dataframe
all_fea = pd.DataFrame({'R2_Score': [r2], 'RMSE': [rmse], 'MAE': [mae], 'MAPE': [mape]})
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.861578,7.485836,5.544976,0.151481


### Training only with Positively correlated features

In [9]:
#Let's define features and labels
X1 = df[['age_norm', 'age_bin_enc', 'gender_enc']]
y1 = df['wt_kg']

X1.shape, y1.shape

((39655, 3), (39655,))

In [10]:
#Splitting training and testing splits
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.3)

X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((27758, 3), (11897, 3), (27758,), (11897,))

In [11]:
np.random.seed(57)
xgb_1 = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
xgb_1.fit(X1_train, y1_train)

In [13]:
y1_pred_xgb = xgb_1.predict(X1_test)
pd.DataFrame({'Actuals': y1_test, 'Predictions': y1_pred_xgb, 'Abs_Diff': abs(y1_pred_xgb - y1_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
19261,14.970,14.429619,0.540381
5995,35.000,33.313099,1.686901
32308,53.520,54.709866,1.189866
36690,9.977,10.941422,0.964422
39435,50.000,62.249271,12.249271
...,...,...,...
32014,20.000,20.246136,0.246136
37618,9.000,6.276295,2.723705
2033,10.000,8.668760,1.331240
38055,24.750,27.288929,2.538929


In [14]:
#Evaluating model performance
r2_1 = r2_score(y1_test, y1_pred_xgb)
rmse_1 = root_mean_squared_error(y1_test, y1_pred_xgb)
mae_1 = mean_absolute_error(y1_test, y1_pred_xgb)
mape_1 = mean_absolute_percentage_error(y1_test, y1_pred_xgb)

#Displaying in a dataframe
corr_fea = pd.DataFrame({'R2_Score': [r2_1], 'RMSE': [rmse_1], 'MAE': [mae_1], 'MAPE': [mape_1]})
corr_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.859906,7.500193,5.53027,0.151128


In [15]:
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.861578,7.485836,5.544976,0.151481


In [16]:
cols = corr_fea.columns

for i in cols:
    if i == 'R2_Score':
        if(all_fea[i] < corr_fea[i]).all():
            print('Correlated R1 is higher (Great)')
        else:
            print('Correlated R1 is lower (Not Great)')
    else:
        if (all_fea[i] > corr_fea[i]).all():
            print(f'Correlated {i} is lower (Great)')
        else:
            print(f'Correlated {i} is higher (Not Great)')

Correlated R1 is lower (Not Great)
Correlated RMSE is higher (Not Great)
Correlated MAE is lower (Great)
Correlated MAPE is lower (Great)


In [17]:
#Exporting each dataframe with evaluation to compare between each model's
all_fea.to_csv('Eve_Metrics/xgb_all_fea.csv', index=False)
corr_fea.to_csv('Eve_Metrics/xgb_corr_fea.csv', index=False)

In [18]:
import pickle

with open("C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Scripts\\Demographic Scripts\\demog_wt_model/xgb_model.pkl", "wb") as f:
    pickle.dump(xgb, f)