In [20]:
import pandas as pd
import numpy as np

In [21]:
f_path = 'C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\Reg_Model_Training/model_training_data.csv'

In [22]:
#Import the data
df = pd.read_csv(f_path)
df

Unnamed: 0,caseid,age_yrs,age_norm,age_bin_enc,gender_enc,origin_country_enc,wt_kg
0,18690315,9.00,0.499722,4,1,0,23.00
1,18690334,18.00,1.000000,5,0,1,61.80
2,18690432,14.00,0.777654,5,0,3,57.00
3,18690568,1.00,0.055031,2,1,1,10.00
4,18690589,0.25,0.013341,1,0,1,11.00
...,...,...,...,...,...,...,...
39650,24795513,16.00,0.888827,5,0,1,65.00
39651,24798894,12.00,0.666481,4,1,1,41.73
39652,24799324,16.00,0.888827,5,1,1,78.93
39653,24799665,0.17,0.008894,1,0,1,4.95


In [4]:
#Making sure missing values are handled
df.isna().sum()

caseid                0
age_yrs               0
age_norm              0
age_bin_enc           0
gender_enc            0
origin_country_enc    0
wt_kg                 0
dtype: int64

In [6]:
df.describe() #A general idea about the data we have

Unnamed: 0,caseid,age_yrs,age_norm,age_bin_enc,gender_enc,origin_country_enc,wt_kg
count,39655.0,39655.0,39655.0,39655.0,39655.0,39655.0,39655.0
mean,21914610.0,10.137866,0.562972,3.891993,0.476913,8.312016,37.469403
std,1786524.0,5.480704,0.304653,1.245443,0.499473,12.408632,20.092133
min,18690320.0,0.01,0.0,0.0,0.0,0.0,2.5
25%,20373480.0,5.0,0.277376,3.0,0.0,1.0,20.0
50%,22034380.0,11.0,0.610895,4.0,0.0,2.0,37.2
75%,23511070.0,15.0,0.833241,5.0,1.0,16.0,54.0
max,24799780.0,18.0,1.0,5.0,1.0,146.0,80.92


## Building the Model

### Training with all Features

In [23]:
#Let's define features and labels
X = df[['age_norm', 'age_bin_enc', 'gender_enc', 'origin_country_enc']]
y = df['wt_kg']

X.shape, y.shape

((39655, 4), (39655,))

In [24]:
#Splitting training and testing splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((27758, 4), (11897, 4), (27758,), (11897,))

In [25]:
#Building a random forest regressor model
from sklearn.ensemble import RandomForestRegressor

np.random.seed(7)
reg = RandomForestRegressor()

reg.fit(X_train, y_train)

### Predictions and Evaluation

In [26]:
y_preds = reg.predict(X_test)
pd.DataFrame({'Actuals': y_test, 'Predictions': y_preds, 'Abs_Diff': abs(y_preds - y_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
33714,65.000,53.809240,11.190760
36714,40.000,60.533727,20.533727
23740,21.400,23.621323,2.221323
28626,46.712,60.244149,13.532149
33378,65.000,61.059503,3.940497
...,...,...,...
29889,13.100,10.728600,2.371400
8863,12.400,14.141658,1.741658
8471,34.000,27.941406,6.058594
1651,80.000,65.315658,14.684342


In [27]:
#Evaluating model performance
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

r2 = r2_score(y_test, y_preds)
rmse = root_mean_squared_error(y_test, y_preds)
mae = mean_absolute_error(y_test, y_preds)
mape = mean_absolute_percentage_error(y_test, y_preds)

#Displaying in a dataframe
all_fea = pd.DataFrame({'R2_Score': [r2], 'RMSE': [rmse], 'MAE': [mae], 'MAPE': [mape]})
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.857234,7.652533,5.56597,0.15029


In [28]:
import pickle

with open("C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Scripts\\Demographic Scripts\\demog_wt_model/rf_all.pkl", "wb") as f:
    pickle.dump(reg, f)

### Training with only positively correlated features

In [10]:
X1 = df[['age_norm', 'age_bin_enc', 'gender_enc']]
y1 = df['wt_kg']

X1.shape, y1.shape

((39655, 3), (39655,))

In [11]:
#Splitting training and testing splits
X2_train, X2_test, y2_train, y2_test = train_test_split(X1, y1, test_size = 0.3)

X2_train.shape, X2_test.shape, y2_train.shape, y2_test.shape

((27758, 3), (11897, 3), (27758,), (11897,))

In [12]:
#Building a random forest regressor model
np.random.seed(69)
reg_3 = RandomForestRegressor()

reg_3.fit(X2_train, y2_train)

In [14]:
# Predictions and evaluation
y2_preds = reg_3.predict(X2_test)
pd.DataFrame({'Actuals': y2_test, 'Predictions': y2_preds, 'Abs_Diff': abs(y2_preds - y2_test)})

Unnamed: 0,Actuals,Predictions,Abs_Diff
2740,56.69,56.416263,0.273737
18842,74.00,62.456657,11.543343
14242,39.00,54.650658,15.650658
2325,58.90,62.337594,3.437594
10416,53.90,58.431794,4.531794
...,...,...,...
11774,21.00,35.585903,14.585903
37550,50.00,37.373785,12.626215
8912,49.00,58.431794,9.431794
31586,54.43,56.416263,1.986263


In [15]:
#Evaluating performance

r2_1 = r2_score(y2_test, y2_preds)
rmse_1 = root_mean_squared_error(y2_test, y2_preds)
mae_1 = mean_absolute_error(y2_test, y2_preds)
mape_1 = mean_absolute_percentage_error(y2_test, y2_preds)

#Displaying in a dataframe
corr_fea = pd.DataFrame({'R2_Score': [r2_1], 'RMSE': [rmse_1], 'MAE': [mae_1], 'MAPE': [mape_1]})
corr_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.85881,7.515114,5.542479,0.151933


In [16]:
all_fea

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.856559,7.57849,5.565779,0.150137


There's only a slight performance boost obtained by only selecting all features 

In [17]:
cols = corr_fea.columns

for i in cols:
    if i == 'R2_Score':
        if(all_fea[i] < corr_fea[i]).all():
            print('Correlated R1 is higher (Great)')
        else:
            print('Correlated R1 is lower (Not Great)')
    else:
        if (all_fea[i] > corr_fea[i]).all():
            print(f'Correlated {i} is lower (Great)')
        else:
            print(f'Correlated {i} is higher (Not Great)')

Correlated R1 is higher (Great)
Correlated RMSE is lower (Great)
Correlated MAE is lower (Great)
Correlated MAPE is higher (Not Great)


In [19]:
#Exporting each dataframe with evaluation to compare between each model's
all_fea.to_csv('Eve_Metrics/random_forest_all_fea.csv', index=False)
corr_fea.to_csv('Eve_Metrics/random_forest_corr_fea.csv', index=False)