In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
#Directory holding the model-ready data. It defaults to the original local
#location, but can be overridden by setting the FYP_DATA_DIR environment
#variable so the notebook also runs on other machines.
DATA_DIR = Path(os.environ.get('FYP_DATA_DIR', 'C:\\Users\\USER\\Desktop\\FYP\\fyp-sandbox-2\\Data\\Reg_Model'))

#Kept as a plain string so existing uses of f_path keep working
f_path = str(DATA_DIR / 'clean_data_model_building_ready.csv')

In [3]:
#Load the cleaned, model-ready dataset from f_path into a DataFrame
df = pd.read_csv(filepath_or_buffer=f_path)
df

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg
0,6125174,1.58,2,1,2,10.0
1,6268158,13.00,5,1,0,68.8
2,6836653,18.00,5,1,6,55.0
3,7106038,13.00,5,1,0,68.8
4,7490493,12.00,4,1,2,31.6
...,...,...,...,...,...,...
20025,24380075,16.00,5,0,4,62.0
20026,24380097,12.00,4,0,1,39.0
20027,24380264,15.00,5,1,2,67.7
20028,24380424,10.00,4,1,4,28.0


In [4]:
#Count the missing entries in each column to confirm none are left
df.isnull().sum()

caseid                0
age_yrs               0
age_bin_enc           0
gender_enc            0
origin_country_enc    0
wt_kg                 0
dtype: int64

In [5]:
#Summary statistics (count, mean, std, min, quartiles, max) to get a general
#idea about the data we have
df.describe()

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg
count,20030.0,20030.0,20030.0,20030.0,20030.0,20030.0
mean,22808510.0,10.055168,3.884773,0.468547,10.791013,36.96008
std,1495906.0,5.419451,1.238942,0.499022,14.303206,19.676464
min,6125174.0,0.01,0.0,0.0,0.0,2.5
25%,22247010.0,5.0,3.0,0.0,4.0,20.0
50%,23087370.0,11.0,4.0,0.0,4.0,36.475
75%,23713430.0,15.0,5.0,1.0,9.0,52.6
max,24380950.0,18.0,5.0,1.0,129.0,80.92


## Building the Model

### Normalizing age values

In [6]:
#Rescale age_yrs to the [0, 1] range with min-max scaling and store the
#result in a new age_norm column
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()  #the default feature_range is already (0, 1)

#Double brackets keep the selection 2-D, which is what the scaler expects
age_values = df[['age_yrs']]
df['age_norm'] = scaler.fit(age_values).transform(age_values)

#NOTE(review): the scaler is fitted on every row, i.e. before the train/test
#split further down, so the test rows influence the min/max used here.
df

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg,age_norm
0,6125174,1.58,2,1,2,10.0,0.087271
1,6268158,13.00,5,1,0,68.8,0.722068
2,6836653,18.00,5,1,6,55.0,1.000000
3,7106038,13.00,5,1,0,68.8,0.722068
4,7490493,12.00,4,1,2,31.6,0.666481
...,...,...,...,...,...,...,...
20025,24380075,16.00,5,0,4,62.0,0.888827
20026,24380097,12.00,4,0,1,39.0,0.666481
20027,24380264,15.00,5,1,2,67.7,0.833241
20028,24380424,10.00,4,1,4,28.0,0.555309


In [7]:
#Re-check the summary statistics now that the age_norm column has been added
df.describe()

Unnamed: 0,caseid,age_yrs,age_bin_enc,gender_enc,origin_country_enc,wt_kg,age_norm
count,20030.0,20030.0,20030.0,20030.0,20030.0,20030.0,20030.0
mean,22808510.0,10.055168,3.884773,0.468547,10.791013,36.96008,0.558375
std,1495906.0,5.419451,1.238942,0.499022,14.303206,19.676464,0.301248
min,6125174.0,0.01,0.0,0.0,0.0,2.5,0.0
25%,22247010.0,5.0,3.0,0.0,4.0,20.0,0.277376
50%,23087370.0,11.0,4.0,0.0,4.0,36.475,0.610895
75%,23713430.0,15.0,5.0,1.0,9.0,52.6,0.833241
max,24380950.0,18.0,5.0,1.0,129.0,80.92,1.0


In [9]:
#Separate the predictor columns (X) from the target column (y)
feature_cols = ['age_norm', 'age_bin_enc', 'gender_enc', 'origin_country_enc']
target_col = 'wt_kg'

X = df[feature_cols]
y = df[target_col]

#Quick shape check: same number of rows in both
X.shape, y.shape

((20030, 4), (20030,))

In [10]:
#Splitting into training and testing sets (30% of the rows are held out)
from sklearn.model_selection import train_test_split

#random_state is fixed so the same rows land in each split on every run;
#without it the partition - and every metric computed from it - changes each
#time the notebook is executed. 7 matches the seed used for the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14021, 4), (6009, 4), (14021,), (6009,))

In [13]:
#Building a random forest regressor model
from sklearn.ensemble import RandomForestRegressor

#Global seed kept so that any other code relying on NumPy's global RNG
#still finds it seeded
np.random.seed(7)

#Pass the seed to the estimator explicitly so the fitted forest does not
#depend on whatever state the global RNG happens to be in when fit() runs
reg = RandomForestRegressor(random_state=7)

reg.fit(X_train, y_train)

### Predictions and Evaluation

In [15]:
#Predict on the held-out rows and show them next to the actual values,
#together with the absolute error of each prediction
y_preds = reg.predict(X_test)

abs_error = (y_preds - y_test).abs()
pd.DataFrame({'Actuals': y_test, 'Predictions': y_preds, 'Abs_Diff': abs_error})

Unnamed: 0,Actuals,Predictions,Abs_Diff
4743,66.000,58.355016,7.644984
8202,65.000,54.791428,10.208572
12285,48.000,38.321853,9.678147
1572,43.000,51.701785,8.701785
17535,45.100,60.441554,15.341554
...,...,...,...
18735,32.653,35.473553,2.820553
953,30.839,28.885849,1.953151
11286,60.600,53.919305,6.680695
511,9.200,9.426276,0.226276


In [16]:
#Score the test-set predictions with several regression metrics
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

#Column label -> metric function, in the order the columns should appear
metric_fns = {
    'R2_Score': r2_score,
    'RMSE': root_mean_squared_error,
    'MAE': mean_absolute_error,
    'MAPE': mean_absolute_percentage_error,
}
scores = {label: fn(y_test, y_preds) for label, fn in metric_fns.items()}

#Keep the individual values available under their own names as well
r2, rmse, mae, mape = (scores[label] for label in ('R2_Score', 'RMSE', 'MAE', 'MAPE'))

#One-row summary table
pd.DataFrame([scores])

Unnamed: 0,R2_Score,RMSE,MAE,MAPE
0,0.861031,7.378762,5.418025,0.148093
