In [115]:
# import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt

In [116]:
df = pd.read_csv("Dummy-Data.csv")

In [117]:
df

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate
0,56372,31,Male,510,185,
1,34565,35,Male,510,205,
2,57732,45,Female,510,125,
3,87324,38,Male,503,175,
4,12323,39,Female,600,252,
...,...,...,...,...,...,...
95,99511,35,Male,510,275,
96,23781,27,Male,604,145,
97,99517,35,Female,507,190,
98,99520,38,Female,510,144,


## Helper Function

In [118]:
def inch2meter(inch):
    return 0.0254 * inch

def ht2meter(height):
    ft = height // 100
    inch = height % 100

    return inch2meter(12 * ft + inch)

def pounds2kg(pound):
    return pound * 0.4535924

def get_quote(row):
    discount = 0.9 if row["Ins_Gender"] == "Female" else 1
    msg = ""
    if 18 <= row["Ins_Age"] <= 39 and (row["BMI"] < 17.49 or row["BMI"] > 38.5):
        quote = 750 * discount
        msg = "Age is between 18 to 39 and 'BMI' is either less than 17.49 or greater than 38.5"
    elif 40 <= row["Ins_Age"] <= 59 and (row["BMI"] < 18.49 or row["BMI"] > 38.5):
        quote = 1000 * discount
        msg = "Age is between 40 to 59 and 'BMI' is either less than 18.49 or greater then 38.5"
    elif row["Ins_Age"] >= 60 and (row["BMI"] < 18.49 or row["BMI"] > 45.5):
        quote = 2000 * discount
        msg = "Age is greater than 60 and 'BMI' is either less than 18.49 or greater than 38.5"
    else:
        quote = 500 * discount
        msg = "BMI is in right range"
    row["quote"] = quote
    row["reason"] = msg

    return row

## 1. Directly calculate BMI

In [119]:
df["Ht_meter"] = df["Ht"].apply(ht2meter)
df["Wt_kg"] = df["Wt"].apply(pounds2kg)

In [120]:
df["BMI"] = df["Wt_kg"] / (df["Ht_meter"] ** 2)

In [121]:
df = df.apply(get_quote, axis=1)

In [122]:
df

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,Ht_meter,Wt_kg,BMI,quote,reason
0,56372,31,Male,510,185,,1.7780,83.914594,26.544465,500.0,BMI is in right range
1,34565,35,Male,510,205,,1.7780,92.986442,29.414137,500.0,BMI is in right range
2,57732,45,Female,510,125,,1.7780,56.699050,17.935450,900.0,Age is between 40 to 59 and 'BMI' is either le...
3,87324,38,Male,503,175,,1.6002,79.378670,30.999543,500.0,BMI is in right range
4,12323,39,Female,600,252,,1.8288,114.305285,34.176996,450.0,BMI is in right range
...,...,...,...,...,...,...,...,...,...,...,...
95,99511,35,Male,510,275,,1.7780,124.737910,39.457989,750.0,Age is between 18 to 39 and 'BMI' is either le...
96,23781,27,Male,604,145,,1.9304,65.770898,17.649774,500.0,BMI is in right range
97,99517,35,Female,507,190,,1.7018,86.182556,29.757904,450.0,BMI is in right range
98,99520,38,Female,510,144,,1.7780,65.317306,20.661638,450.0,BMI is in right range


## 2. Build model to predict BMI

In [123]:
df = pd.read_csv("Dummy-Data.csv")

In [124]:
df.head()

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate
0,56372,31,Male,510,185,
1,34565,35,Male,510,205,
2,57732,45,Female,510,125,
3,87324,38,Male,503,175,
4,12323,39,Female,600,252,


In [125]:
df["IssueDate"].value_counts()

Series([], Name: IssueDate, dtype: int64)

-> `IssueDate` is a redundant column

In [126]:
df["Ins_Gender"].value_counts()

Male      63
Female    37
Name: Ins_Gender, dtype: int64

In [127]:
df["Wt"].describe()

count    100.00000
mean     165.15000
std       27.66224
min      110.00000
25%      145.00000
50%      170.00000
75%      180.00000
max      275.00000
Name: Wt, dtype: float64

In [128]:
df[df["Wt"] == 110]

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate
13,45432,47,Female,509,110,


In [129]:
df[df["Wt"] == 275]

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate
95,99511,35,Male,510,275,


-> all weights seem reasonable

In [131]:
# From pounds to kg
df["Wt_kg"] = df["Wt"].apply(pounds2kg)

Original height representation is not continuous -> change it to `meter` scale

In [132]:
df["Ht_meter"] = df["Ht"].apply(ht2meter)

In [133]:
df["BMI"] = df["Wt_kg"] / (df["Ht_meter"] ** 2)

In [134]:
df.head()

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,Wt_kg,Ht_meter,BMI
0,56372,31,Male,510,185,,83.914594,1.778,26.544465
1,34565,35,Male,510,205,,92.986442,1.778,29.414137
2,57732,45,Female,510,125,,56.69905,1.778,17.93545
3,87324,38,Male,503,175,,79.37867,1.6002,30.999543
4,12323,39,Female,600,252,,114.305285,1.8288,34.176996


Transform `Ins_Gender` column to `isMale` dummy column

In [135]:
df["isMale"] = pd.get_dummies(df["Ins_Gender"], drop_first=True)

In [136]:
scaler = MinMaxScaler()
df[["Ins_Age", "Ht_meter", "Wt"]] = scaler.fit_transform(df[["Ins_Age", "Ht_meter", "Wt"]])

In [137]:
cols = ["Ins_Age", "isMale", "Ht_meter", "Wt"]
X = df[cols]

In [138]:
y = df["BMI"]

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [140]:
X_train

Unnamed: 0,Ins_Age,isMale,Ht_meter,Wt
98,0.475,0,0.588235,0.206061
64,0.525,1,0.176471,0.424242
7,0.400,0,0.411765,0.606061
97,0.400,0,0.411765,0.484848
95,0.400,1,0.588235,1.000000
...,...,...,...,...
31,0.950,0,0.764706,0.363636
8,0.525,0,0.705882,0.272727
66,0.125,0,0.941176,0.242424
49,0.000,1,0.411765,0.060606


In [141]:
y_train

98    20.661638
64    31.885244
7     32.890314
97    29.757904
95    39.457989
        ...    
31    22.428568
8     21.021565
66    18.258387
49    18.794465
40    23.747895
Name: BMI, Length: 70, dtype: float64

Random Forest Model:

In [142]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [143]:
y_pred_train = rf_reg.predict(X_train)
y_pred_test = rf_reg.predict(X_test)

In [144]:
print("Train set MSE: {:.4f}".format(mean_squared_error(y_train, y_pred_train)))
print("Test set MSE: {:.4f}".format(mean_squared_error(y_test, y_pred_test)))

Train set MSE: 0.3162
Test set MSE: 1.1525


In [145]:
print(y_test.values[:5])
print(y_pred_test[:5])

[25.10435548 28.19169809 28.19169809 29.05246389 20.54257491]
[25.4376769  28.74229745 28.99471284 29.01757768 21.47888008]


## 3. Build model to predict BMI based on the BMI business rules

In [156]:
df = pd.read_csv("Dummy-Data.csv")

In [157]:
df["Ht_meter"] = df["Ht"].apply(ht2meter)
df["Wt_kg"] = df["Wt"].apply(pounds2kg)
df["BMI"] = df["Wt_kg"] / (df["Ht_meter"] ** 2)

In [158]:
df["isMale"] = pd.get_dummies(df["Ins_Gender"], drop_first=True)

In [159]:
df.head()

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,Ht_meter,Wt_kg,BMI,isMale
0,56372,31,Male,510,185,,1.778,83.914594,26.544465,1
1,34565,35,Male,510,205,,1.778,92.986442,29.414137,1
2,57732,45,Female,510,125,,1.778,56.69905,17.93545,0
3,87324,38,Male,503,175,,1.6002,79.37867,30.999543,1
4,12323,39,Female,600,252,,1.8288,114.305285,34.176996,0


To create the training dataset based on the BMI business rules, create dummy variables that reflect each rule

Rule 1: Age is between 18 and 39 and BMI less than 17.49 or greater than 38.5

In [160]:
def isRule1(row):
    return 1 if 18 <= row["Ins_Age"] <= 39 and (row["BMI"] < 17.39 or row["BMI"] > 38.5) else 0
df["rule1"] = df.apply(isRule1, axis = 1)

In [161]:
def isRule2(row):
    return 1 if 40 <= row["Ins_Age"] <= 59 and (row["BMI"] < 18.49 or row["BMI"] > 38.5) else 0
df["rule2"] = df.apply(isRule2, axis = 1)

In [162]:
def isRule3(row):
    return 1 if row["Ins_Age"] >= 60 and (row["BMI"] < 18.49 or row["BMI"] > 45.5) else 0
df["rule3"] = df.apply(isRule3, axis = 1)

In [163]:
df.head()

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,Ht_meter,Wt_kg,BMI,isMale,rule1,rule2,rule3
0,56372,31,Male,510,185,,1.778,83.914594,26.544465,1,0,0,0
1,34565,35,Male,510,205,,1.778,92.986442,29.414137,1,0,0,0
2,57732,45,Female,510,125,,1.778,56.69905,17.93545,0,0,1,0
3,87324,38,Male,503,175,,1.6002,79.37867,30.999543,1,0,0,0
4,12323,39,Female,600,252,,1.8288,114.305285,34.176996,0,0,0,0


Handcraft the three features `rule1`, `rule2` and `rule3` to incorporate the first 3 business rules for calculating BMI  

The fourth rule is encapsulated in the case where all `rule1`, `rule2` and `rule3` == 0

The fifth rule (Female 10% discount) is encapsulated in the `isMale` column

These 4 new features (`rule1`, `rule2`, `rule3` and `isMale` hopefully can improve the model performance, since these are the business rules for calculating BMI)

In [164]:
scaler = MinMaxScaler()
# Normalize `Ins_Age`, `Ht_meter`, `Wt`
df[["Ins_Age", "Ht_meter", "Wt"]] = scaler.fit_transform(df[["Ins_Age", "Ht_meter", "Wt"]])

In [165]:
cols = ["Ins_Age", "isMale", "Ht_meter", "Wt", "rule1", "rule2", "rule3"]
X = df[cols]
y = df["BMI"]

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [168]:
X_train.head()

Unnamed: 0,Ins_Age,isMale,Ht_meter,Wt,rule1,rule2,rule3
40,0.675,1,0.764706,0.424242,0,0,0
84,0.325,1,0.764706,0.090909,1,0,0
59,0.65,1,0.529412,0.424242,0,0,0
69,0.65,1,0.529412,0.30303,0,0,0
19,0.3,1,0.176471,0.424242,0,0,0


In [169]:
y_train.head()

40    23.747895
84    16.491594
59    26.581082
69    23.627629
19    31.885244
Name: BMI, dtype: float64

Random Forest Model

In [170]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [171]:
y_pred_train = rf_reg.predict(X_train)
y_pred_test = rf_reg.predict(X_test)

In [172]:
print("Train set MSE: {:.4f}".format(mean_squared_error(y_train, y_pred_train)))
print("Test set MSE: {:.4f}".format(mean_squared_error(y_test, y_pred_test)))

Train set MSE: 0.2795
Test set MSE: 2.1619


In [173]:
print(y_test.values[:5])
print(y_pred_test[:5])

[23.74789505 22.31524304 27.36862732 28.19169809 31.88524382]
[24.1862981  22.25911336 27.22110542 28.7258941  31.54089096]
