In [1]:
import pandas as pd

# Base location of the MedicalCost CSV files in the Datamanim data repository.
DATA_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/MedicalCost'

# Features (x_*) and targets (y_*) are stored as separate CSV files for the
# train and test splits; they are read in the order listed below.
train_x, train_y, test_x, test_y = (
    pd.read_csv(f'{DATA_URL}/{name}.csv')
    for name in ('x_train', 'y_train', 'x_test', 'y_test')
)

In [9]:
# Columns removed from both feature frames before encoding and modelling.
drop_columns = ['ID']

# drop() returns a new frame, so train_x / test_x themselves stay untouched.
train_x_drop, test_x_drop = (
    frame.drop(columns=drop_columns) for frame in (train_x, test_x)
)

In [10]:
# One-hot encode the feature frames with pandas' default get_dummies behaviour.
train_x_dummy = pd.get_dummies(train_x_drop)

# Align the encoded test frame to the training columns (same set, same order).
# reindex() is used instead of plain [...] selection so that a category level
# present in train but absent from test becomes an all-zero column rather than
# raising KeyError; test-only columns are dropped, as before.
test_x_dummy = pd.get_dummies(test_x_drop).reindex(
    columns=train_x_dummy.columns, fill_value=0
)

In [12]:
# Extract the 'charges' column of each target frame as a Series; these are the
# regression targets for the train and test splits respectively.
train_y_t, test_y_t = train_y['charges'], test_y['charges']

In [13]:
from sklearn.model_selection import train_test_split

# Split the encoded training data into a fitting part (X_t, y_t) and a
# validation part (X_v, y_v): 33% of the rows go to validation, and the
# shuffle is seeded so the split is the same on every run.
X_t, X_v, y_t, y_v = train_test_split(
    train_x_dummy,
    train_y_t,
    test_size=0.33,
    random_state=42,
)

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with default hyper-parameters; only the seed is
# fixed so repeated runs build the same forest.
rfr = RandomForestRegressor(random_state=42)

# Fit on the fitting part of the split only; X_v / y_v are kept for validation.
rfr.fit(X=X_t, y=y_t)

In [15]:
# Predictions of the fitted model on the fitting rows (ptl) and on the
# held-out validation rows (pvl).
ptl, pvl = rfr.predict(X_t), rfr.predict(X_v)

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import numpy as np

def report_regression_metrics(label, y_true, y_pred):
    """Print MSE, MAE, MAPE, RMSE and R^2 for one data split.

    Args:
        label: Text printed in front of every metric name (e.g. 'train').
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        None. The five metrics are written to stdout, one per line, followed
        by a blank line.
    """
    # MSE is computed once and reused for the RMSE line.
    mse = mean_squared_error(y_true, y_pred)
    print(f'{label} mse: {mse}')
    print(f'{label} mae: {mean_absolute_error(y_true, y_pred)}')
    print(f'{label} mape: {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'{label} rmse: {np.sqrt(mse)}')
    # The trailing newline leaves a blank line between consecutive reports.
    print(f'{label} r2 score: {r2_score(y_true, y_pred)}\n')


# Compare the error on the rows the model was fitted on with the error on the
# held-out validation rows.
report_regression_metrics('train', y_t, ptl)
report_regression_metrics('validation', y_v, pvl)

train mse: 3383332.4826872167
train mae: 1042.4565806274447
train mape: 0.13273443244111194
train rmse: 1839.3837236115842
train r2 score: 0.9774806150695879

validation mse: 26059106.52525651
validation mae: 2878.8235404763
validation mape: 0.3339820771990118
validation rmse: 5104.812094999826
validation r2 score: 0.8223433103942744



In [17]:
# Predict on the encoded test features with the already fitted model.
p_test_l = rfr.predict(test_x_dummy)

# Compute every test-split metric up front; the RMSE is the square root of the
# MSE, so the MSE value is reused for it.
test_mse = mean_squared_error(test_y_t, p_test_l)
test_mae = mean_absolute_error(test_y_t, p_test_l)
test_mape = mean_absolute_percentage_error(test_y_t, p_test_l)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(test_y_t, p_test_l)

print(f'test mse: {test_mse}')
print(f'test mae: {test_mae}')
print(f'test mape: {test_mape}')
print(f'test rmse: {test_rmse}')
print(f'test r2 score: {test_r2}\n')

test mse: 10962658.431142868
test mae: 1570.2373463559697
test mape: 0.1617149210502886
test rmse: 3310.990551352098
test r2 score: 0.9287805842374488

