In [None]:
Author: Ilia Kabanov

In [None]:
In this project we need to forecast medical insurance payments based on customers' personal data. In this file we are going 
to use the Random Forest algorithm.

In [15]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

import warnings

warnings.filterwarnings("ignore")

In [16]:
# Load the training and test sets from the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test ids aside; they are needed later for the submission file
test_ids = test["id"]

# Split the training frame into the target and the feature matrix
y_train = train["charges"]
X_train = train.drop(columns=["charges"])

# NOTE(review): the preview shows 'Unnamed: 0' and 'id' remaining in X_train,
# so they are used as features downstream — confirm that this is intended.
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,id
0,43,male,26.03,0,no,northeast,1
1,58,female,28.215,0,no,northwest,2
2,53,male,31.35,0,no,southeast,3
3,54,male,29.2,1,no,southwest,4
4,19,male,34.9,0,yes,southwest,5


In [None]:
We have categorical features here, so we will use label encoding to transform them into numeric features.

In [17]:
# Label encoding
# Integer code assigned to every known label of each categorical column
CATEGORY_CODES = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'northwest': 0, 'northeast': 1, 'southeast': 2, 'southwest': 3},
}


def encode_categoricals(frame):
    """Return a copy of ``frame`` with its categorical columns label-encoded.

    The columns listed in ``CATEGORY_CODES`` are replaced by their integer
    codes. Missing values stay missing. A column whose non-missing values are
    already valid codes is left untouched, so re-running this cell does not
    turn the encoded values into NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the columns 'sex', 'smoker' and 'region'.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified.

    Raises
    ------
    ValueError
        If a column holds a non-missing label that has no code.
    """
    encoded = frame.copy()
    for column, codes in CATEGORY_CODES.items():
        present = encoded[column].dropna()
        # Already encoded (e.g. the cell was run twice): mapping again would yield NaN
        if present.isin(list(codes.values())).all():
            continue
        unknown = set(present.unique()) - set(codes)
        if unknown:
            # Fail loudly instead of silently producing NaN features
            raise ValueError(
                "Unknown labels in column %r: %s" % (column, sorted(map(str, unknown)))
            )
        encoded[column] = encoded[column].map(codes)
    return encoded


# The same encoding must be applied to both frames so the model sees identical codes
X_train = encode_categoricals(X_train)
test = encode_categoricals(test)
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,id
0,43,0,26.03,0,0,1,1
1,58,1,28.215,0,0,0,2
2,53,0,31.35,0,0,2,3
3,54,0,29.2,1,0,3,4
4,19,0,34.9,0,1,3,5


In [None]:
Now our dataset is ready to be fitted, but first we need to tune the hyperparameters of the Random Forest model. Cross-validation will be 
used to compare the models' results.

In [18]:
# Hyperparameter tuning
# define search space (one named dimension per RandomForestRegressor argument)
search_space = [
    Integer(5, 100, name='n_estimators'),
    Integer(5, 15, name='max_depth'),
    Integer(5, 20, name='min_samples_split'),
    Integer(3, 10, name='min_samples_leaf'),
    Integer(10, 100, name='max_leaf_nodes'),
    Real(0.05, 0.3, name='min_impurity_decrease'),
]

# Scorer is built once instead of on every objective call
msle_scorer = make_scorer(mean_squared_log_error)


# define the function needed to optimize
@use_named_args(search_space)
def cross_val_mean(**param):
    """Objective for the optimizer: cross-validated RMSLE of a random forest.

    Parameters
    ----------
    **param
        Hyperparameters named as in ``search_space``; forwarded unchanged to
        ``RandomForestRegressor``.

    Returns
    -------
    float
        Square root of the mean squared log error averaged over the 5 folds
        (lower is better).
    """
    # Fixed seed: without it the forest is re-randomised on every call, which
    # makes the objective noisy and the whole search non-reproducible.
    rf = RandomForestRegressor(**param, criterion='absolute_error', n_jobs=-1, random_state=42)
    fold_msle = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring=msle_scorer, cv=5)
    return np.sqrt(np.mean(fold_msle))


# perform optimization
result = gp_minimize(
    func=cross_val_mean,
    dimensions=search_space,
    n_calls=40,
    random_state=42,
    verbose=True
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 3.0010
Function value obtained: 0.3986
Current minimum: 0.3986
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.6110
Function value obtained: 0.4094
Current minimum: 0.3986
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.1300
Function value obtained: 0.4063
Current minimum: 0.3986
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2100
Function value obtained: 0.4330
Current minimum: 0.3986
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.2601
Function value obtained: 0.4017
Current minimum: 0.3986
Iteration No: 6 started. Evaluating

In [19]:
# The objective returns the square root of the mean MSLE, so the reported value is an RMSLE
print('Best RMSLE: %.3f' % (result.fun))
# Values are listed in the same order as the dimensions of the search space
print('Best Parameters: %s' % (result.x))

Best MSLE: 0.395
Best Parameters: [100, 15, 5, 7, 100, 0.3]


In [None]:
Fortunately, the current score is much better than the one we obtained with KNN in the file 'KNN'.

In [None]:
We have our model; now we can predict customers' payments for the test dataset.

In [20]:
# Fit our best model on the whole train dataframe
# result.x holds the best values in the order the search dimensions were declared
n_estimators, max_depth, min_s_s, min_s_l, max_l_n, diff = result.x
rf = RandomForestRegressor(n_estimators=n_estimators,
                           max_depth=max_depth,
                           min_samples_split=min_s_s,
                           min_samples_leaf=min_s_l,
                           max_leaf_nodes=max_l_n,
                           min_impurity_decrease=diff,
                           criterion='absolute_error',
                           n_jobs=-1,        # parallel tree building; does not change the fitted model
                           random_state=42)  # fixed seed so the submission is reproducible

rf.fit(X=X_train, y=y_train)

# Select the test columns in the training order so the features line up with the fitted model
pred_test = rf.predict(X=test[X_train.columns])

# Build the submission in one step: ids saved at load time plus the predicted charges
submission = pd.DataFrame({"id": test_ids, "charges": pred_test})
submission.to_csv("submission_rf.csv", index=False)