In [None]:
Author: Ilia Kabanov

In [None]:
In this project we need to forecast medical insurance payments based on customers' personal data. In this file we are going 
to use KNN algorithm.

In [58]:
# Import packages
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn import preprocessing
from copy import deepcopy

In [59]:
# Define RMSLE metric we will need to assess predictions
def RMSLE(y_true, y_pred):
    """
    The Root Mean Squared Log Error (RMSLE) metric

    :param y_true: The ground truth labels given in the dataset
    :param y_pred: Our predictions

    :return: The RMSLE score
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [None]:
Let's define here alternarive cross-validation procedure when model fitted with every fold of cv gives predictions concerning
test dataframe. Then we count average on these n_splits prediction models. 

In [60]:
def cv_and_predict(
        df_train,
        df_test,
        train_y,
        model,
        n_splits=5,
        random_state=42,
        metric=RMSLE
):
    """
    Функция для кросс-валидации и предикта на тест

    :param df_train: Трейн-датафрейм
    :param df_test: Тест-датафрейм
    :param train_y: Ответы на трейн
    :param model: Модель, которую мы хотим учить
    :param n_splits: Количество сплитов для KFold
    :param random_state: random_state для KFold

    :return: pred_test: Предсказания на тест; oof_df: OOF предсказания
    """

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # В датафрейме oof_df будут храниться настоящий таргет трейна и OOF предсказания на трейн.
    # Инициализируем prediction_oof нулями и будем заполнять предсказаниями в процессе валидации
    oof_df = pd.DataFrame()
    oof_df["target"] = train_y
    oof_df["prediction_oof"] = np.zeros(oof_df.shape[0])

    # Список с метриками по фолдам
    metrics = list()


    # Кросс-валидация
    for i, (train_index, valid_index) in enumerate(kf.split(df_train, train_y)):

        X_train = df_train.loc[train_index]
        y_train = train_y.loc[train_index].values

        X_valid = df_train.loc[valid_index]
        y_valid = train_y.loc[valid_index].values

        model_kf = deepcopy(model)
        model_kf.fit(X_train, y_train)

        prediction = model_kf.predict(X_valid)

        cur_metric = metric(y_valid, prediction)
        metrics.append(cur_metric)

    return np.mean(metrics)

def kernel(array: np.ndarray):
    return np.array([(2*np.pi)**(-0.5)*np.e**(-2*x**2) for x in array])

In [61]:
# Read input data
train = pd.read_csv('train.csv', sep=',')
test = pd.read_csv('test.csv', sep=',')
test_ids = test["id"]
train_y = train["charges"]
train = train.drop(["charges"], axis=1, inplace=False)
train

Unnamed: 0,age,sex,bmi,children,smoker,region,id
0,43,male,26.030,0,no,northeast,1
1,58,female,28.215,0,no,northwest,2
2,53,male,31.350,0,no,southeast,3
3,54,male,29.200,1,no,southwest,4
4,19,male,34.900,0,yes,southwest,5
...,...,...,...,...,...,...,...
664,18,female,31.350,4,no,northeast,665
665,39,female,23.870,5,no,southeast,666
666,58,male,25.175,0,no,northeast,667
667,37,female,47.600,2,yes,southwest,668


In [None]:
We have categorial features here, we shall use label encoding to transform them into numeric features.

In [62]:
# Label encoding
train['sex']=train['sex'].map({'male':0,'female':1})
train['smoker']=train['smoker'].map({'yes':1,'no':0})
train['region']=train['region'].map({'northwest':0, 'northeast':1,'southeast':2,'southwest':3})

test['sex']=test['sex'].map({'male':0,'female':1})
test['smoker']=test['smoker'].map({'yes':1,'no':0})
test['region']=test['region'].map({'northwest':0, 'northeast':1,'southeast':2,'southwest':3})
train

Unnamed: 0,age,sex,bmi,children,smoker,region,id
0,43,0,26.030,0,0,1,1
1,58,1,28.215,0,0,0,2
2,53,0,31.350,0,0,2,3
3,54,0,29.200,1,0,3,4
4,19,0,34.900,0,1,3,5
...,...,...,...,...,...,...,...
664,18,1,31.350,4,0,1,665
665,39,1,23.870,5,0,2,666
666,58,0,25.175,0,0,1,667
667,37,1,47.600,2,1,3,668


In [None]:
Our features have various scales, we have to standartise their scales for KNN algorithm to work more accurately 

In [63]:
train.iloc[:, :] = preprocessing.StandardScaler().fit(train).transform(train)
test.iloc[:, :] = preprocessing.StandardScaler().fit(test).transform(test)

In [None]:
Now our dataset is ready to be fitted, but, at first, we need to tune hyperparameters of KNN model. Cross-validation will be 
undertaken to compare models' results.

In [64]:
scores = np.zeros(45*5).reshape(45, 5)
for k in range(6, 51):
    for p in range(1, 6):
        model_knn = KNeighborsRegressor(n_neighbors=k, n_jobs=4, p=p, weights=kernel)
        score = cv_and_predict(df_train = train, df_test = test, train_y = train_y, model = model_knn, n_splits=5)
        scores[k-6, p-1] = score

best_k, best_p = np.unravel_index(np.argmin(scores, axis=None), scores.shape)+np.array([6, 1])
print('Best parameters are equal to: {} and {}.'.format(best_k, best_p))
print('Best RMSLE score is equal to: {}.'.format(np.min(scores)))

Best parameters are equal to: 24 and 5.
Best RMSLE score is equal to: 0.5052068282289998.


In [None]:
Unfortunately, the score isn't good enough. We will try to improve it with random forest in the file 'RF'.

In [None]:
We got our model, now we can predict customers' payments from test dataset.

In [65]:
model = KNeighborsRegressor(n_neighbors=best_k, n_jobs=4, weights=kernel, p=best_p)
X_train, y_train = train, train_y
model.fit(X_train, y_train)
pred_test = model.predict(test)

submission = pd.DataFrame()
submission["id"] = test_ids
submission["charges"] = pred_test

submission.to_csv("submission_knn.csv", index=False)