# Loading libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_rows', 200)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  # better to use dummy from pandas 
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50

# Loading & cleaning data

In [None]:
# Read Data.csv once: cs_df keeps the frame exactly as loaded, df2 is an
# independent copy that the later cells work on.
cs_df = pd.read_csv("Data.csv")
df2 = cs_df.copy(deep=True)
df2

In [None]:
def preprocessing(data=None, iqr_factor=1.5):
    """Return a cleaned copy of the data frame.

    Steps, in order:

    1. drop the 'Unnamed: 0', 'Customer' and 'Number of Open Complaints' columns;
    2. treat an "Income" of 0 as a missing value;
    3. drop rows with missing values, then duplicated rows;
    4. drop IQR outliers of "Monthly Premium Auto", then of
       "Customer Lifetime Value" (the quartiles of the second column are
       computed after the first filter has been applied).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df2`` is used.
    iqr_factor : float, default 1.5
        A value is kept when it lies within
        ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]`` (bounds inclusive).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. The original row labels
        are kept (the index is not reset).

    Raises
    ------
    KeyError
        If one of the columns named above is missing from the frame.
    """
    # Only read the notebook-level frame; never rebind or mutate it here.
    frame = df2 if data is None else data

    cleaned = (
        frame
        .drop(columns=['Unnamed: 0', 'Customer', 'Number of Open Complaints'])
        # replace 0 for "Income" column with missing values
        .assign(Income=lambda d: d["Income"].replace(0, np.nan))
        # drop missing values and duplicates from entire dataframe
        .dropna()
        .drop_duplicates()
    )

    # Remove outliers one column at a time. Both bounds are evaluated on the
    # frame being filtered, so the boolean mask always shares its index.
    for column in ("Monthly Premium Auto", "Customer Lifetime Value"):
        q1 = cleaned[column].quantile(0.25)
        q3 = cleaned[column].quantile(0.75)
        iqr = q3 - q1    # IQR is interquartile range.

        within_bounds = cleaned[column].between(q1 - iqr_factor * iqr,
                                                q3 + iqr_factor * iqr)
        cleaned = cleaned.loc[within_bounds]

    return cleaned

## Model evaluation

In [None]:
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor # K-NN used for regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

# Feature transformer and linear model used by the following cells.
trans = PowerTransformer()
model = LinearRegression()

# X/y split. The target, 'Total Claim Amount', is modelled on the natural-log
# scale; model_performance() maps values back with np.exp before scoring.
# NOTE(review): df2 is used as loaded here -- preprocessing() is defined above
# but never called in this notebook; confirm that is intended.
target = 'Total Claim Amount'
y = np.log(df2[target])
X = df2.drop(columns=[target])


# data splitting: 70 % train / 30 % test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [None]:
# transforming data
trans.fit(X_train)

X_train = trans.transform(X_train)
X_test  = trans.transform(X_test)

X_train = pd.DataFrame(X_train, columns = X.columns)
X_test  = pd.DataFrame(X_test,  columns = X.columns)

In [None]:
#reset indices for y_train and y_test
y_train =y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# split x_train and x_test by numerical and categorical
X_train_numerical = X_train._get_numeric_data()
X_test_numerical = X_test._get_numeric_data()
X_train_categorical = X_train.select_dtypes("object")
X_test_categorical = X_test.select_dtypes("object")

#encode X_train_categorical
X_train_categorical=pd.get_dummies(X_train_categorical, drop_first=True)
pd.DataFrame(OneHotEncoder(drop='first').fit_transform(X_train_categorical).toarray(),
columns=OneHotEncoder(drop='first').fit(X_train_categorical).get_feature_names(input_features=X_train_categorical.columns)).head()

#encode X_test_categorical
X_test_categorical=pd.get_dummies(X_test_categorical, drop_first=True)
pd.DataFrame(OneHotEncoder(drop='first').fit_transform(X_test_categorical).toarray(),
columns=OneHotEncoder(drop='first').fit(X_test_categorical).get_feature_names(input_features=X_test_categorical.columns)).head()

# model
model.fit(X_train_categorical, y_train)

y_pred_train_lm = model.predict(X_train_categorical)
y_pred_test_lm  = model.predict(X_test_categorical)

In [None]:
def model_performance(y_train, y_pred_train, y_test, y_pred_test):
    """Score predictions for the train and test splits on the original scale.

    All four inputs are passed through ``np.exp`` before any metric is
    computed, i.e. they are expected to be on the natural-log scale.

    Returns
    -------
    tuple
        ``(performance, df_train, df_test)`` where ``performance`` has one row
        per metric (columns 'Error_metric', 'Train', 'Test') and ``df_train`` /
        ``df_test`` hold the back-transformed 'Real' and 'Predicted' values.

    Side effect: sets ``pd.options.display.float_format`` to two decimals.
    """

    def _scores(actual, predicted):
        # Same six metrics, in the row order of the 'Error_metric' column.
        squared_error = mean_squared_error(actual, predicted)
        return [
            np.mean(actual - predicted),
            mean_absolute_error(actual, predicted),
            squared_error,
            np.sqrt(squared_error),
            np.mean((np.abs(actual - predicted) / actual) * 100.),
            r2_score(actual, predicted),
        ]

    # Undo the log transform once per input.
    real_train, fitted_train = np.exp(y_train), np.exp(y_pred_train)
    real_test, fitted_test = np.exp(y_test), np.exp(y_pred_test)

    metric_names = ['Mean error', 'Mean absolute error', 'Mean squared error',
                    'Root mean squared error', 'Mean absolute percentual error',
                    'R2']

    performance = pd.DataFrame({'Error_metric': metric_names,
                                'Train': _scores(real_train, fitted_train),
                                'Test': _scores(real_test, fitted_test)})

    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': real_train, 'Predicted': fitted_train})
    df_test = pd.DataFrame({'Real': real_test, 'Predicted': fitted_test})

    return performance, df_train, df_test

## Modelling

### Evaluating the model performance

In [None]:
# Score the linear model on both splits; only the metrics table is kept.
lm_scores = model_performance(y_train, y_pred_train_lm, y_test, y_pred_test_lm)
performance_lm, _, _ = lm_scores
performance_lm

### KNN

In [None]:
# initialize model (set parameters)
# The model is only configured here; it is fitted in the next cell.
neigh = KNeighborsRegressor(n_neighbors=3) # n_neighbors = K, the number of nearest neighbours used per prediction

In [None]:
# Fit the K-NN regressor on the training split; the distance metric is left at its default.
neigh.fit(X_train, y_train) # Minkowski distance with p = 2 -> Euclidean distance

## Making predictions

In [None]:
# make predictions for both splits with the fitted K-NN model
y_pred_train_knn, y_pred_test_knn = neigh.predict(X_train), neigh.predict(X_test)

# Same scoring helper as for the linear model; only the metrics table is kept.
knn_scores = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn)
performance_knn, _, _ = knn_scores
performance_knn

In [None]:
# for loop to try many values of k

models = {}            # fitted regressor for every k, keyed by k (e.g. models[5])
metric_tables = []     # one metrics table per k, concatenated once after the loop

for k in range(2,21):

    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(X_train, y_train)

    # Keep every fitted model instead of overwriting a single entry.
    models[k] = neigh

    y_pred_train_knn = neigh.predict(X_train)
    y_pred_test_knn  = neigh.predict(X_test)

    performance_knn, _, _ = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn)

    # Tag each metric row with its k; no hard-coded row count needed.
    metric_tables.append(performance_knn.assign(k=k)[['k', 'Error_metric', 'Train', 'Test']])

# A single concat avoids re-copying the growing frame on every iteration.
# Row labels repeat per k (index is not reset).
full = pd.concat(metric_tables, axis=0)

full

## Checking for overfitting

In [None]:
# R2 rows only: train vs. test R2 for every k.
full.loc[full['Error_metric'].eq('R2')]

In [None]:
# Long format for plotting: 'variable' holds the split (Train/Test) and
# 'value' the corresponding score, one row per (k, metric, split).
full2 = pd.melt(full, id_vars=['k', 'Error_metric'])
full2

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#metrics = ['Mean error',]'Mean absolute error',...]

fig, ax = plt.subplots(2,3, figsize=(20,10))
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean error'], hue = 'variable', ax = ax[0,0])
ax[0,0].set_xticks(range(2,21))
ax[0,0].set_title("Mean error")
ax[0,0].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean absolute error'], hue = 'variable', ax = ax[0,1])
ax[0,1].set_xticks(range(2,21))
ax[0,1].set_title("Mean absolute error")
ax[0,1].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean squared error'], hue = 'variable', ax = ax[0,2])
ax[0,2].set_xticks(range(2,21))
ax[0,2].set_title("Mean squared error")
ax[0,2].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Root mean squared error'], hue = 'variable', ax = ax[1,0])
ax[1,0].set_xticks(range(2,21))
ax[1,0].set_title("Root mean squared error")
ax[1,0].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'Mean absolute percentual error'], hue = 'variable', ax = ax[1,1])
ax[1,1].set_xticks(range(2,21))
ax[1,1].set_title("Mean absolute percentual error")
ax[1,1].legend(loc='lower right')
sns.lineplot(x = 'k', y = 'value', data = full2[full2['Error_metric'] == 'R2'], hue = 'variable', ax = ax[1,2])
ax[1,2].set_xticks(range(2,21))
ax[1,2].set_title("R2")
ax[1,2].legend(loc='lower right')

## Part 2

### Cross Validation

In [None]:
# cross_val_score needs an estimator and data, and has to be imported first.
# Here: 10-fold cross-validated R2 of a K-NN regressor with K = 3 (the value
# used for the first K-NN model above), evaluated on the training split only.
from sklearn.model_selection import cross_val_score

cross_val_score(KNeighborsRegressor(n_neighbors=3), X_train, y_train, scoring="r2", cv=10)

In [None]:
from sklearn.model_selection import GridSearchCV # RandomizedSearchCV

# initialize model (no parameters)
neigh = KNeighborsRegressor()

# define grid search: every K in 2..20 combined with both weighting schemes
knn_param_grid = {
    "n_neighbors": range(2,21),
    "weights": ["uniform", "distance"],
}
neigh_search = GridSearchCV(
    estimator=neigh,
    param_grid=knn_param_grid,
    scoring="r2",
    cv=10,  # K-Fold cross validation -> cv = K-Folds
)

# all possible metrics here:
# from sklearn.metrics import SCORERS
# sorted(SCORERS.keys())
# NOTE(review): recent scikit-learn releases expose sklearn.metrics.get_scorer_names()
# instead of SCORERS -- check against the installed version.

In [None]:
# Run the grid search on the training split only; the test split is not passed in.
neigh_search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of each candidate (R2, per the scoring argument of the search).
neigh_search.cv_results_["mean_test_score"]

In [None]:
# Parameter combination of each candidate; paired position-by-position with the scores in the sorting cell below.
neigh_search.cv_results_["params"]

In [None]:
cv_res = neigh_search.cv_results_
# Rank candidates from best to worst mean score. Sort on the score alone: with
# plain tuple ordering a tie would fall through to comparing the parameter
# dicts, which raises TypeError.
sorted(zip(cv_res["mean_test_score"], cv_res["params"]),
       key=lambda scored: scored[0], reverse=True)

#### Randomized search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the randomized search:
# 19 K values (2..20) * 2 possible weights = 38 combinations
param_distributions = dict(
    n_neighbors=range(2,21),
    weights=["distance", "uniform"],
)

In [None]:
# Randomized search over param_distributions, scored with 10-fold CV R2.
# random_state fixes which candidates are drawn (and in which order), so the
# cell gives the same result on every run.
# NOTE(review): the search space holds 19 * 2 = 38 combinations, fewer than
# n_iter=40; scikit-learn is expected to warn and evaluate all of them, which
# makes this equivalent to the grid search -- lower n_iter for a real sample.
neigh_randsearch = RandomizedSearchCV(estimator=neigh,
                                      param_distributions=param_distributions,
                                      n_iter=40,
                                      scoring="r2",
                                      cv=10,
                                      random_state=42)

In [None]:
# Run the randomized search on the training split only; the test split is not passed in.
neigh_randsearch.fit(X_train, y_train)

In [None]:
cv_res = neigh_randsearch.cv_results_
# Rank the sampled candidates from best to worst mean score. Sort on the score
# alone: with plain tuple ordering a tie would fall through to comparing the
# parameter dicts, which raises TypeError.
sorted(zip(cv_res["mean_test_score"], cv_res["params"]),
       key=lambda scored: scored[0], reverse=True)