In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor


# ----------------------------------- Notebook from last lab -----------------------------------

# Importing numerical data
df_numerical = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-cleaning-numerical-data\numerical.csv")

# Importing categorical data
df_categ = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-cleaning-categorical-data\categorical.csv")

# Concatenating numerical and categorical dataframes
df_concat = pd.concat([df_numerical, df_categ], axis = 1)

# Removing outliers
for col in df_numerical.columns:
    Q1 = df_concat[col].quantile(0.25)
    Q3 = df_concat[col].quantile(0.75)
    IQR = Q3-Q1
    df_concat = df_concat[(df_concat[col] >= Q1-1.5*IQR) & (df_concat[col] <= Q3+1.5*IQR)]   
    
df_concat = df_concat.reset_index(drop = True) 

# Creating a copy of the dataframe for the data wrangling
df_wrangling = df_concat

# Normalizing the continuous variables
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df_numerical)
normalized_data = pd.DataFrame(normalized_data, columns = df_numerical.columns)

for col in df_numerical.columns:
    if df_numerical[col].dtype == "float":
        df_wrangling[col] = normalized_data[col]
        
# Encoding the categorical variables
df_wrangling["coverage"] = df_wrangling["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})
df_wrangling["education"] = df_wrangling["education"].map({"High School or Below" : 0, "Bachelor" : 1, "College" : 2, "Master" : 3, "Doctor" : 4})
df_wrangling["location_code"] = df_wrangling["location_code"].map({"Rural" : 0, "Suburban" : 1, "Urban" : 2})
df_wrangling["renew_offer_type"] = df_wrangling["renew_offer_type"].map({"Offer1" : 0, "Offer2" : 1, "Offer3" : 2, "Offer4" : 3})
df_wrangling["vehicle_size"] = df_wrangling["vehicle_size"].map({"Small" : 0, "Medsize" : 1, "Large" : 2})

df_categ_no_outliers = df_wrangling.select_dtypes(include = object)
dummies = pd.get_dummies(df_categ_no_outliers, drop_first = False)

df_numerical_no_outliers = df_wrangling.select_dtypes(include = np.number)
df_wrangling = pd.concat([df_numerical_no_outliers, dummies], axis = 1)

df_wrangling

# ---------------------------------------------------------------------------------------------------------

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,coverage,education,...,policy_Special L2,policy_Special L3,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0.010629,0.562847,69,32,5,0,1,0.132974,0,1,...,0,0,1,0,0,0,0,0,0,1
1,0.062406,0.000000,108,18,38,0,2,0.391051,2,1,...,0,0,1,0,0,0,0,0,0,1
2,0.134960,0.487763,106,18,65,0,7,0.195764,0,1,...,0,0,0,0,1,0,0,1,0,0
3,0.070589,0.000000,73,12,44,0,1,0.183117,0,1,...,0,0,1,0,0,0,1,0,0,0
4,0.011245,0.438443,69,14,94,0,2,0.047710,0,1,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5883,0.041153,0.000000,64,6,48,0,3,0.074867,0,2,...,0,0,0,0,1,0,0,0,0,1
5884,0.111893,0.516528,104,16,58,0,1,0.469490,2,2,...,0,0,0,1,0,0,1,0,0,0
5885,0.151914,0.171603,79,14,28,0,1,0.266371,1,2,...,0,0,0,1,0,0,1,0,0,0
5886,0.042095,0.000000,96,34,3,0,3,0.171570,1,2,...,0,0,0,1,0,0,1,0,0,0


In [2]:
# 2) Trying a simple linear regression with all the data

# X-y split
X = df_wrangling.drop("total_claim_amount", axis=1)
y = df_wrangling["total_claim_amount"]

# lm model
lm = linear_model.LinearRegression()
model = lm.fit(X, y)
y_predictions = lm.predict(X)

# Calculating errors
r2 = r2_score(y, y_predictions)
mse = mean_squared_error(y, y_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_predictions)

print("R2 of y predictions is =", round(r2, 4))
print("The mean squared error of the model is =", round(mse, 4))
print("The root mean squared error of the model is =", round(rmse, 4))
print("The mean absolute error of the model is =", round(mae, 4))

R2 of y predictions is = 0.1857
The mean squared error of the model is = 0.0081
The root mean squared error of the model is = 0.0901
The mean absolute error of the model is = 0.0649


In [3]:
# 3) Defining a function that takes a list of models and train (and tests) them

def L_model(data_frame, target_variable):
    
    # X-y split
    X = data_frame.drop(target_variable, axis=1)
    y = data_frame[target_variable]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # lm model
    lm = linear_model.LinearRegression()
    model = lm.fit(X_train, y_train)
    y_predictions = model.predict(X_test)

    # Calculating errors
    r2 = r2_score(y_test, y_predictions)
    mse = mean_squared_error(y_test, y_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_predictions)

    print("R2 of y_test predictions is =", round(r2, 4), "(LinearRegression)")
    print("The mean squared error of the model is =", round(mse, 4), "(LinearRegressior)")
    print("The root mean squared error of the model is =", round(rmse, 4), "(LinearRegressior)")
    print("The mean absolute error of the model is =", round(mae, 4), "(LinearRegressior)")



def KNN_model(data_frame, target_variable, k_neighbors):

    # X-y split
    X = data_frame.drop(target_variable, axis=1)
    y = data_frame[target_variable]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    # KNN model
    KNN_model = KNeighborsRegressor(n_neighbors = k_neighbors)
    model = KNN_model.fit(X_train, y_train)
    y_predictions = model.predict(X_test)
    
    # Calculating errors
    r2 = r2_score(y_test, y_predictions)
    mse = mean_squared_error(y_test, y_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_predictions)

    print("R2 of y_test predictions is =", round(r2, 4), "(KNeighborsRegression)")
    print("The mean squared error of the model is =", round(mse, 4), "(KNeighborsRegression)")
    print("The root mean squared error of the model is =", round(rmse, 4), "(KNeighborsRegression)")
    print("The mean absolute error of the model is =", round(mae, 4), "(KNeighborsRegression)")

In [4]:
# 4) Using the function to check LinearRegressor and KNeighborsRegressor
L_model(df_wrangling, "total_claim_amount")
print("------------------------------------------------------------------------------")
KNN_model(df_wrangling, "total_claim_amount", 10)

R2 of y_test predictions is = 0.152 (LinearRegression)
The mean squared error of the model is = 0.0071 (LinearRegressior)
The root mean squared error of the model is = 0.0845 (LinearRegressior)
The mean absolute error of the model is = 0.0639 (LinearRegressior)
------------------------------------------------------------------------------
R2 of y_test predictions is = -0.0722 (KNeighborsRegression)
The mean squared error of the model is = 0.009 (KNeighborsRegression)
The root mean squared error of the model is = 0.095 (KNeighborsRegression)
The mean absolute error of the model is = 0.0723 (KNeighborsRegression)


In [5]:
# 5) "You can check also the MLPRegressor for this task"

# I searched the model and I think I could applied it, but we haven't yet coded this type of model during classes.
# I don't know how to choose the number of hidden layers and their meaning. I'll wait for Gonçalo to teach this in depth.

In [6]:
# 6) Checking and discussing the results

# Obviously the best result was obtained when I evaluated the model on the data that was trained (with all the data).
# The Linear Regression model got the best results. Still the results were very bad. 
# In KNeighborsRegression the R2 was negative, which means that the chosen model doesn't follow the trend of the data.
# A possible reason for that is the huge amount of columns created with the "One hot encoding" process.

# I'm gonna try those models in df_numerical and check if I can get better results:
L_model(df_numerical, "total_claim_amount")
print("------------------------------------------------------------------------------")
KNN_model(df_numerical, "total_claim_amount", 10)

R2 of y_test predictions is = 0.5013 (LinearRegression)
The mean squared error of the model is = 42358.1762 (LinearRegressior)
The root mean squared error of the model is = 205.811 (LinearRegressior)
The mean absolute error of the model is = 146.5861 (LinearRegressior)
------------------------------------------------------------------------------
R2 of y_test predictions is = 0.221 (KNeighborsRegression)
The mean squared error of the model is = 66162.2173 (KNeighborsRegression)
The root mean squared error of the model is = 257.2202 (KNeighborsRegression)
The mean absolute error of the model is = 180.4457 (KNeighborsRegression)


In [7]:
# Looking for the max R2 using KNeighborsRegression in df_numerical

r2_max = -9999
for k_neighbors in range(1,101):
    
    # X-y split
    X = df_numerical.drop("total_claim_amount", axis=1)
    y = df_numerical["total_claim_amount"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    # KNN model
    KNN_model = KNeighborsRegressor(n_neighbors = k_neighbors)
    model = KNN_model.fit(X_train, y_train)
    y_predictions = model.predict(X_test)
    
    # Calculating errors
    r2 = r2_score(y_test, y_predictions)
    
    if r2 > r2_max:
        r2_max = r2
        k = k_neighbors

print("The best R2 of y_test predictions using KNeighborsRegression is =", round(r2_max, 4))
print("The correspondent kk_neighbors is =", k)

The best R2 of y_test predictions using KNeighborsRegression is = 0.2377
The correspondent kk_neighbors is = 24


In [8]:
# The KNN results improved, but they are still far from good.
# This data needs to be further wrangled and cleaned more carefully to obtain better results.