In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

pd.set_option('display.max_columns', None)
from helper_functions import separate_continous_discrete, print_correlation_matrix, plot_discrete_variables, plot_continuous_variables, plot_outliers, remove_outlier_iqr, remove_outlier_cutoff_between

## Instructions
1. Concatenate Numerical and Categorical dataframes into one dataframe called data. Split into X=features y=target (total_claim_amount).

In [2]:
categorical_df = pd.read_csv('categorical.csv')
numerical_df = pd.read_csv('numerical.csv')
display(categorical_df.shape)
display(numerical_df.shape)

(8290, 18)

(8290, 8)

2. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

In [3]:
data = pd.concat([categorical_df, numerical_df], axis=1)
data.dtypes

customer                          object
state                             object
response                           int64
coverage                          object
education                         object
effective_to_date                 object
employmentstatus                  object
gender                            object
location_code                     object
marital_status                    object
policy_type                       object
policy                             int64
renew_offer_type                   int64
sales_channel                     object
vehicle_class                     object
vehicle_size                      object
effective_to_month_in_2011         int64
CW                                 int64
customer_lifetime_value          float64
income                             int64
monthly_premium_auto               int64
months_since_last_claim            int64
months_since_policy_inception      int64
number_of_open_complaints          int64
number_of_polici

In [4]:
data.drop(['effective_to_date', 'customer'], axis=1, inplace=True)
data

Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size,effective_to_month_in_2011,CW,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,Washington,0,Basic,Bachelor,Employed,F,Suburban,Married,Corporate,3,1,Agent,Two-Door Car,Midsize,2,8,2763.519279,56274,69,32,5,0,1,384.811147
1,Arizona,0,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal,3,3,Agent,Four-Door Car,Midsize,1,5,6979.535903,0,94,13,42,0,8,1131.464935
2,Nevada,0,Premium,Bachelor,Employed,F,Suburban,Married,Personal,3,1,Agent,Two-Door Car,Midsize,2,7,12887.431650,48767,108,18,38,0,2,566.472247
3,California,0,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate,2,1,Call Center,SUV,Midsize,1,3,7645.861827,0,106,18,65,0,7,529.881344
4,Washington,0,Basic,Bachelor,Employed,M,Rural,Single,Personal,1,1,Agent,Four-Door Car,Midsize,2,5,2813.692575,43836,73,12,44,0,1,138.130879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8285,California,0,Premium,College,Employed,F,Suburban,Single,Personal,2,1,Branch,Four-Door Car,Large,1,1,4100.398533,47761,104,16,58,0,1,541.282007
8286,California,1,Extended,College,Employed,F,Suburban,Divorced,Corporate,3,1,Branch,Four-Door Car,Midsize,2,6,3096.511217,21604,79,14,28,0,1,379.200000
8287,California,0,Extended,Bachelor,Unemployed,M,Suburban,Single,Corporate,2,1,Branch,Four-Door Car,Midsize,2,5,8163.890428,0,85,9,37,3,2,790.784983
8288,California,0,Extended,College,Employed,M,Suburban,Married,Personal,2,3,Branch,Four-Door Car,Large,2,5,7524.442436,21941,96,34,3,0,3,691.200000


In [5]:
# import at the top of the file
X = data.drop(columns=['total_claim_amount'])  
y = data['total_claim_amount']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

3. Separate X_train and X_test into numerical and categorical (X_train_cat , X_train_num , X_test_cat , X_test_num)

In [6]:
X_train_cat = X_train.select_dtypes(include=['object'])
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_test_cat = X_test.select_dtypes(include=['object'])
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

4. Use X_train_num to fit scalers. Transform BOTH X_train_num and X_test_num.

In [7]:
scaler = MinMaxScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

5. Encode the categorical variables X_train_cat and X_test_cat (See the hint below for encoding categorical data!!!)

 Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

In [8]:
# ***********************************************
# Hint for Categorical Variables
# You should deal with the categorical variables as shown below (for ordinal encoding, dummy code has been provided as well):

# Encoder Type	Column
# One hot	state
# Ordinal	coverage
# Ordinal	employmentstatus
# Ordinal	state
# One hot	marital status
# One hot	policy type
# One hot	policy
# One hot	renew_offer_type
# One hot	sales channel
# One hot	vehicle class
# Ordinal	vehicle size
# Ordinal	education
# One hot	response
# One hot	gender
# Dummy code
# data["coverage"] = data["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})

# given that column "coverage" in the dataframe "data" has three categories:

# "basic", "extended", and "premium" values are to be represented in the same order.

# ******************************************************

In [9]:
# define hot and ordinal columns
hot_columns = ['state', 'marital_status', 'policy_type', 'sales_channel', 'vehicle_class', 'gender']
ordinal_columns = ['coverage', 'employmentstatus', 'state', 'vehicle_size', 'education', 'location_code']

In [10]:
# ordinal encode all columns that are in ordinal_columns
ordinal_encoder = OrdinalEncoder()
X_train_cat_ordinal = ordinal_encoder.fit_transform(X_train_cat[ordinal_columns])
X_test_cat_ordinal = ordinal_encoder.transform(X_test_cat[ordinal_columns])

# one hot encode all columns that are in hot_columns
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_cat_onehot = onehot_encoder.fit_transform(X_train_cat[hot_columns])
X_test_cat_onehot = onehot_encoder.transform(X_test_cat[hot_columns])



In [11]:
# concatenate
X_train = np.concatenate([X_train_cat_ordinal, X_train_cat_onehot], axis=1)
X_test = np.concatenate([X_test_cat_ordinal, X_test_cat_onehot], axis=1)

In [12]:
# concate the ordinals with numerical
X_train = np.concatenate([X_train, X_train_num_scaled], axis=1)
X_test = np.concatenate([X_test, X_test_num_scaled], axis=1)

6. Try a simple linear regression with all the data to see whether we are getting good results.

In [13]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
LR.fit(X_train, y_train)

In [14]:
from sklearn.metrics import r2_score
predictions = LR.predict(X_test)
r2_score(y_test, predictions)

0.5676975336772677

7. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.
Input- list of models, Output- list os results 

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# get's R2 scores for different models 
def train_and_test_models(models, X_train, X_test, y_train, y_test):
    results = []
    
    for model in models:
        model_name = type(model).__name__
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        y_pred = model.predict(X_test)
        
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        results.append((model_name, train_score, test_score, mse, mae))
        
    return pd.DataFrame(results, columns=['Model', 'Train Score', 'Test Score', 'Mean Squared Error', 'Mean Absolute Error'])

8. Use the function to check LinearRegressor and KNeighborsRegressor.

In [16]:
models = [LinearRegression(), KNeighborsRegressor()]
results = train_and_test_models(models, X_train, X_test, y_train, y_test)
results.set_index('Model')

Unnamed: 0_level_0,Train Score,Test Score,Mean Squared Error,Mean Absolute Error
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearRegression,0.586756,0.567698,31921.854369,130.319157
KNeighborsRegressor,0.68595,0.505503,36514.370186,123.489978


9. You can check also the MLPRegressor for this task!
> no thanks :)

10. Check and discuss the results.

- Linear Regression has a slightly lower test score but a lower MSE and a slightly higher MAE compared to K Neighbors Regressor.
- K Neighbors Regressor has a higher test score but a higher MSE and a slightly lower MAE compared to Linear Regression.

- none is objectively better, depends on use case