In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


In [2]:
customer_df = pd.read_csv("/Users/giulianamiranda/Documents/Labs/lab-comparing-regression-models/cleaned_data.csv", sep = ',')
customer_df

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,total_claim_amount,number_of_open_complaints,number_of_policies,response,coverage,...,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,day_of_week,month,gender_F,gender_M
0,2763.519279,56274,69,32,5,384.811147,0,1,0,0,...,0,0,0,0,0,1,3,2,1,0
1,6979.535903,0,94,13,42,1131.464935,0,8,0,1,...,1,0,0,0,0,0,0,1,1,0
2,12887.431650,48767,108,18,38,566.472247,0,2,0,2,...,0,0,0,0,0,1,5,2,1,0
3,7645.861827,0,106,18,65,529.881344,0,7,0,0,...,0,0,0,1,0,0,3,1,0,1
4,2813.692575,43836,73,12,44,138.130879,0,1,0,0,...,1,0,0,0,0,0,3,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8312,4100.398533,47761,104,16,58,541.282007,0,1,0,2,...,1,0,0,0,0,0,3,1,1,0
8313,3096.511217,21604,79,14,28,379.200000,0,1,1,1,...,1,0,0,0,0,0,5,2,1,0
8314,8163.890428,0,85,9,37,790.784983,3,2,0,1,...,1,0,0,0,0,0,6,2,0,1
8315,7524.442436,21941,96,34,3,691.200000,0,3,0,1,...,1,0,0,0,0,0,3,2,0,1


In [3]:
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'])
customer_df['day_of_week'] = customer_df['effective_to_date'].dt.dayofweek
customer_df['month'] = customer_df['effective_to_date'].dt.month


# I'm dropping the effective_to_date because I've extracted the month and the day of the week
customer_df = customer_df.drop(['effective_to_date'], axis=1)

In [4]:
# In this final lab, we will model our data. Import sklearn train_test_split and separate the data.


from sklearn.model_selection import train_test_split


X = customer_df.drop('total_claim_amount', axis=1)  
y = customer_df['total_claim_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
lm = linear_model.LinearRegression()
model = lm.fit(X,y)
model.score(X,y)

predictions = lm.predict(X_test) 
    
print("R2 value is = ", round(r2_score(y_test, predictions), 2))
print("The intercept of the model is = ", lm.intercept_)
print("The coefficients of the models are: ", lm.coef_)

#### The R2 is very low, of only 0.6. It is not a very good model

R2 value is =  0.6
The intercept of the model is =  -4.054942568863453
The coefficients of the models are:  [-1.29265452e-03 -1.65052098e-03  5.69692823e+00 -3.51695532e-02
 -8.75741396e-02  8.26821831e-02  8.25219759e-01  1.96927612e+01
 -9.23416225e+00 -1.92094304e+01 -2.46137566e+01  1.14482327e+02
 -3.74444280e+01  2.17694776e+00  6.75924973e-01 -1.45281298e+00
 -4.49067841e+00  3.09061866e+00 -1.85207443e+01 -3.55670438e+01
  5.40877882e+01 -3.51739747e+00 -4.11345772e-01  3.92874325e+00
 -4.57952516e+00 -7.74487812e-01  1.83661549e+00  4.88936580e+00
 -3.19218223e+00 -2.10852935e+00  3.49181590e+01 -1.87633976e+01
 -1.22260182e+01 -3.51739747e+00 -4.11345772e-01  3.92874325e+00
 -1.20191869e+01  1.09230424e+01 -3.06515420e+00  4.16129871e+00
 -8.62257506e-01  1.87759729e+00  1.50234771e-01 -1.16557455e+00
  1.04094484e+01  3.07878427e+01 -5.65554945e+00 -1.83362664e+01
 -3.21253387e+01  1.49198634e+01  5.08618555e-01  3.51324078e+00
 -1.62907743e+01  1.62907743e+01]


In [6]:
# Now define a function that takes a list of models and train (and tests) them so we can try a lot of them 
# without repeating code.
# Use the function to check LinearRegressor and KNeighborsRegressor.
# You can check also the MLPRegressor for this task!



def train_test_models(X,y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Linear Regression': LinearRegression(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'MLPRegressor': MLPRegressor()
    }
    
   
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
     
        print(f"Model: {type(model).__name__}")
        print(f"R2 Score: {r2:.2f}")
        print(f"Mean Squared Error: {mse:.2f}")
        print(f"Mean Absolute Error: {mae:.2f}")
        print("="*50)


train_test_models(X,y)   

Model: LinearRegression
R2 Score: 0.59
Mean Squared Error: 28446.26
Mean Absolute Error: 126.81
Model: KNeighborsRegressor
R2 Score: 0.20
Mean Squared Error: 55324.68
Mean Absolute Error: 166.58
Model: MLPRegressor
R2 Score: 0.56
Mean Squared Error: 30474.10
Mean Absolute Error: 126.57


In [7]:
# Check and discuss the results

# The Linear Regression model had the best result of the 3, but the R2 of 0.59 shows it 
# has a lot of room for improvement

# The MLPRegressor was slighty worse, with R2 of 0.54. It probably could be improved as well

# KNeighborsRegressor was outperformed by the other two and had a poor outcome
