# Lab | Comparing regression models


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [3]:
customer_df  = pd.read_csv(r'./machine-learning/lab-cleaning-numerical-data/files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv')

In [5]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    return df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]

numerical_cols = customer_df.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_cols:
    customer_df = remove_outliers(customer_df, col)


In [6]:
customer_df.shape

(5888, 24)

In [7]:
df_wrangling = customer_df.copy()

In [8]:
df_wrangling.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
5,OC83172,Oregon,8256.2978,Yes,Basic,Bachelor,1/25/11,Employed,F,62902,...,94,0,2,Personal Auto,Personal L3,Offer2,Web,159.383042,Two-Door Car,Medsize


In [9]:
from sklearn.preprocessing import Normalizer

# Select only numeric columns
numeric_columns = df_wrangling.select_dtypes(include=['int64', 'float64']).columns

# Instantiate the Normalizer
norm = Normalizer()

# Apply normalization to numeric columns
df_wrangling[numeric_columns] = norm.fit_transform(df_wrangling[numeric_columns])


In [10]:
# One hot to state
# Ordinal to coverage
# Ordinal to employmentstatus
# Ordinal to location code
# One hot to marital status
# One hot to policy type
# One hot to policy
# One hot to renew offercustomer_df
# One hot to sales channel
# One hot vehicle class
# Ordinal vehicle size


df_wrangling.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount',
       'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [11]:
categoricals = customer_df.select_dtypes('object')

In [12]:
categoricals = categoricals.drop(['Customer', 'Effective To Date'], axis =1)

In [13]:
categoricals.columns

Index(['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [14]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Select columns for ordinal encoding
ordinal_cols = ['Coverage', 'EmploymentStatus', 'Location Code', 'Vehicle Size']

# Instantiate the OrdinalEncoder
ordinal_enc = OrdinalEncoder()

# Apply ordinal encoding to selected columns
df_wrangling[ordinal_cols] = ordinal_enc.fit_transform(df_wrangling[ordinal_cols])


# Select columns for one-hot encoding
one_hot_cols = ['State', 'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type', 'Sales Channel', 'Vehicle Class']

# Apply one-hot encoding to selected columns
df_wrangling = pd.get_dummies(df_wrangling, columns=one_hot_cols)


In [15]:
# Convert your 'Effective To Date' column to datetime if not already done
df_wrangling['Effective To Date'] = pd.to_datetime(df_wrangling['Effective To Date'], format='%m/%d/%y')

# Create 'Year' column
df_wrangling['Year'] = df_wrangling['Effective To Date'].dt.year

# Create 'Month' column
df_wrangling['Month'] = df_wrangling['Effective To Date'].dt.month_name()

# Create 'Day of Week' column
df_wrangling['Day of Week'] = df_wrangling['Effective To Date'].dt.day_name()

In [16]:
df_wrangling.columns

Index(['Customer', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount',
       'Vehicle Size', 'State_Arizona', 'State_California', 'State_Nevada',
       'State_Oregon', 'State_Washington', 'Marital Status_Divorced',
       'Marital Status_Married', 'Marital Status_Single',
       'Policy Type_Corporate Auto', 'Policy Type_Personal Auto',
       'Policy Type_Special Auto', 'Policy_Corporate L1',
       'Policy_Corporate L2', 'Policy_Corporate L3', 'Policy_Personal L1',
       'Policy_Personal L2', 'Policy_Personal L3', 'Policy_Special L1',
       'Policy_Special L2', 'Policy_Special L3', 'Renew Offer Type_Offer1',
       'Renew Offer Type_Offer2', 'Renew Offer Type_Offer3',
       'Renew Offer Type_Offer4', 'Sales C

In [17]:
df_wrangling.dtypes

Customer                                 object
Customer Lifetime Value                 float64
Response                                 object
Coverage                                float64
Education                                object
Effective To Date                datetime64[ns]
EmploymentStatus                        float64
Gender                                   object
Income                                  float64
Location Code                           float64
Monthly Premium Auto                    float64
Months Since Last Claim                 float64
Months Since Policy Inception           float64
Number of Open Complaints               float64
Number of Policies                      float64
Total Claim Amount                      float64
Vehicle Size                            float64
State_Arizona                             uint8
State_California                          uint8
State_Nevada                              uint8
State_Oregon                            

In [18]:
df_wrangling=df_wrangling.drop(['Customer','Response','Education','Effective To Date','Gender'], axis=1)

In [19]:
df_wrangling = pd.get_dummies(df_wrangling, columns=['Month','Day of Week'])

In [20]:
df_wrangling.dtypes

Customer Lifetime Value          float64
Coverage                         float64
EmploymentStatus                 float64
Income                           float64
Location Code                    float64
Monthly Premium Auto             float64
Months Since Last Claim          float64
Months Since Policy Inception    float64
Number of Open Complaints        float64
Number of Policies               float64
Total Claim Amount               float64
Vehicle Size                     float64
State_Arizona                      uint8
State_California                   uint8
State_Nevada                       uint8
State_Oregon                       uint8
State_Washington                   uint8
Marital Status_Divorced            uint8
Marital Status_Married             uint8
Marital Status_Single              uint8
Policy Type_Corporate Auto         uint8
Policy Type_Personal Auto          uint8
Policy Type_Special Auto           uint8
Policy_Corporate L1                uint8
Policy_Corporate

### 1. In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

In [21]:
from sklearn.model_selection import train_test_split

### 2. Try a simple linear regression with all the data to see whether we are getting good results.

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define the independent variables (features) and the dependent variable (target)

X = df_wrangling.drop('Total Claim Amount', axis=1)
Y = df_wrangling['Total Claim Amount']

# Now we split our data into training set and testing set

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, Y_train)

# Now your model is ready to make predictions
predictions = model.predict(X_test)


# Calculate metrics
mse = mean_squared_error(Y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, predictions)

print(f"R2: {r2}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")


R2: 0.8365782867072896
MSE: 0.00047319888544063865
RMSE: 0.021753135071539427


### 3~5. Define a function that takes a list of models and trains (LinearRegressor, KneighborsRegressor, MLPRegressor)

In [24]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

def train_and_test(models, X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    model_results = {}
    for model in models:
        model_name = type(model).__name__
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)

        mse = mean_squared_error(Y_test, predictions)
        r2 = r2_score(Y_test, predictions)

        model_results[model_name] = {'Mean Squared Error': mse, 'R^2 Score': r2}
    return model_results


models = [LinearRegression(), KNeighborsRegressor(),MLPRegressor(max_iter=500)]


X = df_wrangling.drop('Total Claim Amount', axis=1)
Y = df_wrangling['Total Claim Amount']


results = train_and_test(models, X, Y)
print(results)



{'LinearRegression': {'Mean Squared Error': 0.00047319888544063865, 'R^2 Score': 0.8365782867072896}, 'KNeighborsRegressor': {'Mean Squared Error': 0.0008153437127456081, 'R^2 Score': 0.7184167787393448}, 'MLPRegressor': {'Mean Squared Error': 0.004101370250571929, 'R^2 Score': -0.4164296954589901}}


### 6. Check and discuss the results.

Based on these results, it seems that the Linear Regression model performs the best among the three models on your dataset, given the lower MSE and higher R^2 score. The MLP Regressor seems to perform poorly with the current settings, which suggests it may need more tuning or your problem might not be suitable for it.