In [None]:
'''# Lab | Comparing regression models


For this lab, we will be using the same dataset we used in the previous labs. We recommend using the same notebook since you will be reusing the same variables you previous created and used in labs. 

### Instructions

1. In this final lab, we will model our data. Import sklearn `train_test_split` and separate the data.
2. Try a simple linear regression with all the data to see whether we are getting good results.
3. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.
4. Use the function to check `LinearRegressor` and `KNeighborsRegressor`.
5. You can check also the `MLPRegressor` for this task!
6. Check and discuss the results.
'''

In [None]:
# Pasting below code from previous labs

In [85]:
import imblearn

# These are the normal libraries
import pandas as pd
import numpy as np

# This is just so that we don't get annoying warnings
import warnings
warnings.filterwarnings('ignore')

# This is the most common viz library in python
import matplotlib.pyplot as plt
%matplotlib inline

# This one is the above on steroids
import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# These Libs are for stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression



In [86]:
df = pd.read_csv(r"C:\Users\filip\OneDrive\Desktop\IRONHACK\Labs\Week5\lab-cleaning-categorical-data\files_for_lab\we_fn_use_c_marketing_customer_value_analysis.csv")

cols = []
for i in range(len(df.columns)):
    cols.append(df.columns[i].lower().replace(' ','_'))
df.columns = cols

df.dtypes # Checking data types

categorical_df = df.select_dtypes(include=['object']) #Saving categorical columns into df

categorical_df.isna().sum() #Checking for null values/No null values

# Checking unique values in each column
categorical_df['customer'].unique() #Categorical
categorical_df['state'].unique() #Categorical
categorical_df['response'].unique() #Categorical
categorical_df['coverage'].unique() #Categorical
categorical_df['education'].unique() #Categorical
categorical_df['effective_to_date'].unique() #Date Time
categorical_df['employmentstatus'].unique() #Categorical
categorical_df['gender'].unique() #Categorical
categorical_df['location_code'].unique() #Categorical
categorical_df['marital_status'].unique() #Categorical
categorical_df['policy_type'].unique() #Categorical
categorical_df['policy'].unique() #Categorical
categorical_df['renew_offer_type'].unique() #Categorical
categorical_df['sales_channel'].unique() #Categorical
categorical_df['vehicle_class'].unique() #Categorical
categorical_df['vehicle_size'].unique() #Categorical

# will drop effective_to_date and later if needed will convert to datetime on the original df

categorical_df = categorical_df.drop(['effective_to_date'], axis = 1)

# will also drop customer because it doesn't really give us any information

categorical_df = categorical_df.drop(['customer'], axis = 1)

categorical_df['policy_type'].unique() #Categorical

'''policy_type has the following unique values:['Corporate Auto', 'Personal Auto', 'Special Auto']

   policy has the following unique values:['Corporate L3', 'Personal L3', 'Corporate L2', 'Personal L1',
    'Special L2', 'Corporate L1', 'Personal L2', 'Special L1','Special L3']
    
It appears that the values in policy are the same as in policy_type but broken down into subtypes, more detailed information.

I would drop policy if in need to drop one of the two''' 

# Variables I would chose to hot encode
categorical_df['response'].unique()
categorical_df['coverage'].unique()
categorical_df['gender'].unique()
categorical_df['location_code'].unique()
categorical_df['marital_status'].unique()
categorical_df['policy_type'].unique()
categorical_df['vehicle_size'].unique()

# In my opinion these columns have too many values to hot encode, will drop for now
categorical_df['state'].unique()
categorical_df['education'].unique()
categorical_df['employmentstatus'].unique()
categorical_df['policy'].unique()
categorical_df['renew_offer_type'].unique()
categorical_df['sales_channel'].unique()
categorical_df['vehicle_class'].unique()

array(['Two-Door Car', 'Four-Door Car', 'SUV', 'Luxury SUV', 'Sports Car',
       'Luxury Car'], dtype=object)

In [87]:
customer_df = pd.read_csv(r"C:\Users\filip\OneDrive\Desktop\IRONHACK\Labs\Week5\lab-cleaning-numerical-data\files_for_lab\we_fn_use_c_marketing_customer_value_analysis.csv")
customer_df.head()
customer_df.shape
customer_df.dtypes

# Renaming columns
cols = []
for i in range(len(customer_df.columns)): 
    cols.append(customer_df.columns[i].lower().replace(' ', '_')) 
customer_df.columns = cols

# Changing effective to date column to datetime format

customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'])

# Creating numerical data frame

numerical_df = customer_df.select_dtypes(exclude=['object'])

# Creating a function to differentiate betweens continuous and discrete variables

def decision(data, threshold=250):
    continuous_cols = []
    discrete_cols = []
    for column in data.columns:
        unique_count = data[column].nunique()
        if unique_count <= threshold:
            discrete_cols.append(column)
        else:
            continuous_cols.append(column)
    return continuous_cols, discrete_cols


continuous_columns, discrete_columns = decision(numerical_df)
print("Continuous columns:")
print(continuous_columns)
print("\nDiscrete columns:")
print(discrete_columns)
    
continuous_df = numerical_df[continuous_columns]
discrete_df = numerical_df[discrete_columns]


Continuous columns:
['customer_lifetime_value', 'income', 'total_claim_amount']

Discrete columns:
['effective_to_date', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies']


In [88]:
#Concating both dfs

df = pd.concat([numerical_df, categorical_df], axis = 1)

# Dropping outliers

def outliers_drop(data, columns, threshold=1.5):
    df_c = df.copy()
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        df_c = df_c[(df_c[column] >= lower_bound) & (df_c[column] <= upper_bound)]

    return df_c


clean_df = outliers_drop(df, ['total_claim_amount', 'income'], threshold=1.5).reset_index(drop = True)

# Copying df 

df_clean = clean_df.copy()

# Normalize continuous variables

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_continuous = scaler.fit_transform(continuous_df)
normalized_continuous = pd.DataFrame(normalized_continuous, columns=continuous_df.columns)

#Encode the categorical variables

cat_to_encode = categorical_df[['response', 'education', 'gender', 'state', 'marital_status', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'vehicle_class']]

cat_encoded = pd.get_dummies(cat_to_encode, drop_first=True)

# Leaving others as ordinal

cat_ordinal = categorical_df[['coverage', 'employmentstatus', 'location_code','vehicle_size']]


coverage_ordinal = {'Basic': 1, 'Extended': 2, 'Premium': 3}

cat_ordinal['coverage'] = cat_ordinal['coverage'].map(coverage_ordinal)



employmentstatus_ordinal = {'Employed': 1, 'Unemployed': 2, 'Medical Leave': 3, 'Disabled': 4, 'Retired': 5}

cat_ordinal['employmentstatus'] = cat_ordinal['employmentstatus'].map(employmentstatus_ordinal)



location_code_ordinal = {'Suburban': 1, 'Rural': 2, 'Urban': 3}

cat_ordinal['location_code'] = cat_ordinal['location_code'].map(location_code_ordinal)



vehicle_size_ordinal = {'Small': 1, 'Medsize': 2, 'Large': 3}

cat_ordinal['vehicle_size'] = cat_ordinal['vehicle_size'].map(vehicle_size_ordinal)

# converting to numeric

cat_ordinal['coverage'] = pd.to_numeric(cat_ordinal['coverage'])
cat_ordinal['employmentstatus'] = pd.to_numeric(cat_ordinal['employmentstatus'])
cat_ordinal['location_code'] = pd.to_numeric(cat_ordinal['location_code'])
cat_ordinal['vehicle_size'] = pd.to_numeric(cat_ordinal['vehicle_size'])

# The time variable can be useful. 
#Try to transform its data into a useful one. Hint: Day week and month as integers might be useful.

# effective_to_date already in date time

# Since the model will only accept numerical data, check and make sure that every column is numerical, 
#if some are not, change it using encoding.

# Will normalize discrete_df as well

discrete_df['effective_to_date'] = pd.to_datetime(discrete_df['effective_to_date']).apply(lambda x: x.timestamp())

scaler = MinMaxScaler()
discrete_df['effective_to_date'] = scaler.fit_transform(discrete_df[['effective_to_date']])


discrete_df = discrete_df.drop(['effective_to_date'], axis=1)
scaler = MinMaxScaler()
normalized_discrete = scaler.fit_transform(discrete_df)
normalized_discrete = pd.DataFrame(normalized_discrete, columns=discrete_df.columns)


# Will concat all the data

final_df = pd.concat([normalized_continuous, cat_ordinal, normalized_discrete, cat_encoded], axis = 1)

In [89]:
# End of code from previous labs

In [98]:
# 1. In this final lab, we will model our data. Import sklearn `train_test_split` and separate the data.

model_df = pd.concat([continuous_df, cat_ordinal, discrete_df, cat_encoded], axis = 1)

from sklearn.model_selection import train_test_split

X = model_df.drop('total_claim_amount', axis = 1) 
y = model_df['total_claim_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


In [99]:
# 2. Try a simple linear regression with all the data to see whether we are getting good results.

X = sm.add_constant(X)
model = sm.OLS(Y,X).fit()

model.summary()


0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.626
Model:,OLS,Adj. R-squared:,0.624
Method:,Least Squares,F-statistic:,361.9
Date:,"Sun, 12 Nov 2023",Prob (F-statistic):,0.0
Time:,13:42:42,Log-Likelihood:,12523.0
No. Observations:,9134,AIC:,-24960.0
Df Residuals:,9091,BIC:,-24650.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0572,0.007,7.957,0.000,0.043,0.071
customer_lifetime_value,-8.085e-08,1.03e-07,-0.786,0.432,-2.82e-07,1.21e-07
income,-5.936e-07,2.76e-08,-21.491,0.000,-6.48e-07,-5.39e-07
coverage,-0.0047,0.002,-1.989,0.047,-0.009,-6.79e-05
employmentstatus,0.0020,0.001,2.516,0.012,0.000,0.004
location_code,-0.0365,0.001,-38.112,0.000,-0.038,-0.035
vehicle_size,-0.0066,0.001,-5.428,0.000,-0.009,-0.004
monthly_premium_auto,0.0019,9.36e-05,20.004,0.000,0.002,0.002
months_since_last_claim,-2.047e-06,6.42e-05,-0.032,0.975,-0.000,0.000

0,1,2,3
Omnibus:,1262.421,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9975.145
Skew:,0.419,Prob(JB):,0.0
Kurtosis:,8.051,Cond. No.,1.27e+16


In [100]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, Y_train)
lm.score(X_train, Y_train)

0.6314366885272034

In [101]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, Y_train)
lm.score(X_train, Y_train)

predictions = lm.predict(X_test)

print("R2 value is = ",round(r2_score(Y_test, predictions),2))
print("The intercept of the model is = ",lm.intercept_)
print("The coefficients of the model are = ",lm.coef_)

mse = mean_squared_error(Y_test, predictions)
mae = mean_absolute_error(Y_test, predictions)
print("The mse of the model is = ", round(mse,2))
print("The root mse of the model is = ",round(np.sqrt(mse),2))
print("The mean absolute error of the model is = ",round(mae,2))

R2 value is =  0.6
The intercept of the model is =  0.0600495268418009
The coefficients of the model are =  [-2.50175776e-08 -6.04641762e-07 -5.17031526e-03  2.00604622e-03
 -3.62234889e-02 -5.42968942e-03  1.88014254e-03  6.20439551e-07
 -5.20221648e-06 -3.29649010e-04  1.83556225e-04 -5.02175513e-03
 -2.60980681e-03 -1.48844863e-02  7.60523058e-03 -1.25159937e-02
  7.46117720e-03  7.88284772e-04  5.20288099e-04 -1.27995037e-03
  6.11982978e-04 -2.22973706e-03  3.00577420e-02 -1.42659094e-03
  3.90294461e-03 -2.40833539e-03 -5.06370979e-03  1.05392026e-03
 -1.42149961e-03 -1.05901160e-03  8.43972424e-03  1.39611928e-03
 -5.93289891e-03  8.31955891e-03  2.31358881e-03 -3.37687692e-04
 -5.91048594e-04 -1.98381015e-03  7.78273289e-04  1.21544676e-02
 -1.36537137e-02 -5.07516297e-03 -1.59914716e-02  2.61849941e-03]
The mse of the model is =  0.0
The root mse of the model is =  0.06
The mean absolute error of the model is =  0.04


In [94]:
# 3. Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

def test_regression_model(X, Y):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
    
    models = {
        'Linear Regression': LinearRegression(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'MLPRegressor': MLPRegressor()
    }
    
    results = {}
    
    for model_name, model in models.items():
        model.fit(X_train, Y_train)
        
        
        predictions = model.predict(X_test)
        
        r2 = r2_score(Y_test, predictions)
        mse = mean_squared_error(Y_test, predictions)
        mae = mean_absolute_error(Y_test, predictions)
        
        results[model_name] = {
            'R2': round(r2, 2),
            'Root MSE': round(np.sqrt(mse),2),
            'MAE': round(mae, 2)
        }
    
    return models

In [95]:
models = test_regression_model(X, Y)

for model, metrics in results.items():
    print(f"Model: {model}")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print() 

Model: Linear Regression
R2: 0.6
Root MSE: 0.06
MAE: 0.04

Model: KNeighborsRegressor
R2: 0.56
Root MSE: 0.06
MAE: 0.04

Model: MLPRegressor
R2: 0.66
Root MSE: 0.06
MAE: 0.04

