In [1]:
#importing libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

#importing file from csv
mkt = pd.read_csv(r"C:\Users\pedro\Desktop\Ironhack\lab-customer-analysis-round-6\lab-customer-analysis-round-6\files_for_lab\csv_files\marketing_customer_analysis.csv")

#standardizing columns names
cols = []
for i in range(len(mkt.columns)):
    cols.append(mkt.columns[i].lower().replace(' ','_'))

mkt.columns = cols

#creating dataframes for numerical and categorical variables
mkt_numerical = mkt.select_dtypes(include=[np.number])
mkt_categoricals = mkt.select_dtypes(['object'])

In [2]:
#One Hot Label Encoding 
dummy_data = pd.get_dummies(mkt_categoricals.drop(['customer','effective_to_date'], axis=1), drop_first=True)
dummy_data.head(3)

Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,response_Yes,coverage_Extended,coverage_Premium,education_College,education_Doctor,education_High School or Below,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [3]:
#Concatenating DataFrames
mkt_data = pd.concat([mkt_numerical,dummy_data],axis=1)
mkt_data.head(3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,state_California,state_Nevada,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
0,2763.519279,56274,69,32,5,0,1,384.811147,0,0,...,0,0,0,0,0,0,0,1,1,0
1,6979.535903,0,94,13,42,0,8,1131.464935,0,0,...,0,0,0,0,0,0,0,0,1,0
2,12887.43165,48767,108,18,38,0,2,566.472247,0,1,...,0,0,0,0,0,0,0,1,1,0


In [4]:
# Train-test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mkt_data.drop(['total_claim_amount'], axis=1),mkt_data['total_claim_amount'],test_size=0.2)

print(X_train.shape,X_test.shape)

(7307, 50) (1827, 50)


In [5]:
#applying linear regression model
# in lab4, we found the two variables more correlated with 'total claim amount' were 'income' and 'monthly_premium_auto'
# let's use these two variables as base model for the linear regression

Y = y_train
X = X_train[['income','monthly_premium_auto']]
X = sm.add_constant(X)
model = sm.OLS(Y,X).fit()

model.summary()

0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.522
Model:,OLS,Adj. R-squared:,0.521
Method:,Least Squares,F-statistic:,3980.0
Date:,"Sun, 29 Oct 2023",Prob (F-statistic):,0.0
Time:,23:58:14,Log-Likelihood:,-49077.0
No. Observations:,7307,AIC:,98160.0
Df Residuals:,7304,BIC:,98180.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,60.6191,7.404,8.187,0.000,46.105,75.133
income,-0.0032,7.73e-05,-42.020,0.000,-0.003,-0.003
monthly_premium_auto,5.2915,0.068,77.727,0.000,5.158,5.425

0,1,2,3
Omnibus:,716.467,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4146.956
Skew:,0.284,Prob(JB):,0.0
Kurtosis:,6.647,Cond. No.,153000.0


In [6]:
#model validation: R2,MSE,RMSE and MAE

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

Y = y_train
X =  X_train[['income','monthly_premium_auto']]

lm = linear_model.LinearRegression()
model = lm.fit(X,Y)
lm.score(X,Y) # This is the R^2

predictions = lm.predict(X)

print("R2 value is = ",round(r2_score(Y, predictions),2))
print("The intercept of the model is = ",round(lm.intercept_,3))

mse = round(mean_squared_error(Y, predictions),3)
mae = round(mean_absolute_error(Y, predictions),3)

print("The mse of the model is = ",mse)
print("The root mse of the model is = ",round(np.sqrt(mse),3))
print("The mean absolute error of the model is = ",mae)


R2 value is =  0.52
The intercept of the model is =  60.619
The mse of the model is =  39936.143
The root mse of the model is =  199.84
The mean absolute error of the model is =  143.279
