In [81]:
#importing libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

import math

#importing file from csv
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs on a machine with this exact directory layout. Consider replacing it
# with a path relative to the notebook once the repository layout is confirmed.
MKT_CSV_PATH = r"C:\Users\pedro\Desktop\Ironhack\lab-customer-analysis-round-7\lab-customer-analysis-round-7\files_for_lab\csv_files\marketing_customer_analysis.csv"
mkt = pd.read_csv(MKT_CSV_PATH)


In [82]:
#function cleaning and processing data

def clean_and_process(df, normalize=False):
    """Tidy column names and build a numeric feature table for modelling.

    Steps:
      1. Rename the columns of ``df`` IN PLACE to lower snake_case
         (``"Total Claim Amount"`` -> ``"total_claim_amount"``).
      2. Split the frame into numeric columns and ``object`` (categorical) columns.
      3. Optionally min-max scale the numeric columns.
      4. One-hot encode the categorical columns (first level dropped), leaving
         out the ``customer`` and ``effective_to_date`` columns when present.
      5. Concatenate numeric columns and dummies side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Its column labels are modified in place.
    normalize : bool, default False
        When True, every numeric column is rescaled with ``MinMaxScaler``
        (this includes any target column that is numeric). The default keeps
        the numeric columns unscaled, which is what this function has always
        returned: the scaled frame used to be computed and then discarded.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by the dummy columns, indexed like ``df``.
    """
    # Rename on the frame that was passed in. This used to assign to the
    # global `mkt`, which only worked when `df` happened to be that object.
    df.columns = [name.lower().replace(' ', '_') for name in df.columns]

    #creating dataframes for numerical and categorical variables
    df_numerical = df.select_dtypes(include=[np.number])
    df_categoricals = df.select_dtypes(['object'])

    #normalizing data (opt-in)
    if normalize:
        # Imported lazily so the default path does not require scikit-learn.
        from sklearn.preprocessing import MinMaxScaler
        scaled_values = MinMaxScaler().fit_transform(df_numerical)
        # Keep the original index so the concat below aligns row by row.
        df_numerical = pd.DataFrame(
            scaled_values,
            columns=df_numerical.columns,
            index=df_numerical.index,
        )

    # errors='ignore': the identifier/date columns are dropped when present,
    # but their absence is not an error.
    # dtype=int: keep 0/1 integers on every pandas version so the result is
    # fully numeric for the regression code that consumes it.
    dummy_data = pd.get_dummies(
        df_categoricals.drop(['customer','effective_to_date'], axis=1, errors='ignore'),
        drop_first=True,
        dtype=int,
    )

    #Concating DataFrames
    df_data = pd.concat([df_numerical, dummy_data], axis=1)
    return df_data

In [83]:
#applying the function
# Builds the modelling table from the raw frame; the call also renames the
# columns of `mkt` itself to lower snake_case.
mkt_data = clean_and_process(df=mkt)

In [84]:
# Train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. random_state is fixed so the split, and therefore
# every metric reported below, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    mkt_data.drop(['total_claim_amount'], axis=1),
    mkt_data['total_claim_amount'],
    test_size=0.2,
    random_state=42,
)

In [85]:
#applying linear regression model
# Baseline OLS on the training split: total_claim_amount explained by
# income and monthly_premium_auto, plus an intercept column.
Y = y_train
X = sm.add_constant(X_train.loc[:, ['income', 'monthly_premium_auto']])
model = sm.OLS(Y, X).fit()

model.summary()

0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.521
Model:,OLS,Adj. R-squared:,0.521
Method:,Least Squares,F-statistic:,3971.0
Date:,"Mon, 30 Oct 2023",Prob (F-statistic):,0.0
Time:,01:09:48,Log-Likelihood:,-49103.0
No. Observations:,7307,AIC:,98210.0
Df Residuals:,7304,BIC:,98230.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,68.1400,7.465,9.128,0.000,53.506,82.774
income,-0.0033,7.72e-05,-43.225,0.000,-0.003,-0.003
monthly_premium_auto,5.2897,0.069,76.942,0.000,5.155,5.425

0,1,2,3
Omnibus:,838.226,Durbin-Watson:,2.04
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5315.975
Skew:,0.359,Prob(JB):,0.0
Kurtosis:,7.117,Cond. No.,154000.0


In [86]:
#model validation
# NOTE(review): X and Y are the training rows, so these are in-sample metrics;
# X_test / y_test are not used here.

# Score the unrounded predictions: rounding them first (as before) adds up to
# 0.005 of noise per prediction to every metric. Only the reported numbers
# are rounded.
ypred = model.predict(X)
mse_exact = mean_squared_error(Y, ypred)
r2 = round(r2_score(Y, ypred), 3)
mse = round(mse_exact, 3)
rmse = round(math.sqrt(mse_exact), 3)
mae = round(mean_absolute_error(Y, ypred), 3)
print("R2:",r2,"MSE:",mse,"RMSE:",rmse,"MAE:",mae)

R2: 0.521 MSE: 40222.125 RMSE: 200.555 MAE: 143.409


In [87]:
# if we use all the variables in the dataset, the model improves
# (R2 from about 0.52 to 0.77, MAE from about 143 to 95)
# but the number of independent variables increases from 2 to 50
# NOTE(review): metrics below are computed on the training rows (in-sample).

Y = y_train
X = sm.add_constant(X_train)
model = sm.OLS(Y, X).fit()

# Score the unrounded predictions; only the reported numbers are rounded.
ypred = model.predict(X)
mse_exact = mean_squared_error(Y, ypred)
r2 = round(r2_score(Y, ypred), 3)
mse = round(mse_exact, 3)
rmse = round(math.sqrt(mse_exact), 3)
mae = round(mean_absolute_error(Y, ypred), 3)
print("R2:",r2,"MSE:",mse,"RMSE:",rmse,"MAE:",mae)

R2: 0.769 MSE: 19403.111 RMSE: 139.295 MAE: 94.713


In [88]:
#let's check which variables we should drop
#identifying which variables are significant (p-value under 0.05);
#everything else is treated as not significant and removed

# One row per fitted term (including the intercept 'const'): its p-value and,
# as a regular column, its name.
pvalues = model.pvalues.rename('pvalue').to_frame()
pvalues['variable'] = pvalues.index

#filtering only significant variables
pvalues_sig = pvalues[pvalues['pvalue'] < 0.05]

# Keep the significant columns of the design matrix. The intercept is removed
# here and re-added before refitting; errors='ignore' because 'const' is only
# among the filtered columns when the intercept itself is significant.
X_sig = X.filter(items=pvalues_sig['variable']).drop(columns=['const'], errors='ignore')

X_sig.head(3)

Unnamed: 0,income,monthly_premium_auto,response_Yes,coverage_Extended,education_College,education_High School or Below,education_Master,employmentstatus_Unemployed,gender_M,location_code_Suburban,location_code_Urban,marital_status_Single,renew_offer_type_Offer2,renew_offer_type_Offer3,vehicle_class_SUV,vehicle_class_Sports Car
66,22547,112,0,0,1,0,0,0,0,1,0,0,0,0,1,0
1849,78460,64,0,0,1,0,0,0,0,0,1,0,0,1,0,0
7033,0,70,0,0,0,1,0,1,0,1,0,1,0,0,0,0


In [89]:
#removing non-significant variables and running the new model
#reduction from 50 to 16 variables
#R2, MSE and MAE remained almost unchanged so the quality of the model remains and the number of variables decreased a lot
# NOTE(review): metrics below are computed on the training rows (in-sample).

X_sig = sm.add_constant(X_sig)
model = sm.OLS(Y, X_sig).fit()

# Score the unrounded predictions; only the reported numbers are rounded.
ypred = model.predict(X_sig)
mse_exact = mean_squared_error(Y, ypred)
r2 = round(r2_score(Y, ypred), 3)
mse = round(mse_exact, 3)
rmse = round(math.sqrt(mse_exact), 3)
mae = round(mean_absolute_error(Y, ypred), 3)
print("R2:",r2,"MSE:",mse,"RMSE:",rmse,"MAE:",mae)

R2: 0.767 MSE: 19521.292 RMSE: 139.719 MAE: 95.078


In [90]:
#trying standardization of the numeric columns
# (the models above were fitted on the raw, unscaled numeric columns)

from sklearn.preprocessing import StandardScaler

# mkt_numerical was never defined at notebook level (the numeric split only
# exists as a local inside clean_and_process), so build it here from the
# already-renamed raw frame.
mkt_numerical = mkt.select_dtypes(include=[np.number])

# Every numeric column is standardized, including the target
# total_claim_amount, so it ends up in standard-deviation units.
scaler = StandardScaler()
stand_data = pd.DataFrame(
    scaler.fit_transform(mkt_numerical),
    columns=mkt_numerical.columns,
    index=mkt_numerical.index,  # keep row labels aligned with mkt / mkt_data
)
stand_data.head(3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.42225,-0.822648,-0.16964
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,-0.42225,2.10616,2.400737
2,0.710636,0.36571,0.429596,0.288205,-0.36068,-0.42225,-0.404247,0.455734


In [91]:
# dummy_data only exists as a local inside clean_and_process, so recover the
# one-hot columns from mkt_data: they are everything except the numeric
# columns that were just standardized.
dummy_data = mkt_data.drop(columns=stand_data.columns)
mkt_data2 = pd.concat([stand_data, dummy_data], axis=1)

In [92]:
#standardized model did not improve the fit: R2 is essentially the same as the unscaled full model
# The target was standardized too, so MSE / RMSE / MAE below are in
# standard-deviation units and cannot be compared with the earlier values.
# NOTE(review): metrics below are computed on the training rows (in-sample).

from sklearn.model_selection import train_test_split

# Fixed random_state so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    mkt_data2.drop(['total_claim_amount'], axis=1),
    mkt_data2['total_claim_amount'],
    test_size=0.2,
    random_state=42,
)

Y = y_train
X = sm.add_constant(X_train)
model = sm.OLS(Y, X).fit()

# Score the unrounded predictions: with a target of roughly unit scale,
# rounding predictions to 2 decimals before scoring is a visible distortion.
ypred = model.predict(X)
mse_exact = mean_squared_error(Y, ypred)
r2 = round(r2_score(Y, ypred), 3)
mse = round(mse_exact, 3)
rmse = round(math.sqrt(mse_exact), 3)
mae = round(mean_absolute_error(Y, ypred), 3)
print("R2:",r2,"MSE:",mse,"RMSE:",rmse,"MAE:",mae)

R2: 0.773 MSE: 0.226 RMSE: 0.475 MAE: 0.327
