In [15]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 

In [16]:
data = pd.read_csv('marketing_customer_analysis.csv')
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [17]:
data.drop(['Customer','Effective To Date'], axis = 1, inplace = True)

In [18]:
categoricals = data.select_dtypes('object')

In [19]:
categoricals.head()

Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
0,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize


In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
for col in categoricals.columns:
    categoricals[col]=LabelEncoder().fit(categoricals[col]).transform(categoricals[col])

In [22]:
numerical = data._get_numeric_data()

In [23]:
numerical.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.43165,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879


In [24]:
X = pd.DataFrame(np.concatenate((numerical, categoricals), axis=1))

In [25]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,2763.519279,56274.0,69.0,32.0,5.0,0.0,1.0,384.811147,4.0,0.0,...,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,5.0,1.0
1,6979.535903,0.0,94.0,13.0,42.0,0.0,8.0,1131.464935,0.0,0.0,...,4.0,0.0,1.0,2.0,1.0,5.0,2.0,0.0,0.0,1.0
2,12887.43165,48767.0,108.0,18.0,38.0,0.0,2.0,566.472247,2.0,0.0,...,1.0,0.0,1.0,1.0,1.0,5.0,0.0,0.0,5.0,1.0
3,7645.861827,0.0,106.0,18.0,65.0,0.0,7.0,529.881344,1.0,0.0,...,4.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,1.0
4,2813.692575,43836.0,73.0,12.0,44.0,0.0,1.0,138.130879,4.0,0.0,...,1.0,1.0,0.0,2.0,1.0,3.0,0.0,0.0,0.0,1.0


In [26]:
X = numerical.drop(['Total Claim Amount'],axis = 1)
Y = numerical['Total Claim Amount']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.4,random_state=100)

In [28]:
model = sm.OLS(y_train, X_train).fit()

In [29]:
predictions = model.predict(X_test)

In [30]:
r2_score(y_test, predictions)

0.48288009054567327

In [32]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
print(mse)

41086.679470476054


In [33]:
import math

In [34]:
rmse = math.sqrt(mse)
print(rmse)

202.6984940014998


In [35]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test,predictions)
print (mae)

146.26196404048633


In [39]:
corr_matrix = numpy.corrcoef(y_test, predictions)
corr = corr_matrix[0,1]
R_sq = corr**2
 
print(R_sq)

0.4858636086829185
