In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

In [2]:
# Read the marketing/customer CSV into `data` and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="marketing_customer_analysis.csv")
data.head(n=5)

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [3]:
# Normalise every column label to lower case so later cells can rely on a
# single spelling; `cols` keeps the resulting list for display.
cols = [label.lower() for label in data.columns]
data.columns = cols
cols

['customer',
 'state',
 'customer lifetime value',
 'response',
 'coverage',
 'education',
 'effective to date',
 'employmentstatus',
 'gender',
 'income',
 'location code',
 'marital status',
 'monthly premium auto',
 'months since last claim',
 'months since policy inception',
 'number of open complaints',
 'number of policies',
 'policy type',
 'policy',
 'renew offer type',
 'sales channel',
 'total claim amount',
 'vehicle class',
 'vehicle size']

In [4]:
# Keep only the non-numeric columns, then discard 'customer' and
# 'effective to date' (presumably because an identifier and a date have too
# many distinct values to one-hot encode -- confirm).
categorical = data.select_dtypes(exclude=np.number).drop(
    columns=['customer', 'effective to date']
)
categorical.head()

Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location code,marital status,policy type,policy,renew offer type,sales channel,vehicle class,vehicle size
0,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize


In [5]:
# Summary statistics of the numeric columns, transposed to one row per column.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer lifetime value,9134.0,8004.940475,6870.967608,1898.007675,3994.251794,5780.182197,8962.167041,83325.38119
income,9134.0,37657.380009,30379.904734,0.0,0.0,33889.5,62320.0,99981.0
monthly premium auto,9134.0,93.219291,34.407967,61.0,68.0,83.0,109.0,298.0
months since last claim,9134.0,15.097,10.073257,0.0,6.0,14.0,23.0,35.0
months since policy inception,9134.0,48.064594,27.905991,0.0,24.0,48.0,71.0,99.0
number of open complaints,9134.0,0.384388,0.910384,0.0,0.0,0.0,0.0,5.0
number of policies,9134.0,2.96617,2.390182,1.0,1.0,2.0,4.0,9.0
total claim amount,9134.0,434.088794,290.500092,0.099007,272.258244,383.945434,547.514839,2893.239678


In [6]:
# Numeric block of the data set; note it also contains the regression target,
# 'total claim amount'.
numerical = data.select_dtypes(include=[np.number])
numerical.head()

Unnamed: 0,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,total claim amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.43165,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879


In [7]:
# Count missing values per numeric column.
numerical.isnull().sum()

customer lifetime value          0
income                           0
monthly premium auto             0
months since last claim          0
months since policy inception    0
number of open complaints        0
number of policies               0
total claim amount               0
dtype: int64

In [8]:
# from sklearn.preprocessing import MinMaxScaler
# kimera = MinMaxScaler().fit(numerical)
# x_normalized = kimera.transform(numerical)
# print(x_normalized.shape)
# x_normalized = pd.DataFrame(x_normalized,columns=numerical.columns)
# x_normalized.head()

In [9]:
# Standardise the numeric columns with StandardScaler instead of MinMaxScaler.
# NOTE(review): the scaler is fitted on every row and on every numeric column,
# 'total claim amount' included, before the train/test split further down --
# so the target itself is scaled and the test rows influence the scaling.
from sklearn.preprocessing import StandardScaler
kimera = StandardScaler()
x_standardized = kimera.fit_transform(numerical)
print(x_standardized.shape)
x_standardized = pd.DataFrame(data=x_standardized, columns=numerical.columns)
x_standardized.head()

(9134, 8)


Unnamed: 0,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,total claim amount
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.42225,-0.822648,-0.16964
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,-0.42225,2.10616,2.400737
2,0.710636,0.36571,0.429596,0.288205,-0.36068,-0.42225,-0.404247,0.455734
3,-0.052263,-1.239617,0.371467,0.288205,0.606907,-0.42225,1.687759,0.329769
4,-0.755575,0.20339,-0.587666,-0.307465,-0.145661,-0.42225,-0.822648,-1.018843


In [53]:
# #USE LOG 10
# def kimera(x):
#     x = np.log10(x)
#     if np.isfinite(x):
#         return x
#     else:
#         return 0

# for cols in numerical.columns:
#     numerical[cols] = numerical[cols].apply(lambda x : kimera(x))
# numerical.head()

In [10]:
# One-hot encode every categorical column ('policy' alone contributes nine levels).
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder().fit(categorical)

# encoder.categories_ is a list with one array of levels per input column
# (not a nested array); flatten it so each output column is labelled with its
# category value.
# NOTE(review): bare category values are only safe as labels while no value is
# shared by two source columns; encoder.get_feature_names_out() would prefix
# each label with its source column if that ever stops holding.
final_cols = [level for levels in encoder.categories_ for level in levels]
print(final_cols)

# A single transform is enough; .toarray() turns the encoder output into a
# dense array that pandas can wrap.
encoded = encoder.transform(categorical).toarray()
onehot_encoded = pd.DataFrame(encoded, columns=final_cols)
onehot_encoded.head()

['Arizona', 'California', 'Nevada', 'Oregon', 'Washington', 'No', 'Yes', 'Basic', 'Extended', 'Premium', 'Bachelor', 'College', 'Doctor', 'High School or Below', 'Master', 'Disabled', 'Employed', 'Medical Leave', 'Retired', 'Unemployed', 'F', 'M', 'Rural', 'Suburban', 'Urban', 'Divorced', 'Married', 'Single', 'Corporate Auto', 'Personal Auto', 'Special Auto', 'Corporate L1', 'Corporate L2', 'Corporate L3', 'Personal L1', 'Personal L2', 'Personal L3', 'Special L1', 'Special L2', 'Special L3', 'Offer1', 'Offer2', 'Offer3', 'Offer4', 'Agent', 'Branch', 'Call Center', 'Web', 'Four-Door Car', 'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car', 'Two-Door Car', 'Large', 'Medsize', 'Small']


Unnamed: 0,Arizona,California,Nevada,Oregon,Washington,No,Yes,Basic,Extended,Premium,...,Web,Four-Door Car,Luxury Car,Luxury SUV,SUV,Sports Car,Two-Door Car,Large,Medsize,Small
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [54]:
# Place the scaled numeric columns and the one-hot columns side by side;
# both frames share the same default row index.
new_data = pd.concat(objs=[x_standardized, onehot_encoded], axis="columns")
new_data.head()


Unnamed: 0,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,total claim amount,Arizona,California,...,Web,Four-Door Car,Luxury Car,Luxury SUV,SUV,Sports Car,Two-Door Car,Large,Medsize,Small
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.42225,-0.822648,-0.16964,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,-0.42225,2.10616,2.400737,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.710636,0.36571,0.429596,0.288205,-0.36068,-0.42225,-0.404247,0.455734,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.052263,-1.239617,0.371467,0.288205,0.606907,-0.42225,1.687759,0.329769,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.755575,0.20339,-0.587666,-0.307465,-0.145661,-0.42225,-0.822648,-1.018843,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# List every column of the combined frame (scaled numeric + one-hot columns).
new_data.columns

Index(['customer lifetime value', 'income', 'monthly premium auto',
       'months since last claim', 'months since policy inception',
       'number of open complaints', 'number of policies', 'total claim amount',
       'Arizona', 'California', 'Nevada', 'Oregon', 'Washington', 'No', 'Yes',
       'Basic', 'Extended', 'Premium', 'Bachelor', 'College', 'Doctor',
       'High School or Below', 'Master', 'Disabled', 'Employed',
       'Medical Leave', 'Retired', 'Unemployed', 'F', 'M', 'Rural', 'Suburban',
       'Urban', 'Divorced', 'Married', 'Single', 'Corporate Auto',
       'Personal Auto', 'Special Auto', 'Corporate L1', 'Corporate L2',
       'Corporate L3', 'Personal L1', 'Personal L2', 'Personal L3',
       'Special L1', 'Special L2', 'Special L3', 'Offer1', 'Offer2', 'Offer3',
       'Offer4', 'Agent', 'Branch', 'Call Center', 'Web', 'Four-Door Car',
       'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car', 'Two-Door Car',
       'Large', 'Medsize', 'Small'],
      dtype='object')

In [55]:
# Separate the target ('total claim amount') from the predictors and hold out
# 20% of the rows for testing; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
y = new_data['total claim amount']
X = new_data.drop(columns=['total claim amount'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [56]:
# Sanity-check the split sizes: predictors first, then targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(7307, 64)
(1827, 64)
(7307,)
(1827,)


In [57]:
# Fit an ordinary least-squares regression on the training split.
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)

In [58]:
# R^2 on the training split (in-sample fit).
from sklearn.metrics import r2_score
predictions = lm.predict(X=X_train)
r2_score(y_true=y_train, y_pred=predictions)

0.7743062746424321

In [59]:
# R^2 on the held-out split; `predictions` is overwritten with the test-set
# predictions and reused by the metric cells below.
predictions = lm.predict(X=X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.7626491314135815

In [60]:
# Show the test-set predictions; they are on the standardised target scale
# because y was taken from the scaled frame.
predictions

array([-0.83459473,  0.09985352,  0.06469727, ..., -0.80615234,
        0.30432129,  0.74847412])

In [65]:
from sklearn.metrics import mean_squared_error, r2_score

In [67]:
# Mean squared error on the test split, in squared units of the standardised target.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(mse)

0.2236522467219877


In [68]:
# NOTE(review): this cell repeats the previous one verbatim and recomputes the
# same test-set MSE; it can be removed.
mse = mean_squared_error(y_test, predictions)
print(mse)

0.2236522467219877


In [69]:
# Root mean squared error: the square root of the test-set MSE computed above.
rmse = math.sqrt(mse)
print(rmse)

0.4729188584968754


In [70]:
# Keep the test-set R^2 in `r2` (same value as the evaluation cell above).
r2 = r2_score(y_true=y_test, y_pred=predictions)
r2

0.7626491314135815

In [None]:
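#FIRST CONCLUSION
#THE MSE VALUES ARE VERY DIFFERENT (18871.7 VS 0.223). PRESUMABLY 18871.7 COMES FROM THE RUN ON THE
#UNSCALED TARGET: HERE THE TARGET IS STANDARDIZED, SO THE MSE IS IN SQUARED STANDARD-DEVIATION UNITS
#(0.2237 * 290.5**2 IS ABOUT 18.9K, I.E. THE SAME ERROR ON A DIFFERENT SCALE)
#THE R2 SCORES ARE SIMILAR BECAUSE R2 DOES NOT DEPEND ON THE SCALE OF THE TARGET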

In [None]:
#REDUCING THE NUMBER OF COLUMNS IN THE CATEGORICAL DATAFRAME

In [72]:
# List the categorical columns before choosing which ones to keep.
#categorical = categorical.drop([''], axis = 1)
categorical.columns

Index(['state', 'response', 'coverage', 'education', 'employmentstatus',
       'gender', 'location code', 'marital status', 'policy type', 'policy',
       'renew offer type', 'sales channel', 'vehicle class', 'vehicle size'],
      dtype='object')

In [73]:
# Keep eleven of the categorical columns; 'state', 'sales channel' and
# 'vehicle class' are left out of the reduced frame.
categorical = categorical.loc[:, [
    'response', 'coverage', 'education', 'employmentstatus',
    'gender', 'location code', 'marital status', 'policy type', 'policy',
    'renew offer type', 'vehicle size',
]]
categorical.columns

Index(['response', 'coverage', 'education', 'employmentstatus', 'gender',
       'location code', 'marital status', 'policy type', 'policy',
       'renew offer type', 'vehicle size'],
      dtype='object')

In [76]:
# One-hot encode the reduced categorical frame.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder().fit(categorical)

# encoder.categories_ holds one array of levels per input column; flatten it
# into a single list of output column labels.
final_cols = [level for levels in encoder.categories_ for level in levels]
print(final_cols)

# One transform is enough; the first experiment's `encoded` array is left
# untouched so the two experiments do not share state.
encoded1 = encoder.transform(categorical).toarray()
onehot_encoded1 = pd.DataFrame(encoded1, columns=final_cols)
# Preview the frame built in this cell (onehot_encoded is the earlier, wider
# frame that still contains the state / sales channel / vehicle class columns).
onehot_encoded1.head()

['No', 'Yes', 'Basic', 'Extended', 'Premium', 'Bachelor', 'College', 'Doctor', 'High School or Below', 'Master', 'Disabled', 'Employed', 'Medical Leave', 'Retired', 'Unemployed', 'F', 'M', 'Rural', 'Suburban', 'Urban', 'Divorced', 'Married', 'Single', 'Corporate Auto', 'Personal Auto', 'Special Auto', 'Corporate L1', 'Corporate L2', 'Corporate L3', 'Personal L1', 'Personal L2', 'Personal L3', 'Special L1', 'Special L2', 'Special L3', 'Offer1', 'Offer2', 'Offer3', 'Offer4', 'Large', 'Medsize', 'Small']


Unnamed: 0,No,Yes,Basic,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,...,Special L1,Special L2,Special L3,Offer1,Offer2,Offer3,Offer4,Large,Medsize,Small
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [77]:
# Place the scaled numeric columns next to the reduced one-hot columns.
new_data1 = pd.concat(objs=[x_standardized, onehot_encoded1], axis="columns")
new_data1.head()

Unnamed: 0,customer lifetime value,income,monthly premium auto,months since last claim,months since policy inception,number of open complaints,number of policies,total claim amount,No,Yes,...,Special L1,Special L2,Special L3,Offer1,Offer2,Offer3,Offer4,Large,Medsize,Small
0,-0.762878,0.612827,-0.703925,1.678099,-1.543287,-0.42225,-0.822648,-0.16964,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.149245,-1.239617,0.022691,-0.208186,-0.217334,-0.42225,2.10616,2.400737,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.710636,0.36571,0.429596,0.288205,-0.36068,-0.42225,-0.404247,0.455734,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.052263,-1.239617,0.371467,0.288205,0.606907,-0.42225,1.687759,0.329769,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.755575,0.20339,-0.587666,-0.307465,-0.145661,-0.42225,-0.822648,-1.018843,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [78]:
# Rebuild predictors and target from the reduced frame. The target is read
# from new_data1 as well, so X and y come from the same frame and this cell
# does not depend on the first experiment's new_data.
from sklearn.model_selection import train_test_split
X = new_data1.drop(['total claim amount'], axis = 1)
y = new_data1['total claim amount']
# Same 80/20 split and seed as the first experiment, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# Sanity-check the split sizes for the reduced feature set.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(7307, 49)
(1827, 49)
(7307,)
(1827,)


In [80]:
# Refit ordinary least squares on the reduced feature set; `lm` and `model`
# now refer to this second model.
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)

In [81]:
# R^2 on the training split for the reduced model.
from sklearn.metrics import r2_score
predictions = lm.predict(X=X_train)
r2_score(y_true=y_train, y_pred=predictions)

0.7729339060348581

In [83]:
# R^2 on the held-out split for the reduced model; `predictions` is reused below.
predictions = lm.predict(X=X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.763988843104668

In [84]:
from sklearn.metrics import mean_squared_error, r2_score

In [85]:
# Test-set MSE of the reduced model (standardised target units, squared).
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(mse)

0.22238985601975136


In [86]:
# Keep the reduced model's test-set R^2 in `r2`.
r2 = r2_score(y_true=y_test, y_pred=predictions)
r2

0.763988843104668

In [None]:
#DROPPING THE 'STATE', 'SALES CHANNEL' AND 'VEHICLE CLASS' COLUMNS HAS LIMITED IMPACT ON MODEL ACCURACY (TEST R2 0.7626 -> 0.7640)