In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw marketing dataset, then build two column views split by dtype.
df = pd.read_csv('marketing_customer_analysis.csv')
numerical = df.select_dtypes(include=[np.number])    # numeric columns only
categoricals = df.select_dtypes(include=[object])    # object-dtype columns only

In [13]:
# Names of the object-dtype columns selected above.
categoricals.columns

Index(['Customer', 'State', 'Response', 'Coverage', 'Education',
       'Effective To Date', 'EmploymentStatus', 'Gender', 'Location Code',
       'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type',
       'Sales Channel', 'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [7]:
# Full column list of the raw frame, used to pick the features and the target.
df.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount',
       'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [9]:
# Target is the claim amount; the features are every other column except
# 'Gender', which this analysis leaves out.
target_column = 'Total Claim Amount'
y = df[target_column]
X = df.drop(columns=['Gender', target_column])

In [10]:
from sklearn.model_selection import train_test_split

In [14]:
# One-hot encode every categorical feature with pandas.get_dummies.
# NOTE(review): 'Customer' is part of this list and the encoded frame ends up
# with more columns than rows, so it looks like a near-unique identifier.
# Confirm, and consider excluding it: one dummy column per row lets a linear
# model memorise the training set.
onehot_columns = [
    'Customer', 'State', 'Response', 'Coverage', 'Education',
    'Effective To Date', 'EmploymentStatus', 'Location Code',
    'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type',
    'Sales Channel', 'Vehicle Class', 'Vehicle Size',
]
X_clean = pd.get_dummies(X, columns=onehot_columns)

In [16]:
# 80/20 train/test split of the one-hot encoded features; the fixed seed keeps
# the partition reproducible.
split_options = {'train_size': 0.8, 'random_state': 419}
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, **split_options)

In [17]:
# Print the shape of the encoded features, the target and each split so that
# row counts (features vs. target) and column counts (train vs. test) can be
# compared at a glance. The previous cell body (`X_clean.columns`) did not
# match the shapes recorded as this cell's output.
for part in (X_clean, X_train, X_test, y, y_train, y_test):
    print(part.shape)

(9134, 9255)
(7307, 9255)
(1827, 9255)
(9134,)
(7307,)
(1827,)


In [18]:
# Row labels of the feature frame; compare with y.index to check alignment.
X.index

RangeIndex(start=0, stop=9134, step=1)

In [19]:
# Row labels of the target; compare with X.index to check alignment.
y.index

RangeIndex(start=0, stop=9134, step=1)

In [21]:
from sklearn.linear_model import LinearRegression

In [23]:
# Ordinary least squares on the one-hot encoded training split.
linreg = LinearRegression()
linreg = linreg.fit(X_train, y_train)  # fit() hands back the fitted estimator

In [24]:
from sklearn import metrics

In [25]:
# Predictions of the one-hot model on both splits (train first, then test).
predicciones_train, predicciones_test = (
    linreg.predict(features) for features in (X_train, X_test)
)

In [26]:
# R^2 on the training split, then on the test split.
for y_true, y_pred in ((y_train, predicciones_train), (y_test, predicciones_test)):
    print(metrics.r2_score(y_true, y_pred))

1.0
0.7575334719555136


In [27]:
# Mean absolute error on the training split, then on the test split.
print(
    metrics.mean_absolute_error(y_train, predicciones_train),
    metrics.mean_absolute_error(y_test, predicciones_test),
    sep='\n',
)

4.563459984233653e-09
97.87775862095553


In [28]:
# Summary statistics of the training target; a scale reference for the error metrics.
y_train.describe()

count    7307.000000
mean      434.656430
std       289.529538
min         0.423310
25%       270.377414
50%       386.031248
75%       552.000000
max      2759.794354
Name: Total Claim Amount, dtype: float64

In [29]:
# Mean absolute percentage error (as a fraction) on train, then on test.
mape = metrics.mean_absolute_percentage_error
print(mape(y_train, predicciones_train))
print(mape(y_test, predicciones_test))

3.367398901250959e-11
1.222822653216282


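### The model is overfitted: it fits the training set perfectly (R² = 1.0, MAE ≈ 0) but does much worse on the test set (R² ≈ 0.76, MAE ≈ 98, MAPE ≈ 1.22)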

In [31]:
import pickle

In [32]:
# Serialise the fitted one-hot model to disk.
with open('modelo.pickle', mode='wb') as model_file:
    pickle.dump(linreg, model_file)

In [33]:
# Read the model back from disk. Unpickling executes arbitrary code, so only
# load pickle files from a trusted source.
with open('modelo.pickle', mode='rb') as model_file:
    modelo = pickle.load(model_file)

In [34]:
# Element-wise comparison of the original and the reloaded coefficients
# (yields one boolean per coefficient, not a single True/False).
np.equal(linreg.coef_, modelo.coef_)

array([ True,  True,  True, ...,  True,  True,  True])

In [37]:
from feature_engine.encoding import OrdinalEncoder

In [38]:
# Ordinal-encode the categorical features. With encoding_method='ordered' the
# integer assigned to each category is derived from the target passed to fit().
# NOTE(review): the encoder is fitted on every row, before the train/test
# split, so target information from the test rows reaches the features.
# NOTE(review): 'Customer' looks like a near-unique identifier; a target-ordered
# code for it would essentially be a ranking of the target itself. Confirm and
# consider dropping it before encoding.
ordinal_columns = [
    'Customer', 'State', 'Response', 'Coverage', 'Education',
    'Effective To Date', 'EmploymentStatus', 'Location Code',
    'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type',
    'Sales Channel', 'Vehicle Class', 'Vehicle Size',
]
od = OrdinalEncoder(encoding_method='ordered', variables=ordinal_columns)
od.fit(X, y)

OrdinalEncoder(variables=['Customer', 'State', 'Response', 'Coverage',
                          'Education', 'Effective To Date', 'EmploymentStatus',
                          'Location Code', 'Marital Status', 'Policy Type',
                          'Policy', 'Renew Offer Type', 'Sales Channel',
                          'Vehicle Class', 'Vehicle Size'])

In [40]:
# Replace the categorical columns with the integer codes learned by the encoder.
# NOTE(review): this rebinds X, so the string-valued frame is no longer
# available and re-running this cell would feed already-encoded data back
# into the encoder.
X = od.transform(X)

In [42]:
# Preview the first rows of the ordinal-encoded features.
X.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Income,Location Code,...,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
0,4595,2,2763.519279,0,0,3,12,0,56274,2,...,32,5,0,1,0,0,3,3,0,0
1,8854,0,6979.535903,0,1,3,6,4,0,2,...,13,42,0,8,1,2,2,3,1,0
2,7076,4,12887.43165,0,2,3,38,0,48767,2,...,18,38,0,2,1,2,3,3,0,0
3,6621,3,7645.861827,0,0,3,21,4,0,2,...,18,65,0,7,0,6,3,0,3,0
4,1233,2,2813.692575,0,0,3,1,0,43836,0,...,12,44,0,1,1,7,3,3,1,0


In [44]:
# Same 80/20 split and seed as before, now on the ordinal-encoded features.
# This rebinds X_train / X_test / y_train / y_test.
split_options = {'train_size': 0.8, 'random_state': 419}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [45]:
# Fresh, unfitted linear model for the ordinal-encoded features.
linreg_new = LinearRegression()

In [46]:
# Fit on the training split only.
linreg_new.fit(X_train, y_train)

LinearRegression()

In [56]:
# Fitted coefficients, one per column of X_train in column order.
# NOTE(review): the features are not scaled, so a small coefficient does not by
# itself mean the feature is unimportant.
linreg_new.coef_

array([ 7.61898105e-02,  9.89355434e-01, -4.92048882e-04, -1.84824650e+01,
       -5.11778790e+01,  2.60738009e+00,  2.34602560e-01,  7.78918383e+00,
       -1.34894030e-04,  1.69445848e+01,  1.58607175e+01,  3.66973764e+00,
        3.15913503e-01,  6.05324061e-02,  1.04462028e+00,  6.50729876e-01,
        9.87925415e-02,  7.26589581e-01, -5.20094987e+00,  1.43879735e+00,
       -3.29181244e+01,  9.58451327e-01])

In [47]:
# Predictions of the ordinal-encoded model on both splits (train, then test).
predicciones_train, predicciones_test = (
    linreg_new.predict(features) for features in (X_train, X_test)
)

In [48]:
# R^2 on the training split, then on the test split.
for y_true, y_pred in ((y_train, predicciones_train), (y_test, predicciones_test)):
    print(metrics.r2_score(y_true, y_pred))

0.8608022786074015
0.8488610132180682


In [49]:
# Mean absolute error on the training split, then on the test split.
print(
    metrics.mean_absolute_error(y_train, predicciones_train),
    metrics.mean_absolute_error(y_test, predicciones_test),
    sep='\n',
)

64.82592116273045
66.39616608292009


In [51]:
# Summary statistics of the training target; a scale reference for the error metrics.
y_train.describe()

count    7307.000000
mean      434.656430
std       289.529538
min         0.423310
25%       270.377414
50%       386.031248
75%       552.000000
max      2759.794354
Name: Total Claim Amount, dtype: float64

In [53]:
# Summary statistics of the ordinal-encoded training features.
X_train.describe()


Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Income,Location Code,...,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
count,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,...,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0
mean,4578.715615,1.872998,7995.750433,0.143287,0.483509,2.717121,28.943342,1.237033,37777.477077,1.439168,...,15.038867,48.436568,0.390311,2.97304,0.823594,3.108663,1.868482,1.715889,1.37991,0.488025
std,2645.00574,1.322288,6836.349505,0.35039,0.657535,1.083452,16.878661,1.733056,30386.436616,0.795184,...,10.039103,27.679161,0.918737,2.394494,0.474019,2.12244,1.081829,1.161959,1.168841,0.797246
min,2.0,0.0,1898.683686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2270.5,1.0,4015.066488,0.0,0.0,2.0,14.0,0.0,0.0,1.0,...,6.0,25.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0
50%,4599.0,2.0,5771.610372,0.0,0.0,3.0,29.0,0.0,34133.0,2.0,...,14.0,48.0,0.0,2.0,1.0,2.0,2.0,2.0,1.0,0.0
75%,6888.0,3.0,8971.393809,0.0,1.0,4.0,43.0,4.0,62450.0,2.0,...,23.0,71.0,0.0,4.0,1.0,4.0,3.0,3.0,2.0,1.0
max,9132.0,4.0,83325.38119,1.0,2.0,4.0,58.0,4.0,99961.0,2.0,...,35.0,99.0,5.0,9.0,2.0,8.0,3.0,3.0,5.0,2.0


In [60]:
# Column order of the training frame; the entries of linreg_new.coef_ follow it.
X_train.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Income',
       'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Vehicle Class',
       'Vehicle Size'],
      dtype='object')

In [50]:
# Mean absolute percentage error (as a fraction) on train, then on test.
mape = metrics.mean_absolute_percentage_error
print(mape(y_train, predicciones_train))
print(mape(y_test, predicciones_test))

0.379198241361757
0.5914341737063215


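## The prediction errors on the test set went down, and the train and test scores are now close to each other, so the model no longer seems to be overfitted. Caveat: the ordinal encoder was fitted on the whole dataset using the target, before the train/test split, so these test scores are probably optimistic.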

## Some features have a coefficient whose absolute value is below 0.1; three of them ('Customer', 'Customer Lifetime Value' and 'Months Since Policy Inception') are removed before the next fit:

In [62]:
# Show the encoded features again before choosing which columns to drop.
X.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Income,Location Code,...,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
0,4595,2,2763.519279,0,0,3,12,0,56274,2,...,32,5,0,1,0,0,3,3,0,0
1,8854,0,6979.535903,0,1,3,6,4,0,2,...,13,42,0,8,1,2,2,3,1,0
2,7076,4,12887.43165,0,2,3,38,0,48767,2,...,18,38,0,2,1,2,3,3,0,0
3,6621,3,7645.861827,0,0,3,21,4,0,2,...,18,65,0,7,0,6,3,0,3,0
4,1233,2,2813.692575,0,0,3,1,0,43836,0,...,12,44,0,1,1,7,3,3,1,0


In [77]:
# Reduced feature set: remove three of the columns whose fitted coefficient was
# below 0.1 in absolute value.
low_coefficient_columns = [
    'Customer',
    'Customer Lifetime Value',
    'Months Since Policy Inception',
]
X_small = X.drop(columns=low_coefficient_columns)

In [78]:
# Same 80/20 split and seed, now on the reduced feature set.
# This rebinds X_train / X_test / y_train / y_test.
split_options = {'train_size': 0.8, 'random_state': 419}
X_train, X_test, y_train, y_test = train_test_split(X_small, y, **split_options)

In [79]:
# Fresh, unfitted linear model for the reduced feature set.
linreg_small = LinearRegression()

In [80]:
# Fit on the training split only.
linreg_small.fit(X_train, y_train)

LinearRegression()

In [81]:
# Fitted coefficients of the reduced model, one per remaining column of X_train.
linreg_small.coef_

array([ 1.87504867e+00, -3.51555207e+01, -1.15768989e+01,  6.65577386e+00,
        2.78834037e-01,  1.74812408e+01, -2.38773389e-04,  1.87072844e+02,
        3.31862728e+01,  5.24633977e+00,  4.31634037e-01, -1.43080490e+00,
        1.15350693e+00,  4.12508832e-01,  1.27945899e+00, -8.40462535e+00,
        3.41692651e-02, -6.06306500e+00,  5.41401693e+00])

In [83]:
# Predictions of the reduced model on both splits (train, then test).
predicciones_train, predicciones_test = (
    linreg_small.predict(features) for features in (X_train, X_test)
)

In [84]:
# R^2 on the training split, then on the test split.
for y_true, y_pred in ((y_train, predicciones_train), (y_test, predicciones_test)):
    print(metrics.r2_score(y_true, y_pred))

0.7685273948697142
0.7591843379514913


In [85]:
# Mean absolute error on the training split, then on the test split.
print(
    metrics.mean_absolute_error(y_train, predicciones_train),
    metrics.mean_absolute_error(y_test, predicciones_test),
    sep='\n',
)

96.93381162858314
98.53119356408715


In [86]:
# Summary statistics of the training target; a scale reference for the error metrics.
y_train.describe()

count    7307.000000
mean      434.656430
std       289.529538
min         0.423310
25%       270.377414
50%       386.031248
75%       552.000000
max      2759.794354
Name: Total Claim Amount, dtype: float64

## R² is smaller and the mean absolute error is greater than in the previous fit, so removing these features made the model worse.