# Lab | Customer Analysis Round 5


## For this lab, we continue to use the `marketing_customer_analysis.csv` file, which you can find in the `files_for_lab` folder.



In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## We are using the marketing_customer_analysis.csv file.



In [2]:
# Load the lab's customer dataset; the path is relative to the notebook's
# working directory.
csv_path = 'marketing_customer_analysis.csv'
data = pd.read_csv(csv_path)

In [3]:
# Show the column labels exactly as they were read from the CSV.
data.columns

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount',
       'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [4]:
# Lower-case every column label so the rest of the notebook can refer to
# columns with one consistent spelling.
cols = [label.lower() for label in data.columns]
data.columns = cols


In [5]:
# Confirm that the column labels are now lower-case.
data.columns


Index(['customer', 'state', 'customer lifetime value', 'response', 'coverage',
       'education', 'effective to date', 'employmentstatus', 'gender',
       'income', 'location code', 'marital status', 'monthly premium auto',
       'months since last claim', 'months since policy inception',
       'number of open complaints', 'number of policies', 'policy type',
       'policy', 'renew offer type', 'sales channel', 'total claim amount',
       'vehicle class', 'vehicle size'],
      dtype='object')

## Processing Data


### X-y split.


In [6]:
# Remove the two columns that are not used as model features.
data = data.drop(columns=['customer', 'effective to date'])

# Separate the regression target from the remaining feature columns.
Y = data['customer lifetime value']
data = data.drop(columns=['customer lifetime value'])

# Split the features by dtype so each group can be pre-processed on its own.
X_num = data.select_dtypes(include=np.number)
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` selects the same string/categorical columns.
X_cat = data.select_dtypes(include=object)

### Normalize (numerical).


In [7]:
from sklearn.preprocessing import Normalizer

In [8]:
# Scale the numerical features with scikit-learn's Normalizer.
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# scaling each column independently — confirm this is the intended kind of
# normalization for these features.
transformer = Normalizer()
transformer.fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)

(9134, 7)


### One-Hot/Label Encoding (categorical).


In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# One-hot encode the categorical columns. drop='first' omits the first
# category of each feature from the encoded output, and
# handle_unknown='error' makes transform() raise on unseen categories.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(X_cat)
# List the categories learned for each column.
encoder.categories_

[array(['Arizona', 'California', 'Nevada', 'Oregon', 'Washington'],
       dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Basic', 'Extended', 'Premium'], dtype=object),
 array(['Bachelor', 'College', 'Doctor', 'High School or Below', 'Master'],
       dtype=object),
 array(['Disabled', 'Employed', 'Medical Leave', 'Retired', 'Unemployed'],
       dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Rural', 'Suburban', 'Urban'], dtype=object),
 array(['Divorced', 'Married', 'Single'], dtype=object),
 array(['Corporate Auto', 'Personal Auto', 'Special Auto'], dtype=object),
 array(['Corporate L1', 'Corporate L2', 'Corporate L3', 'Personal L1',
        'Personal L2', 'Personal L3', 'Special L1', 'Special L2',
        'Special L3'], dtype=object),
 array(['Offer1', 'Offer2', 'Offer3', 'Offer4'], dtype=object),
 array(['Agent', 'Branch', 'Call Center', 'Web'], dtype=object),
 array(['Four-Door Car', 'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car',
        'Two-Door Car'],

In [11]:
# Apply the fitted encoder, then convert its result to a dense NumPy array so
# it can be concatenated with the numerical features.
encoded_sparse = encoder.transform(X_cat)
encoded = encoded_sparse.toarray()
encoded

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

### Concatenate the numerical and encoded categorical features

In [12]:
# Join the normalized numerical block and the one-hot encoded block
# column-wise into a single feature matrix.
feature_blocks = (x_normalized, encoded)
X = np.concatenate(feature_blocks, axis=1)
# Display the target series for a quick look before modelling.
Y

0        2763.519279
1        6979.535903
2       12887.431650
3        7645.861827
4        2813.692575
            ...     
9129    23405.987980
9130     3096.511217
9131     8163.890428
9132     7524.442436
9133     2611.836866
Name: customer lifetime value, Length: 9134, dtype: float64

# Linear Regression


## Train-test split

In [13]:
# Split features and target into training and evaluation subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,     # hold out 40% of the rows for testing
    random_state=100,  # fixed seed so the split is reproducible
)

## Apply linear regression.


In [14]:
# Fit a linear regression on the training subset.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is the same fitted object as `lm`.
model = lm

In [15]:
# Predict the target for the held-out test features.
predictions = lm.predict(X_test)

# Model Validation


## MSE

In [16]:
# Mean squared error between the true and predicted test targets.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(mse)

42172190.368408084


## RMSE

In [17]:
# Root mean squared error: the square root of the MSE, which puts the error
# back in the same units as the target.
rmse = math.sqrt(mse)
print(rmse)

6494.011885453251


## R2

In [18]:
# Coefficient of determination on the test subset, reported to two decimals.
r2 = r2_score(y_true=y_test, y_pred=predictions)
r2_rounded = round(r2, 2)
print("The R2 value on the TEST set is: ", r2_rounded)

The R2 value on the TEST set is:  0.12


In [19]:
# Mean absolute error between the true and predicted test targets.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(mae)

4009.374697909066
