## 00 |

The objective of this **project** is to define the profile of the most profitable customers, based on a collection of **9.134** customer data, including demographics and buying behaviour.

In [1]:
# üìö Basic libraries
import os # file managment
import pandas as pd # data manipulation
import numpy as np # numerical operations
import math # math in-build
import matplotlib.pyplot as plt # 2D visualization
import seaborn as sns # HD visualization
import warnings # warning messages managment

# ü§ñ Machine Learning
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.linear_model import LinearRegression # logistic model
from sklearn.linear_model import ElasticNet, Lasso, Ridge # Regression model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # model evaluation metrics

# ‚öôÔ∏è Settings
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore') # ignore warnings

# üîÑ Functions
import sys # system path to our functions
sys.path.append("C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_4_py/lab-cleaning-numerical-data")

from easy.functions import open_data # quick data overview
from easy.functions import snake_columns # snake_case
from easy.functions import explore_data # checks for duplicates, NaN & empty spaces

In [2]:
file_path = os.path.join("C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_4_py/lab-cleaning-numerical-data/01_data/data_c.csv")
data_c = pd.read_csv(file_path)

## 02 | Data Cleaning

In [3]:
snake_columns(data_c)

Unnamed: 0,unnamed:_0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,marital_status,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size


In [4]:
explore_data(data_c)

There are 0 duplicate rows. Also;


Unnamed: 0,EmptySpaces,NaN,Null
unnamed:_0,0,0,0
customer,0,0,0
state,0,0,0
customer_lifetime_value,0,0,0
response,0,0,0
coverage,0,0,0
education,0,0,0
effective_to_date,0,0,0
employmentstatus,0,0,0
gender,0,0,0


In [5]:
# Ok, nothing to do here. 
# Moving on!

### Selecting Numericals

In [6]:
n = data_c.select_dtypes(include=np.number)
n.sample(3)

Unnamed: 0,unnamed:_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
1795,1795,2255.312942,58289,62,18,92,4,1,297.6
7952,7952,4167.994667,0,69,22,98,3,8,433.959806
1945,1945,17476.64139,45106,73,22,21,0,2,350.4


In [7]:
# All good. Nothing to do here until pre-processing
# We will use `total_claim_amount` as the target (already in the right)

### Selecting Categoricals

In [8]:
c = data_c.select_dtypes(exclude=np.number)
c.sample(3)

Unnamed: 0,customer,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
3060,KU14261,Arizona,No,Extended,Master,2/4/11,Employed,F,Urban,Married,Personal Auto,Personal L1,Offer1,Call Center,Four-Door Car,Medsize
2895,XQ74827,Arizona,No,Extended,High School or Below,2/1/11,Employed,F,Suburban,Single,Special Auto,Special L3,Offer1,Agent,Luxury Car,Medsize
4405,WJ88394,Nevada,No,Basic,College,1/29/11,Medical Leave,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Branch,SUV,Medsize


### Dropping unnecessary features

In [9]:
# We will drop `customer` since it's an ID
c = c.drop(['customer'], axis=1)

### Encoding Categoricals
* We will count `unique` for each feature.
* **If** it follows an hierarchy, ordinal encoding. **Elif**, manual encoding. **Elif** (too many uniques), get dummies. **Else** (dates), transform it to a datetime object and then create new columns for `day`, `month` & `year`

In [10]:
# One by one, we will check unique values to encode them manually if it's necessary
c['response'].unique()

array(['No', 'Yes'], dtype=object)

In [11]:
binary = {'No' : 0, 'Yes' : 1}
c['response'].replace(binary, inplace=True)

In [12]:
c['coverage'].unique()

array(['Basic', 'Extended', 'Premium'], dtype=object)

In [13]:
# In this case, ordinal encoding. Premium > Extended > Basic
ordinal = {'Basic' : 0, 'Extended' : 1, 'Premium' : 2}
c['coverage'].replace(ordinal, inplace=True)

In [14]:
c['education'].unique()

array(['Bachelor', 'College', 'Master', 'High School or Below', 'Doctor'],
      dtype=object)

In [15]:
# Then again, ordinal. Doctor > Master > College > Bachelor > High School or Below
ordinal = {'High School or Below' : 0, 'Bachelor' : 1, 'College' : 2, 'Master' : 3, 'Doctor' : 4}
c['education'].replace(ordinal, inplace=True)

In [16]:
# Dates are complex. First, we will change it to datetime format
c['effective_to_date'] = c['effective_to_date'].astype('datetime64[ns]')

In [17]:
c['year'] = c['effective_to_date'].dt.year
c['month'] = c['effective_to_date'].dt.month
c['day'] = c['effective_to_date'].dt.day

In [18]:
c.head(3) # To see the changes

Unnamed: 0,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size,year,month,day
0,Washington,0,0,1,2011-02-24,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize,2011,2,24
1,Arizona,0,1,1,2011-01-31,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize,2011,1,31
2,Nevada,0,2,1,2011-02-19,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize,2011,2,19


In [19]:
# We then drop `effective_to_date`
c = c.drop(['effective_to_date'], axis=1)

In [20]:
c.head(3)

Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size,year,month,day
0,Washington,0,0,1,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize,2011,2,24
1,Arizona,0,1,1,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize,2011,1,31
2,Nevada,0,2,1,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize,2011,2,19


In [21]:
# Next, employmentstatus:
c['employmentstatus'].unique() # In this case, we will use get_dummies, since we don't want to represent a hierarchy

array(['Employed', 'Unemployed', 'Medical Leave', 'Disabled', 'Retired'],
      dtype=object)

In [22]:
c['gender'].unique() # We have two genders in this dataset, so get_dummies

array(['F', 'M'], dtype=object)

In [23]:
c['location_code'].unique() # Again, we don't want to show any hierarchy so we will use get_dummies

array(['Suburban', 'Rural', 'Urban'], dtype=object)

In [24]:
c['marital_status'].unique() # get_dummies

array(['Married', 'Single', 'Divorced'], dtype=object)

In [25]:
c['policy_type'].unique()

array(['Corporate Auto', 'Personal Auto', 'Special Auto'], dtype=object)

In [26]:
# Then again, hierarchy. Special Auto > Corporate Auto > Personal Auto
ordinal = {'Personal Auto' : 0, 'Corporate Auto' : 1, 'Special Auto' : 2}
c['policy_type'].replace(ordinal, inplace=True)

In [27]:
c['policy'].unique() # get_dummies

array(['Corporate L3', 'Personal L3', 'Corporate L2', 'Personal L1',
       'Special L2', 'Corporate L1', 'Personal L2', 'Special L1',
       'Special L3'], dtype=object)

In [28]:
# Then again, hierarchy. Special L3 > Special L2 > Special L1 > Corporate L3 > Corporate L2 > Corporate L1 > Personal L3 > Personal L2 > Personal L1
ordinal = {'Personal L1' : 0, 'Personal L2' : 1, 'Personal L3': 2, 'Corporate L1' : 3, 'Corporate L2' : 4, 'Corporate L3' : 5, 'Special L1' : 6, 'Special L2' : 7, 'Special L3' : 8}
c['policy'].replace(ordinal, inplace=True)

In [29]:
c['renew_offer_type'].unique() # get_dummies, we don't know the hierarchy of the offers

array(['Offer1', 'Offer3', 'Offer2', 'Offer4'], dtype=object)

In [30]:
c['sales_channel'].unique() # get_dummies

array(['Agent', 'Call Center', 'Web', 'Branch'], dtype=object)

In [31]:
c['vehicle_class'].unique() # There is a clear hierarchy Luxury > Sports but not with the others. We will use get_dummies

array(['Two-Door Car', 'Four-Door Car', 'SUV', 'Luxury SUV', 'Sports Car',
       'Luxury Car'], dtype=object)

In [32]:
c['vehicle_size'].unique()

array(['Medsize', 'Small', 'Large'], dtype=object)

In [33]:
ordinal = {'Small' : 0, 'Medsize' : 1, 'Large': 2}
c['vehicle_size'].replace(ordinal, inplace=True)

In [34]:
# We now select all our categoricals encoded before applying get dummies
c_n = c.select_dtypes(include = np.number)
c_n.head(3)

Unnamed: 0,response,coverage,education,policy_type,policy,vehicle_size,year,month,day
0,0,0,1,1,5,1,2011,2,24
1,0,1,1,0,2,1,2011,1,31
2,0,2,1,0,2,1,2011,2,19


In [35]:
# We concat them to check it with a correlation matrix (we have ordinal categoricals, so it makes sense)
X_N = pd.concat([c_n, n], axis=1) # we concat them with our numerical values, target at our righ
X_N.head(3)

Unnamed: 0,response,coverage,education,policy_type,policy,vehicle_size,year,month,day,unnamed:_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,0,0,1,1,5,1,2011,2,24,0,2763.519279,56274,69,32,5,0,1,384.811147
1,0,1,1,0,2,1,2011,1,31,1,6979.535903,0,94,13,42,0,8,1131.464935
2,0,2,1,0,2,1,2011,2,19,2,12887.43165,48767,108,18,38,0,2,566.472247


In [36]:
# Saved for later
X_N.to_csv('C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_4_py/lab-cleaning-numerical-data/01_data/X_N.csv')

In [37]:
# Now again, we select only categoricals to encode them with get_dummies
c  = c.select_dtypes(exclude = np.number)
c.head(3)

Unnamed: 0,state,employmentstatus,gender,location_code,marital_status,renew_offer_type,sales_channel,vehicle_class
0,Washington,Employed,F,Suburban,Married,Offer1,Agent,Two-Door Car
1,Arizona,Unemployed,F,Suburban,Single,Offer3,Agent,Four-Door Car
2,Nevada,Employed,F,Suburban,Married,Offer1,Agent,Two-Door Car


In [38]:
# Now, get_dummies
c_dumm = pd.get_dummies(c, drop_first=False)
c_dumm.sample(5)

Unnamed: 0,state_Arizona,state_California,state_Nevada,state_Oregon,state_Washington,employmentstatus_Disabled,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,gender_F,gender_M,location_code_Rural,location_code_Suburban,location_code_Urban,marital_status_Divorced,marital_status_Married,marital_status_Single,renew_offer_type_Offer1,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
8608,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
2469,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0
4827,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
629,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
2038,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0


In [39]:
c_dumm.to_csv('C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_4_py/lab-cleaning-numerical-data/01_data/c_dumm.csv')