In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#We will start with removing outliers. So far, we have discussed different methods to remove outliers. Use the one you
# feel more comfortable with, define a function for that. Use the function to remove the outliers and apply it to the
# dataframe.
#Create a copy of the dataframe for the data wrangling.

In [3]:
numerical = pd.read_csv('numerical.csv', index_col=0)
numerical.head()

Unnamed: 0_level_0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BU79786,56274,69,32,5,0,1
QZ44356,0,94,13,42,0,8
AI49188,48767,108,18,38,0,2
WW63253,0,106,18,65,0,7
HB64268,43836,73,12,44,0,1


In [4]:
numerical['income'].value_counts()

0        2317
95697      12
27972       7
25370       7
61108       7
         ... 
36529       1
25859       1
38179       1
70190       1
21941       1
Name: income, Length: 5694, dtype: int64

In [5]:
numerical = numerical.reset_index(drop=True)
numerical['income'] = numerical['income'].apply(lambda x: np.mean(numerical['income']) if x == 0 else x)

In [6]:
numerical['income'].value_counts()

37657.380009    2317
95697.000000      12
27972.000000       7
25370.000000       7
61108.000000       7
                ... 
36529.000000       1
25859.000000       1
38179.000000       1
70190.000000       1
21941.000000       1
Name: income, Length: 5694, dtype: int64

In [7]:
#Normalize the continuous variables. You can use any one method you want.
continuous_var  = numerical[['income', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception']]

In [8]:
from sklearn.preprocessing import Normalizer
transformer = Normalizer().fit(continuous_var)  #this reduces the impact of outliers on the data
continuous_normalized = transformer.transform(continuous_var)
continuous_normalized = pd.DataFrame(continuous_normalized, columns=continuous_var.columns)

In [9]:
continuous_normalized

Unnamed: 0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,0.999999,0.001226,0.000569,0.000089
1,0.999996,0.002496,0.000345,0.001115
2,0.999997,0.002215,0.000369,0.000779
3,0.999994,0.002815,0.000478,0.001726
4,0.999998,0.001665,0.000274,0.001004
...,...,...,...,...
9129,0.999999,0.001015,0.000250,0.001237
9130,0.999992,0.003657,0.000648,0.001296
9131,0.999997,0.002257,0.000239,0.000983
9132,0.999989,0.004375,0.001550,0.000137


In [26]:
discrete = numerical[['number_of_open_complaints','number_of_policies']]
discrete.head()

Unnamed: 0,number_of_open_complaints,number_of_policies
0,0,1
1,0,8
2,0,2
3,0,7
4,0,1


In [27]:
from pandas import concat
new_num = concat(
    [
        continuous_normalized.reset_index(drop=True),
        discrete.reset_index(drop=True)
    ],
    axis=1,
    ignore_index=True,
)

concatenated_dataframes_columns = [
    list(continuous_normalized.columns),
    list(discrete.columns)
]
    
flatten = lambda nested_lists: [item for sublist in nested_lists for item in sublist]

new_num.columns = flatten(concatenated_dataframes_columns)
new_num.head()

Unnamed: 0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,0.999999,0.001226,0.000569,8.9e-05,0,1
1,0.999996,0.002496,0.000345,0.001115,0,8
2,0.999997,0.002215,0.000369,0.000779,0,2
3,0.999994,0.002815,0.000478,0.001726,0,7
4,0.999998,0.001665,0.000274,0.001004,0,1


In [28]:
new_num.to_csv('numeric.csv', index=True)

In [10]:
#Encode the categorical variables
categorical = pd.read_csv('categorical2.csv', index_col=0)
categorical.head()

Unnamed: 0_level_0,state,response,coverage,education,effective_to_date,employment_status,gender,location_code,marital_status,policy_type,sales_channel,vehicle_class
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BU79786,Washington,No,Basic,Bachelor,2/24/11,Employed,F,Suburban,Married,Corporate Auto,Agent,Two-Door Car
QZ44356,Arizona,No,Extended,Bachelor,1/31/11,Unemployed,F,Suburban,Single,Personal Auto,Agent,Four-Door Car
AI49188,Nevada,No,Premium,Bachelor,2/19/11,Employed,F,Suburban,Married,Personal Auto,Agent,Two-Door Car
WW63253,California,No,Basic,Bachelor,1/20/11,Unemployed,M,Suburban,Married,Corporate Auto,Call Center,SUV
HB64268,Washington,No,Basic,Bachelor,2/3/11,Employed,M,Rural,Single,Personal Auto,Agent,Four-Door Car


In [11]:
use_one_hot = categorical[['state', 'marital_status','policy_type','sales_channel','vehicle_class']]

In [12]:
use_ordinal = categorical[['response','coverage','education','effective_to_date','employment_status','gender','location_code']]

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(use_one_hot)
encoded_df = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out(input_features=use_one_hot.columns))
encoded_df2 = encoded_df.reset_index(drop=True)
encoded_df2.head()

Unnamed: 0,state_Arizona,state_California,state_Nevada,state_Oregon,state_Washington,marital_status_Divorced,marital_status_Married,marital_status_Single,policy_type_Corporate Auto,policy_type_Personal Auto,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
use_ordinal.head()

Unnamed: 0_level_0,response,coverage,education,effective_to_date,employment_status,gender,location_code
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BU79786,No,Basic,Bachelor,2/24/11,Employed,F,Suburban
QZ44356,No,Extended,Bachelor,1/31/11,Unemployed,F,Suburban
AI49188,No,Premium,Bachelor,2/19/11,Employed,F,Suburban
WW63253,No,Basic,Bachelor,1/20/11,Unemployed,M,Suburban
HB64268,No,Basic,Bachelor,2/3/11,Employed,M,Rural


In [16]:
use_ordinal['coverage'] = use_ordinal['coverage'].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use_ordinal['coverage'] = use_ordinal['coverage'].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})


In [17]:
use_ordinal['response'] = use_ordinal['response'].map({"No" : 0, "Yes" : 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use_ordinal['response'] = use_ordinal['response'].map({"No" : 0, "Yes" : 1})


In [18]:
use_ordinal['gender'] = use_ordinal['gender'].map({"F" : 1, "M" : 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use_ordinal['gender'] = use_ordinal['gender'].map({"F" : 1, "M" : 0})


In [19]:
columns_to_encode = ['education', 'employment_status', 'location_code']
encoded_df = pd.get_dummies(use_ordinal[columns_to_encode], prefix=columns_to_encode)
use_ordinal.drop(columns_to_encode, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  use_ordinal.drop(columns_to_encode, axis=1, inplace=True)


In [20]:
use_ordinal = pd.concat([use_ordinal, encoded_df], axis=1)

In [21]:
use_ordinal

Unnamed: 0_level_0,response,coverage,effective_to_date,gender,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employment_status_Disabled,employment_status_Employed,employment_status_Medical Leave,employment_status_Retired,employment_status_Unemployed,location_code_Rural,location_code_Suburban,location_code_Urban
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
BU79786,0,0,2/24/11,1,1,0,0,0,0,0,1,0,0,0,0,1,0
QZ44356,0,1,1/31/11,1,1,0,0,0,0,0,0,0,0,1,0,1,0
AI49188,0,2,2/19/11,1,1,0,0,0,0,0,1,0,0,0,0,1,0
WW63253,0,0,1/20/11,0,1,0,0,0,0,0,0,0,0,1,0,1,0
HB64268,0,0,2/3/11,0,1,0,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA72316,0,0,2/10/11,0,1,0,0,0,0,0,1,0,0,0,0,0,1
PK87824,1,1,2/12/11,1,0,1,0,0,0,0,1,0,0,0,0,1,0
TD14365,0,1,2/6/11,0,1,0,0,0,0,0,0,0,0,1,0,1,0
UP19263,0,1,2/3/11,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [22]:
from pandas import concat
new_cat = concat(
    [
        use_ordinal.reset_index(drop=True),
        encoded_df2.reset_index(drop=True)
    ],
    axis=1,
    ignore_index=True,
)

concatenated_dataframes_columns = [
    list(use_ordinal.columns),
    list(encoded_df2.columns)
]
    
flatten = lambda nested_lists: [item for sublist in nested_lists for item in sublist]

new_cat.columns = flatten(concatenated_dataframes_columns)
new_cat.head()

Unnamed: 0,response,coverage,effective_to_date,gender,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employment_status_Disabled,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0,0,2/24/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,1/31/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,2/19/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,1/20/11,0,1,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0,2/3/11,0,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
categoric = new_cat
categoric.head()

Unnamed: 0,response,coverage,effective_to_date,gender,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employment_status_Disabled,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0,0,2/24/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,1/31/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,2/19/11,1,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,1/20/11,0,1,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0,2/3/11,0,1,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#The time variable can be useful. Try to transform its data into a useful one. Hint: Day week and month as integers 
# might be useful.

In [25]:
categoric['month'] = categoric['effective_to_date'].dt.month
categoric.head()

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
categoric = categoric.drop('effective_to_date', axis=1)
categoric.head()


Unnamed: 0,response,coverage,gender,education_Bachelor,education_College,education_Doctor,education_High School or Below,education_Master,employment_status_Disabled,employment_status_Employed,...,sales_channel_Agent,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Four-Door Car,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0,0,1,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,1,1,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,2,1,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0,0,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, 
# change it using encoding.

In [None]:
categoric.dtypes

response                             int64
coverage                             int64
gender                               int64
education_Bachelor                   uint8
education_College                    uint8
education_Doctor                     uint8
education_High School or Below       uint8
education_Master                     uint8
employment_status_Disabled           uint8
employment_status_Employed           uint8
employment_status_Medical Leave      uint8
employment_status_Retired            uint8
employment_status_Unemployed         uint8
location_code_Rural                  uint8
location_code_Suburban               uint8
location_code_Urban                  uint8
week                                 int64
month                                int64
state_Arizona                      float64
state_California                   float64
state_Nevada                       float64
state_Oregon                       float64
state_Washington                   float64
marital_sta

In [None]:
categoric.to_csv('categoric.csv', index=True)