In [1]:
# Lab | Data cleaning and wrangling

In [2]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import getpass
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder

In [3]:
categoricals = pd.read_csv('categorical.csv')
categoricals

Unnamed: 0.1,Unnamed: 0,State,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size,policy_number,Year
0,0,Washington,No,Basic,Bachelor,2011-02-24,Employed,F,Suburban,Married,Corporate Auto,1,Agent,Two-Door Car,Medsize,3,2011
1,1,Arizona,No,Extended,Bachelor,2011-01-31,Unemployed,F,Suburban,Single,Personal Auto,3,Agent,Four-Door Car,Medsize,3,2011
2,2,Nevada,No,Premium,Bachelor,2011-02-19,Employed,F,Suburban,Married,Personal Auto,1,Agent,Two-Door Car,Medsize,3,2011
3,3,California,No,Basic,Bachelor,2011-01-20,Unemployed,M,Suburban,Married,Corporate Auto,1,Call Center,SUV,Medsize,2,2011
4,4,Washington,No,Basic,Bachelor,2011-02-03,Employed,M,Rural,Single,Personal Auto,1,Agent,Four-Door Car,Medsize,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,9129,California,No,Basic,Bachelor,2011-02-10,Employed,M,Urban,Married,Personal Auto,2,Web,Four-Door Car,Medsize,1,2011
9130,9130,California,Yes,Extended,College,2011-02-12,Employed,F,Suburban,Divorced,Corporate Auto,1,Branch,Four-Door Car,Medsize,3,2011
9131,9131,California,No,Extended,Bachelor,2011-02-06,Unemployed,M,Suburban,Single,Corporate Auto,1,Branch,Four-Door Car,Medsize,2,2011
9132,9132,California,No,Extended,College,2011-02-03,Employed,M,Suburban,Married,Personal Auto,3,Branch,Four-Door Car,Large,2,2011


In [4]:
numericals = pd.read_csv('numerical.csv')
numericals

Unnamed: 0.1,Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,0,2763.519279,56274,69,32,5,0,1,384.811147
1,1,6979.535903,0,94,13,42,0,8,1131.464935
2,2,12887.431650,48767,108,18,38,0,2,566.472247
3,3,7645.861827,0,106,18,65,0,7,529.881344
4,4,2813.692575,43836,73,12,44,0,1,138.130879
...,...,...,...,...,...,...,...,...,...
9129,9129,23405.987980,71941,73,18,89,0,2,198.234764
9130,9130,3096.511217,21604,79,14,28,0,1,379.200000
9131,9131,8163.890428,0,85,9,37,3,2,790.784983
9132,9132,7524.442436,21941,96,34,3,0,3,691.200000


In [5]:
# 1 - We will start with removing outliers. So far, we have discussed different methods to remove outliers. 
#Use the one you feel more comfortable with, define a function for that. Use the function to remove the outliers and 
#apply it to the dataframe.

def remove_outliers(numericals):
    #Calculate IQR
    iqr = np.percentile(numericals,75) - np.percentile(numericals,25)
    
    upper_limit1 = np.percentile(numericals,75) + iqr*1.5
    lower_limit1 = np.percentile(numericals,25) - iqr*1.5
    
    #Remove outliers from data
    numericals = numericals[(numericals > lower_limit1) & (numericals < upper_limit1)] 
    
    return numericals

In [6]:
noutliers_numericals = remove_outliers(numericals)

In [7]:
# 2 - Create a copy of the dataframe for the data wrangling.

In [8]:
data2 = pd.concat([numericals, categoricals],axis = 1)

In [9]:
data2.drop(['Unnamed: 0'],axis=1)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,State,Response,...,Gender,Location Code,Marital Status,Policy Type,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size,policy_number,Year
0,2763.519279,56274,69,32,5,0,1,384.811147,Washington,No,...,F,Suburban,Married,Corporate Auto,1,Agent,Two-Door Car,Medsize,3,2011
1,6979.535903,0,94,13,42,0,8,1131.464935,Arizona,No,...,F,Suburban,Single,Personal Auto,3,Agent,Four-Door Car,Medsize,3,2011
2,12887.431650,48767,108,18,38,0,2,566.472247,Nevada,No,...,F,Suburban,Married,Personal Auto,1,Agent,Two-Door Car,Medsize,3,2011
3,7645.861827,0,106,18,65,0,7,529.881344,California,No,...,M,Suburban,Married,Corporate Auto,1,Call Center,SUV,Medsize,2,2011
4,2813.692575,43836,73,12,44,0,1,138.130879,Washington,No,...,M,Rural,Single,Personal Auto,1,Agent,Four-Door Car,Medsize,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,0,2,198.234764,California,No,...,M,Urban,Married,Personal Auto,2,Web,Four-Door Car,Medsize,1,2011
9130,3096.511217,21604,79,14,28,0,1,379.200000,California,Yes,...,F,Suburban,Divorced,Corporate Auto,1,Branch,Four-Door Car,Medsize,3,2011
9131,8163.890428,0,85,9,37,3,2,790.784983,California,No,...,M,Suburban,Single,Corporate Auto,1,Branch,Four-Door Car,Medsize,2,2011
9132,7524.442436,21941,96,34,3,0,3,691.200000,California,No,...,M,Suburban,Married,Personal Auto,3,Branch,Four-Door Car,Large,2,2011


In [10]:
numerical2 = data2.select_dtypes (include = np.number)

In [11]:
numerical2

Unnamed: 0.2,Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,Unnamed: 0.1,Renew Offer Type,policy_number,Year
0,0,2763.519279,56274,69,32,5,0,1,384.811147,0,1,3,2011
1,1,6979.535903,0,94,13,42,0,8,1131.464935,1,3,3,2011
2,2,12887.431650,48767,108,18,38,0,2,566.472247,2,1,3,2011
3,3,7645.861827,0,106,18,65,0,7,529.881344,3,1,2,2011
4,4,2813.692575,43836,73,12,44,0,1,138.130879,4,1,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,9129,23405.987980,71941,73,18,89,0,2,198.234764,9129,2,1,2011
9130,9130,3096.511217,21604,79,14,28,0,1,379.200000,9130,1,3,2011
9131,9131,8163.890428,0,85,9,37,3,2,790.784983,9131,1,2,2011
9132,9132,7524.442436,21941,96,34,3,0,3,691.200000,9132,3,2,2011


In [12]:
categorical2 = data2.select_dtypes(include = object)

In [17]:
numerical2.drop(['policy_number','Year', 'Renew Offer Type','customer_lifetime_value'],axis=1)._get_numeric_data()

Unnamed: 0.2,Unnamed: 0,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,Unnamed: 0.1
0,0,56274,69,32,5,0,1,384.811147,0
1,1,0,94,13,42,0,8,1131.464935,1
2,2,48767,108,18,38,0,2,566.472247,2
3,3,0,106,18,65,0,7,529.881344,3
4,4,43836,73,12,44,0,1,138.130879,4
...,...,...,...,...,...,...,...,...,...
9129,9129,71941,73,18,89,0,2,198.234764,9129
9130,9130,21604,79,14,28,0,1,379.200000,9130
9131,9131,0,85,9,37,3,2,790.784983,9131
9132,9132,21941,96,34,3,0,3,691.200000,9132


In [21]:
(numerical2.isna().sum()/len(numerical2)).sort_values(ascending=False)

Unnamed: 0                       0.0
customer_lifetime_value          0.0
income                           0.0
monthly_premium_auto             0.0
months_since_last_claim          0.0
months_since_policy_inception    0.0
number_of_open_complaints        0.0
number_of_policies               0.0
total_claim_amount               0.0
Unnamed: 0                       0.0
Renew Offer Type                 0.0
policy_number                    0.0
Year                             0.0
dtype: float64

In [14]:
# 3 - Normalize the continuous variables. You can use any one method you want.

In [63]:
 def differentiate_variables(DataFrame):
    continuous_cols = []
    discrete_cols = []
    
    for col in numerical2.columns:
        if (DataFrame[col].nunique() <= 20):
            discrete_cols.append(col)
        else:
            continuous_cols.append(col)
    continuous_df = DataFrame[continuous_cols]
    discrete_df = DataFrame[discrete_cols]
    
    return continuous_df, discrete_df 
   

In [64]:
continuous_df, discrete_df = differentiate_variables(numerical2)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
data2_continuous

In [None]:
X_num = data2_continuous.drop(['Unnamed: 0'],axis=1)._get_numeric_data()

In [None]:
data2_continuous.dtypes

In [None]:
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler().fit(X_num)
x_min_max = transformer.transform(X_num)
pd.DataFrame(x_min_max)

In [None]:
# 4 - Encode the categorical variables

In [None]:
one_hot = pd.DataFrame(OneHotEncoder().fit(pd.DataFrame(categorical2)).transform(pd.DataFrame(categorical2)).toarray())

In [None]:
one_hot

In [None]:
# 5 - The time variable can be useful. Try to transform its data into a useful one. 
#Hint: Day week and month as integers might be useful.

In [None]:
numericals.dtypes


In [None]:
numerical2.dtypes

In [None]:
categoricals.dtypes

In [None]:
categorical2.dtypes