# **Customer Analysis**

In [300]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
pd.options.display.max_rows = 100
## Install xlrd package to load Excel files
#!conda install openpyxl
#!conda install xlrd

## Loading data

In [301]:
file1 = pd.read_csv('Data/file1.csv')
file2 = pd.read_csv('Data/file2.csv')
file3 = pd.read_csv('Data/file3.csv')
    

In [302]:
list(file1.columns)

['Customer',
 'ST',
 'GENDER',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Vehicle Class',
 'Total Claim Amount']

In [303]:
file1.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file1.columns)
 

['Customer',
 'State',
 'Gender',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Vehicle Class',
 'Total Claim Amount']

In [304]:
file2.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file2.columns)

['Customer',
 'State',
 'Gender',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Total Claim Amount',
 'Policy Type',
 'Vehicle Class']

In [305]:
list(file3.columns)

['Customer',
 'State',
 'Customer Lifetime Value',
 'Education',
 'Gender',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Total Claim Amount',
 'Vehicle Class']

In [306]:
df=pd.concat([file1,file2,file3], axis=0)
df

Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [307]:
df=df.reset_index(drop=True)
df


Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
12069,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [308]:
df = df.drop_duplicates()

In [309]:
df

Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
12069,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [310]:
df=df.drop(columns="Customer")
df

Unnamed: 0,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [311]:
df.info() #

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      9134 non-null   object 
 1   Gender                     9012 non-null   object 
 2   Education                  9134 non-null   object 
 3   Customer Lifetime Value    9127 non-null   object 
 4   Income                     9134 non-null   float64
 5   Monthly Premium Auto       9134 non-null   float64
 6   Number of Open Complaints  9134 non-null   object 
 7   Policy Type                9134 non-null   object 
 8   Vehicle Class              9134 non-null   object 
 9   Total Claim Amount         9134 non-null   float64
dtypes: float64(3), object(7)
memory usage: 785.0+ KB


Check :Customer Lifetime Value and Number of Open Complaints

In [312]:
def lower_case_column_names(df): #function to change columns to lower case
    df.columns=[i.lower() for i in df.columns] #list comprenhension
    return df

In [313]:
df=lower_case_column_names(df)
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [314]:
df["customer lifetime value"] = df["customer lifetime value"].apply(lambda x : (pd.to_numeric(x.replace('%',"")))/100 if type(x)==str else x)

In [315]:
df["customer lifetime value"]

0                 NaN
1         6979.535900
2        12887.431700
3         7645.861800
4         5363.076500
             ...     
12069    23405.987980
12070     3096.511217
12071     8163.890428
12072     7524.442436
12073     2611.836866
Name: customer lifetime value, Length: 9135, dtype: float64

In [316]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   object 
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(4), object(6)
memory usage: 785.0+ KB


In [317]:
(df.isna().sum()/len(df))*100

state                        0.010947
gender                       1.346470
education                    0.010947
customer lifetime value      0.087575
income                       0.010947
monthly premium auto         0.010947
number of open complaints    0.010947
policy type                  0.010947
vehicle class                0.010947
total claim amount           0.010947
dtype: float64

In [318]:
df[df['customer lifetime value'].isna()==True]

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
78,Washington,F,Master,,41275.0,96.0,1/0/00,Personal Auto,Four-Door Car,41.122303
988,Washington,M,High School or Below,,55561.0,63.0,1/0/00,Personal Auto,Four-Door Car,227.872071
1071,,,,,,,,,,
4333,Washington,M,High School or Below,,51878.0,66.0,1/1/00,Personal Auto,Four-Door Car,316.8
4380,Washington,,High School or Below,,36765.0,66.0,1/1/00,Personal Auto,Four-Door Car,320.849072
4588,Washington,M,Master,,0.0,70.0,1/0/00,Personal Auto,Four-Door Car,336.0
4869,Washington,F,Bachelors,,25859.0,74.0,1/0/00,Special Auto,Four-Door Car,355.2


In [319]:
df["customer lifetime value"] = pd.to_numeric(df["customer lifetime value"])

In [320]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   object 
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(4), object(6)
memory usage: 785.0+ KB


In [321]:
df['number of open complaints'].value_counts()

0         5629
1/0/00    1623
1          765
2          283
1/1/00     247
3          230
4          119
1/2/00      93
1/3/00      60
5           44
1/4/00      29
1/5/00      12
Name: number of open complaints, dtype: int64

In [322]:
df["number of open complaints"] = df["number of open complaints"].apply(lambda x: int(x[2]) if type(x)==str and len(x)>1 else x)

In [323]:
df['number of open complaints'].value_counts()

0.0    7252
1.0    1012
2.0     376
3.0     290
4.0     148
5.0      56
Name: number of open complaints, dtype: int64

In [324]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   float64
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


## Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns

In [325]:
df['state'].value_counts()

California    3030
Oregon        2601
Arizona       1629
Nevada         882
Washington     768
Cali           120
AZ              74
WA              30
Name: state, dtype: int64

In [326]:
def typos(x):
    if x in ['Cali']:
        return 'California'
    elif x in ['AZ']:
        return 'Arizona'
    elif x in ['WA']:
        return 'Washington'
    else:
        return x
    

In [327]:
df['state'] = list(map(typos, df['state']))

In [328]:
df['state'].value_counts()

California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     798
Name: state, dtype: int64

In [329]:
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,6979.535900,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,12887.431700,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,7645.861800,0.0,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,5363.076500,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.987980,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.200000


## Replacing null values – Replace missing values with means of the column (for numerical columns). Pay attention that the Income feature for instance has 0s which is equivalent to null values. (We assume here that there is no such income with 0 as it refers to missing values)

In [330]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   float64
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 785.0+ KB


In [331]:
df[['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']].isna().sum()

customer lifetime value      8
income                       1
monthly premium auto         1
number of open complaints    1
total claim amount           1
dtype: int64

In [332]:
def mean_replace(column):
    mean_value = np.mean(df[column]) #create a variable with the mean of that column
    df[column] = df[column].fillna(mean_value) #replacing the null values with the mean
    

In [333]:
mean_replace('customer lifetime value')
mean_replace('income')
mean_replace('monthly premium auto')
mean_replace('number of open complaints')
mean_replace('total claim amount')


In [334]:
# for col in num_lst:   #other way to do the above
   # mean_replace(col)

In [335]:
df[['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']].isna().sum()

customer lifetime value      0
income                       0
monthly premium auto         0
number of open complaints    0
total claim amount           0
dtype: int64

In [336]:
df['income']= [x if x!=0 else None for x in df['income']]

In [337]:
mean_value = np.mean(df['income'])
df['income'] = df['income'].fillna(mean_value)

In [338]:
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,7977.832132,50508.694321,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,6979.535900,50508.694321,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,12887.431700,48767.000000,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,7645.861800,50508.694321,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,5363.076500,36357.000000,68.0,0.0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.987980,71941.000000,73.0,0.0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.000000,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,50508.694321,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.000000,96.0,0.0,Personal Auto,Four-Door Car,691.200000


In [339]:
df['number of open complaints']= [ int(x) for x in df['number of open complaints']]

In [340]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9135 non-null   float64
 4   income                     9135 non-null   float64
 5   monthly premium auto       9135 non-null   float64
 6   number of open complaints  9135 non-null   int64  
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9135 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 785.0+ KB


## Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [341]:
State_dic = {'California':'West Region','Oregon':'North West','Washington':'East','Arizona':'Central','Nevada':'Central'}
State_dic 

{'California': 'West Region',
 'Oregon': 'North West',
 'Washington': 'East',
 'Arizona': 'Central',
 'Nevada': 'Central'}

In [342]:
df['state']= df['state'].replace(State_dic)
df['state']
df

#def state_to_zone(state):                 ## Clara's version
   # if state==“California”: return “West”
    #elif state==“Oregon”: return “North West”
    #elif state==“Washington”: return “East”
    #elif state in [“Arizone”,“Nevada”]: return “Central”
#df.State = df[“State”].apply(state_to_zone)

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,East,,Master,7977.832132,50508.694321,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Central,F,Bachelor,6979.535900,50508.694321,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Central,F,Bachelor,12887.431700,48767.000000,108.0,0,Personal Auto,Two-Door Car,566.472247
3,West Region,M,Bachelor,7645.861800,50508.694321,106.0,0,Corporate Auto,SUV,529.881344
4,East,M,High School or Below,5363.076500,36357.000000,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,West Region,M,Bachelor,23405.987980,71941.000000,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,West Region,F,College,3096.511217,21604.000000,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,West Region,M,Bachelor,8163.890428,50508.694321,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,West Region,M,College,7524.442436,21941.000000,96.0,0,Personal Auto,Four-Door Car,691.200000


## (Optional) In the column `Vehicle Class`, nerge the two categories `Luxury SUV` and `Luxury Car` into one category named `Luxury Vehicle

In [343]:
df['vehicle class'].value_counts()

Four-Door Car    4640
Two-Door Car     1895
SUV              1773
Sports Car        483
Luxury SUV        182
Luxury Car        161
Name: vehicle class, dtype: int64

In [344]:
df['vehicle class'] = df['vehicle class'].replace(['Luxury SUV','Luxury Car'],'Luxury Vehicle')


In [345]:
df['vehicle class'].value_counts()

Four-Door Car     4640
Two-Door Car      1895
SUV               1773
Sports Car         483
Luxury Vehicle     343
Name: vehicle class, dtype: int64

## - (Optional) Removing outliers using 1.5*IQR technique for all numerical columns.

In [346]:
import matplotlib.pyplot as plt
%matplotlib inline

In [356]:
def out_iqr(column):
    df.sort_values(column)
    Q1 = df[column].quantile([.25])
    Q3 = df[column].quantile([.75])
    IQR = np.array(Q3) - np.array(Q1)
    lower_range = int((np.array(Q1)) - (1.5 * IQR))
    upper_range = int((np.array(Q3)) + (1.5 * IQR))
    return lower_range ,upper_range

In [363]:
out_iqr('income')

(-7775, 104578)

In [358]:
def outlier_treatment(col):
    sorted(col)
    Q1,Q3 = np.percentile(col, [25,75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    lower_range
    upper_range = Q3 + (1.5 * IQR)
    return lower_range,upper_range

In [359]:
outlier_treatment(df['income'])

(-7775.25, 104578.75)

In [360]:
numerical_columns = ['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']
for col in numerical_columns:
    lowerbound,upperbound = out_iqr(col) # () same as function
    outliers = df[col][(df[col] < lowerbound)|(df[col]>upperbound)]
    print(col,":",len(outliers),"outliers")
    df[col].drop(outliers.index, inplace=True)


customer lifetime value : 817 outliers
income : 0 outliers
monthly premium auto : 443 outliers
number of open complaints : 1882 outliers
total claim amount : 447 outliers


In [364]:
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,East,,Master,7977.832132,50508.694321,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Central,F,Bachelor,6979.535900,50508.694321,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Central,F,Bachelor,12887.431700,48767.000000,108.0,0,Personal Auto,Two-Door Car,566.472247
3,West Region,M,Bachelor,7645.861800,50508.694321,106.0,0,Corporate Auto,SUV,529.881344
4,East,M,High School or Below,5363.076500,36357.000000,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,West Region,M,Bachelor,23405.987980,71941.000000,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,West Region,F,College,3096.511217,21604.000000,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,West Region,M,Bachelor,8163.890428,50508.694321,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,West Region,M,College,7524.442436,21941.000000,96.0,0,Personal Auto,Four-Door Car,691.200000


In [365]:
df['education'].value_counts()

Bachelor                2718
College                 2681
High School or Below    2616
Master                   751
Doctor                   344
Bachelors                 24
Name: education, dtype: int64

In [366]:
df['education'] = df['education'].replace('Bachelors','Bachelor')
df['education'].value_counts()

Bachelor                2742
College                 2681
High School or Below    2616
Master                   751
Doctor                   344
Name: education, dtype: int64