# **Customer Analysis**

In [381]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
pd.options.display.max_rows = 100
## Install xlrd package to load Excel files
#!conda install openpyxl
#!conda install xlrd

## Loading data

In [382]:
file1 = pd.read_csv('Data/file1.csv')
file2 = pd.read_csv('Data/file2.csv')
file3 = pd.read_csv('Data/file3.csv')
    

In [383]:
list(file1.columns)

['Customer',
 'ST',
 'GENDER',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Vehicle Class',
 'Total Claim Amount']

In [384]:
file1.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file1.columns)
 

['Customer',
 'State',
 'Gender',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Vehicle Class',
 'Total Claim Amount']

In [385]:
file2.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file2.columns)

['Customer',
 'State',
 'Gender',
 'Education',
 'Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Total Claim Amount',
 'Policy Type',
 'Vehicle Class']

In [386]:
list(file3.columns)

['Customer',
 'State',
 'Customer Lifetime Value',
 'Education',
 'Gender',
 'Income',
 'Monthly Premium Auto',
 'Number of Open Complaints',
 'Policy Type',
 'Total Claim Amount',
 'Vehicle Class']

In [387]:
df=pd.concat([file1,file2,file3], axis=0)
df

Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [388]:
df=df.reset_index(drop=True)
df


Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
12069,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [389]:
df = df.drop_duplicates()

In [390]:
df

Unnamed: 0,Customer,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
12069,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [391]:
df=df.drop(columns="Customer")
df

Unnamed: 0,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [392]:
df.info() #

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      9134 non-null   object 
 1   Gender                     9012 non-null   object 
 2   Education                  9134 non-null   object 
 3   Customer Lifetime Value    9127 non-null   object 
 4   Income                     9134 non-null   float64
 5   Monthly Premium Auto       9134 non-null   float64
 6   Number of Open Complaints  9134 non-null   object 
 7   Policy Type                9134 non-null   object 
 8   Vehicle Class              9134 non-null   object 
 9   Total Claim Amount         9134 non-null   float64
dtypes: float64(3), object(7)
memory usage: 785.0+ KB


Check :Customer Lifetime Value and Number of Open Complaints

In [393]:
def lower_case_column_names(df): #function to change columns to lower case
    df.columns=[i.lower() for i in df.columns] #list comprenhension
    return df

In [394]:
df=lower_case_column_names(df)
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [395]:
df["customer lifetime value"] = df["customer lifetime value"].apply(lambda x : x.replace('%',"") if type(x)==str else x)

In [396]:
df["customer lifetime value"]

0                NaN
1          697953.59
2         1288743.17
3          764586.18
4          536307.65
            ...     
12069    23405.98798
12070    3096.511217
12071    8163.890428
12072    7524.442436
12073    2611.836866
Name: customer lifetime value, Length: 9135, dtype: object

In [397]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9135 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9134 non-null   object 
 1   gender                     9012 non-null   object 
 2   education                  9134 non-null   object 
 3   customer lifetime value    9127 non-null   object 
 4   income                     9134 non-null   float64
 5   monthly premium auto       9134 non-null   float64
 6   number of open complaints  9134 non-null   object 
 7   policy type                9134 non-null   object 
 8   vehicle class              9134 non-null   object 
 9   total claim amount         9134 non-null   float64
dtypes: float64(3), object(7)
memory usage: 785.0+ KB


In [398]:
(df.isna().sum()/len(df))*100

state                        0.010947
gender                       1.346470
education                    0.010947
customer lifetime value      0.087575
income                       0.010947
monthly premium auto         0.010947
number of open complaints    0.010947
policy type                  0.010947
vehicle class                0.010947
total claim amount           0.010947
dtype: float64

In [399]:
df[df['customer lifetime value'].isna()==True]

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
78,Washington,F,Master,,41275.0,96.0,1/0/00,Personal Auto,Four-Door Car,41.122303
988,Washington,M,High School or Below,,55561.0,63.0,1/0/00,Personal Auto,Four-Door Car,227.872071
1071,,,,,,,,,,
4333,Washington,M,High School or Below,,51878.0,66.0,1/1/00,Personal Auto,Four-Door Car,316.8
4380,Washington,,High School or Below,,36765.0,66.0,1/1/00,Personal Auto,Four-Door Car,320.849072
4588,Washington,M,Master,,0.0,70.0,1/0/00,Personal Auto,Four-Door Car,336.0
4869,Washington,F,Bachelors,,25859.0,74.0,1/0/00,Special Auto,Four-Door Car,355.2


In [400]:
df=df.dropna(subset=["customer lifetime value"])
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
1,Arizona,F,Bachelor,697953.59,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
5,Oregon,F,Bachelor,825629.78,62902.0,69.0,1/0/00,Personal Auto,Two-Door Car,159.383042
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [401]:
df["customer lifetime value"] = pd.to_numeric(df["customer lifetime value"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["customer lifetime value"] = pd.to_numeric(df["customer lifetime value"])


In [402]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9127 entries, 1 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9127 non-null   object 
 1   gender                     9007 non-null   object 
 2   education                  9127 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9127 non-null   float64
 5   monthly premium auto       9127 non-null   float64
 6   number of open complaints  9127 non-null   object 
 7   policy type                9127 non-null   object 
 8   vehicle class              9127 non-null   object 
 9   total claim amount         9127 non-null   float64
dtypes: float64(4), object(6)
memory usage: 784.4+ KB


In [403]:
df['number of open complaints'].value_counts()

0         5629
1/0/00    1618
1          765
2          283
1/1/00     245
3          230
4          119
1/2/00      93
1/3/00      60
5           44
1/4/00      29
1/5/00      12
Name: number of open complaints, dtype: int64

In [404]:
df["number of open complaints"] = df["number of open complaints"].apply(lambda x: int(x[2]) if type(x)==str and len(x)>1 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["number of open complaints"] = df["number of open complaints"].apply(lambda x: int(x[2]) if type(x)==str and len(x)>1 else x)


In [405]:
df['number of open complaints'].value_counts()

0    7247
1    1010
2     376
3     290
4     148
5      56
Name: number of open complaints, dtype: int64

In [406]:
df["number of open complaints"] = pd.to_numeric(df["number of open complaints"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["number of open complaints"] = pd.to_numeric(df["number of open complaints"])


In [407]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9127 entries, 1 to 12073
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   state                      9127 non-null   object 
 1   gender                     9007 non-null   object 
 2   education                  9127 non-null   object 
 3   customer lifetime value    9127 non-null   float64
 4   income                     9127 non-null   float64
 5   monthly premium auto       9127 non-null   float64
 6   number of open complaints  9127 non-null   int64  
 7   policy type                9127 non-null   object 
 8   vehicle class              9127 non-null   object 
 9   total claim amount         9127 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 784.4+ KB


## Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns

In [408]:
df['state'].value_counts()

California    3030
Oregon        2601
Arizona       1629
Nevada         882
Washington     761
Cali           120
AZ              74
WA              30
Name: state, dtype: int64

In [410]:
def typos(x):
    if x in ['Cali']:
        return 'California'
    elif x in ['AZ']:
        return 'Arizona'
    elif x in ['WA']:
        return 'Washington'
    else:
        return x
    

In [411]:
df['state'] = list(map(typos, df['state']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['state'] = list(map(typos, df['state']))


In [412]:
df['state'].value_counts()

California    3150
Oregon        2601
Arizona       1703
Nevada         882
Washington     791
Name: state, dtype: int64