# **Customer Analysis**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
pd.options.display.max_rows = 100
## Install xlrd package to load Excel files
#!conda install openpyxl
#!conda install xlrd

## Loading data

In [2]:
file1 = pd.read_csv('Data/Day 2/file1.csv')
file2 = pd.read_csv('Data/Day 2/file2.csv')
file3 = pd.read_csv('Data/Day 2/file3.csv')
    

FileNotFoundError: [Errno 2] No such file or directory: 'Data/file1.csv'

In [None]:
list(file1.columns)

In [None]:
file1.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file1.columns)
 

In [None]:
file2.rename(columns={'ST':'State','GENDER':'Gender'}, inplace = True)
list(file2.columns)

In [None]:
list(file3.columns)

In [None]:
df=pd.concat([file1,file2,file3], axis=0)
df

In [None]:
df=df.reset_index(drop=True)
df


In [None]:
df = df.drop_duplicates()

In [None]:
df

In [None]:
df=df.drop(columns="Customer")
df

In [None]:
df.info() #

Check :Customer Lifetime Value and Number of Open Complaints

In [None]:
def lower_case_column_names(df): #function to change columns to lower case
    df.columns=[i.lower() for i in df.columns] #list comprenhension
    return df

In [None]:
df=lower_case_column_names(df)
df

In [None]:
df["customer lifetime value"] = df["customer lifetime value"].apply(lambda x : (pd.to_numeric(x.replace('%',"")))/100 if type(x)==str else x)

In [None]:
df["customer lifetime value"]

In [None]:
df.info()

In [None]:
(df.isna().sum()/len(df))*100

In [None]:
df[df['customer lifetime value'].isna()==True]

In [None]:
df["customer lifetime value"] = pd.to_numeric(df["customer lifetime value"])

In [None]:
df.info()

In [None]:
df['number of open complaints'].value_counts()

In [None]:
df["number of open complaints"] = df["number of open complaints"].apply(lambda x: int(x[2]) if type(x)==str and len(x)>1 else x)

In [None]:
df['number of open complaints'].value_counts()

In [None]:
df.info()

## Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns

In [None]:
df['state'].value_counts()

In [None]:
def typos(x):
    if x in ['Cali']:
        return 'California'
    elif x in ['AZ']:
        return 'Arizona'
    elif x in ['WA']:
        return 'Washington'
    else:
        return x
    

In [None]:
df['state'] = list(map(typos, df['state']))

In [None]:
df['state'].value_counts()

In [None]:
df

## Replacing null values – Replace missing values with means of the column (for numerical columns). Pay attention that the Income feature for instance has 0s which is equivalent to null values. (We assume here that there is no such income with 0 as it refers to missing values)

In [None]:
df.info()

In [None]:
df[['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']].isna().sum()

In [None]:
def mean_replace(column):
    mean_value = np.mean(df[column]) #create a variable with the mean of that column
    df[column] = df[column].fillna(mean_value) #replacing the null values with the mean
    

In [None]:
mean_replace('customer lifetime value')
mean_replace('income')
mean_replace('monthly premium auto')
mean_replace('number of open complaints')
mean_replace('total claim amount')


In [None]:
# for col in num_lst:   #other way to do the above
   # mean_replace(col)

In [None]:
df[['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']].isna().sum()

In [None]:
df['income']= [x if x!=0 else None for x in df['income']]

In [None]:
mean_value = np.mean(df['income'])
df['income'] = df['income'].fillna(mean_value)

In [None]:
df

In [None]:
df['number of open complaints']= [ int(x) for x in df['number of open complaints']]

In [None]:
df.info()

## Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [None]:
State_dic = {'California':'West Region','Oregon':'North West','Washington':'East','Arizona':'Central','Nevada':'Central'}
State_dic 

In [None]:
df['state']= df['state'].replace(State_dic)
df['state']
df

#def state_to_zone(state):                 ## Clara's version
   # if state==“California”: return “West”
    #elif state==“Oregon”: return “North West”
    #elif state==“Washington”: return “East”
    #elif state in [“Arizone”,“Nevada”]: return “Central”
#df.State = df[“State”].apply(state_to_zone)

## (Optional) In the column `Vehicle Class`, nerge the two categories `Luxury SUV` and `Luxury Car` into one category named `Luxury Vehicle

In [None]:
df['vehicle class'].value_counts()

In [None]:
df['vehicle class'] = df['vehicle class'].replace(['Luxury SUV','Luxury Car'],'Luxury Vehicle')


In [None]:
df['vehicle class'].value_counts()

## - (Optional) Removing outliers using 1.5*IQR technique for all numerical columns.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def out_iqr(column):
    df.sort_values(column)
    Q1 = df[column].quantile([.25])
    Q3 = df[column].quantile([.75])
    IQR = np.array(Q3) - np.array(Q1)
    lower_range = int((np.array(Q1)) - (1.5 * IQR))
    upper_range = int((np.array(Q3)) + (1.5 * IQR))
    return lower_range ,upper_range

In [None]:
out_iqr('income')

In [None]:
def outlier_treatment(col):
    sorted(col)
    Q1,Q3 = np.percentile(col, [25,75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    lower_range
    upper_range = Q3 + (1.5 * IQR)
    return lower_range,upper_range

In [None]:
outlier_treatment(df['income'])

In [360]:
numerical_columns = ['customer lifetime value','income','monthly premium auto','number of open complaints','total claim amount']
for col in numerical_columns:
    lowerbound,upperbound = out_iqr(col) # () same as function
    outliers = df[col][(df[col] < lowerbound)|(df[col]>upperbound)]
    print(col,":",len(outliers),"outliers")
    df[col].drop(outliers.index, inplace=True)


customer lifetime value : 817 outliers
income : 0 outliers
monthly premium auto : 443 outliers
number of open complaints : 1882 outliers
total claim amount : 447 outliers


In [364]:
df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,East,,Master,7977.832132,50508.694321,1000.0,0,Personal Auto,Four-Door Car,2.704934
1,Central,F,Bachelor,6979.535900,50508.694321,94.0,0,Personal Auto,Four-Door Car,1131.464935
2,Central,F,Bachelor,12887.431700,48767.000000,108.0,0,Personal Auto,Two-Door Car,566.472247
3,West Region,M,Bachelor,7645.861800,50508.694321,106.0,0,Corporate Auto,SUV,529.881344
4,East,M,High School or Below,5363.076500,36357.000000,68.0,0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,West Region,M,Bachelor,23405.987980,71941.000000,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,West Region,F,College,3096.511217,21604.000000,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,West Region,M,Bachelor,8163.890428,50508.694321,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,West Region,M,College,7524.442436,21941.000000,96.0,0,Personal Auto,Four-Door Car,691.200000


In [365]:
df['education'].value_counts()

Bachelor                2718
College                 2681
High School or Below    2616
Master                   751
Doctor                   344
Bachelors                 24
Name: education, dtype: int64

In [366]:
df['education'] = df['education'].replace('Bachelors','Bachelor')
df['education'].value_counts()

Bachelor                2742
College                 2681
High School or Below    2616
Master                   751
Doctor                   344
Name: education, dtype: int64