#### 1. Import libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#### 2. Load the dataframe

In [None]:
# Read the marketing customer value analysis data into a DataFrame.
DATA_FILE = 'we_fn_use_c_marketing_customer_value_analysis.csv'
customer_df = pd.read_csv(DATA_FILE)

#### 3. Look at the main features

In [None]:
# First rows and (rows, columns) shape of the raw data.
display(customer_df.head())
display(customer_df.shape)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" underneath.
customer_df.info()

### Data Cleaning

#### 4. Rename the columns so they follow PEP 8 (snake case: lower_case_with_underscores)

In [None]:
# Raise the column display limit BEFORE anything is rendered, so that the
# preview below already shows every column on a fresh kernel run.
pd.set_option('display.max_columns', 500)

# Normalise column names to snake_case: lower-case, spaces -> underscores.
customer_df.columns = [colname.lower().replace(' ', '_') for colname in customer_df.columns]

# A short preview is enough to verify the new column names.
display(customer_df.head())

#### 5. Change effective to date column to datetime format.

In [None]:
# Parse the effective_to_date column into datetime values and store the result
# back in the same column of customer_df.
parsed_dates = pd.to_datetime(customer_df['effective_to_date'])
customer_df['effective_to_date'] = parsed_dates

# Show the converted column as the cell output.
customer_df['effective_to_date']

#### 6. Define a function that differentiates between continuous and discrete variables

In [None]:
# Keep only the numeric columns of the customer data.
numericals = customer_df.select_dtypes(include='number')

# Number of distinct values per numeric column ...
unique_counts = numericals.nunique()
display(unique_counts)

# ... and the same count as a share of the number of rows.
display(unique_counts / len(numericals))
numericals

In [None]:
# Classify each numeric column by its share of distinct values:
# below 3% of the row count -> discrete, otherwise -> continuous.
row_count = len(numericals)
discrete_name, continous_name = [], []
for column in numericals.columns:
    is_discrete = numericals[column].nunique() / row_count < 0.03
    target_list = discrete_name if is_discrete else continous_name
    target_list.append(column)

In [None]:
# Build one sub-frame per variable type from the column-name lists,
# then render the discrete frame followed by the continuous frame.
discrete_df = customer_df[discrete_name]
continous_df = customer_df[continous_name]

display(discrete_df)
display(continous_df)

In [None]:
def cont_disc(df, threshold=0.03):
    """Split the columns of ``df`` into discrete and continuous column names.

    A column is classified as discrete when the ratio of its number of
    distinct values to the number of rows is strictly below ``threshold``;
    every other column is classified as continuous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. Must contain at least one row,
        because the ratio divides by ``len(df)``.
    threshold : float, default 0.03
        Upper bound (exclusive) on the unique-value ratio for a column to
        count as discrete.

    Returns
    -------
    tuple of (list, list)
        ``(discrete_names, continuous_names)``, each in the column order
        of ``df``.
    """
    n_rows = len(df)
    discrete_name = []
    continous_name = []

    for col in df:
        if df[col].nunique() / n_rows < threshold:
            discrete_name.append(col)
        else:
            continous_name.append(col)

    return discrete_name, continous_name


# Example usage:
# disc_col, cont_col = cont_disc(numericals)
# print(disc_col)
# cont_col

#### 7. Plot a correlation matrix

In [None]:
# Correlate the numeric columns only. Calling DataFrame.corr() on a frame that
# still holds string columns raises in pandas >= 2.0 (numeric_only defaults to
# False there), so the numeric subset is selected explicitly.
correlation_matrix = customer_df.select_dtypes(include=np.number).corr().round(2)

# Draw the heatmap on an explicitly created Axes so the figure size applies to
# it; sns.heatmap returns that same Axes object.
fig, ax = plt.subplots(figsize=(8, 6))
corr_heatmap = sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title('Correlation matrix of numeric features')
plt.show()

- There is a relatively high correlation of 0.63 between "monthly_premium_auto" and "total_claim_amount".
- There is also a correlation of 0.4 between "monthly_premium_auto" and "customer_lifetime_value".
- There is no correlation at all between "number_of_policies" and "number_of_open_complaints".
- There is also no correlation between "income" and "months_since_policy_inception", nor between "months_since_policy_inception" and "number_of_open_complaints".

#### 8. Create a function to plot every discrete variable. Do the same with the continuous variables

In [None]:
# Discrete variables: draw one distribution plot per column.
for column_name in discrete_df.columns:
    column_values = discrete_df[column_name]
    sns.displot(column_values)
    plt.show()

In [None]:
# Continuous variables: draw one 30-bin distribution plot per column.
for column_name in continous_df.columns:
    column_values = continous_df[column_name]
    sns.displot(column_values, bins=30)
    plt.show()

#### 9. Comment what you can see in the plots

- Could think about converting the discrete variables to the object (categorical) dtype.
- customer_lifetime_value has a wide range of values; applying a log transform could make it easier to analyze.
- income has a lot of values that are close to 0; these should be looked at more closely.

#### 10. Look for outliers in the continuous variables

In [None]:
# One boxplot per continuous column to look for outliers.
# The values are passed via the explicit `x=` keyword: a bare positional
# argument is interpreted differently across seaborn versions (as `x` in older
# releases, as `data` in newer ones), which changes the plot orientation.
for i in continous_df:
    sns.boxplot(x=continous_df[i])
    plt.show()

#### 11. Did you find outliers? Comment what you will do with them.

In [None]:
# Count the rows beyond a fixed cut-off for each variable. The cut-offs are
# hard-coded values (presumably read off the boxplots -- confirm against the
# plots) rather than a rule such as 1.5 * IQR.

# income: rows with an income below 10
low_income_mask = customer_df['income'] < 10
extraordinary_points_income = customer_df[low_income_mask]
print("The no of outliers for income is", extraordinary_points_income.shape[0])

# total claim amount: rows with a claim amount above 2500
high_claim_mask = customer_df['total_claim_amount'] > 2500
extraordinary_points_claim = customer_df[high_claim_mask]
print("The no of outliers for total claim amount is", extraordinary_points_claim.shape[0])

# customer lifetime value: rows with a value above 55000
high_clv_mask = customer_df['customer_lifetime_value'] > 55000
extraordinary_points_clv = customer_df[high_clv_mask]
print("The no of outliers for customer lifetime value is", extraordinary_points_clv.shape[0])

##### What will I do with the outliers?
- Keep the outliers for income, as there are too many of them to drop.
- Delete the outliers for total claim amount and customer lifetime value