In [None]:
'''# Lab | Data cleaning and wrangling

For this lab, we will be using the same dataset we used in the previous labs. 
We recommend using the same notebook, since you will be reusing the variables you previously created and used in the earlier labs.

### Instructions

So far we have worked on `EDA`. This lab will focus on data cleaning and wrangling from everything we noticed before.

1. We will start with removing outliers. So far, we have discussed different methods to remove outliers. 
Use the one you feel more comfortable with, define a function for that. 
Use the function to remove the outliers and apply it to the dataframe.
2. Create a copy of the dataframe for the data wrangling.
3. Normalize the continuous variables. You can use any one method you want.
4. Encode the categorical variables
5. The time variable can be useful. 
Try to transform it into a more useful form. Hint: day, week, and month as integers might be useful.
6. Since the model will only accept numerical data, check and make sure that every column is numerical, 
if some are not, change it using encoding.

**Hint for Categorical Variables**

- You should deal with the categorical variables as shown below (for ordinal encoding, dummy code has been provided as well):

```python
# One hot to state
# Ordinal to coverage
# Ordinal to employmentstatus
# Ordinal to location code
# One hot to marital status
# One hot to policy type
# One hot to policy
# One hot to renew offer type
# One hot to sales channel
# One hot vehicle class
# Ordinal vehicle size

data["coverage"] = data["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})
# given that column "coverage" in the dataframe "data" has three categories:
# "Basic", "Extended", and "Premium", and the values are to be represented in the same order.
```'''

In [118]:
import imblearn

# These are the normal libraries
import pandas as pd
import numpy as np

# This is just so that we don't get annoying warnings
import warnings
warnings.filterwarnings('ignore')

# This is the most common viz library in python
import matplotlib.pyplot as plt
%matplotlib inline

# This one is the above on steroids
import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# These Libs are for stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm

In [119]:
# Load the marketing customer-value CSV (hard-coded absolute Windows path).
df = pd.read_csv(r"C:\Users\filip\OneDrive\Desktop\IRONHACK\Labs\Week5\lab-cleaning-categorical-data\files_for_lab\we_fn_use_c_marketing_customer_value_analysis.csv")

# Standardise the headers: lower-case, with spaces replaced by underscores.
cols = [header.lower().replace(' ', '_') for header in df.columns]
df.columns = cols

# Exploratory pass over the categorical columns.
# Most lines below are bare expressions: in a notebook cell only the value of
# the final expression is rendered, so the earlier ones are evaluated and discarded.
df.dtypes # Check the dtype of every column

categorical_df = df.select_dtypes(include=['object']) # Keep only the object-dtype columns in their own frame

categorical_df.isna().sum() # Null count per column (author's note: no null values found)

# Checking unique values in each column
categorical_df['customer'].unique() #Categorical
categorical_df['state'].unique() #Categorical
categorical_df['response'].unique() #Categorical
categorical_df['coverage'].unique() #Categorical
categorical_df['education'].unique() #Categorical
categorical_df['effective_to_date'].unique() #Date Time (still object dtype in this frame)
categorical_df['employmentstatus'].unique() #Categorical
categorical_df['gender'].unique() #Categorical
categorical_df['location_code'].unique() #Categorical
categorical_df['marital_status'].unique() #Categorical
categorical_df['policy_type'].unique() #Categorical
categorical_df['policy'].unique() #Categorical
categorical_df['renew_offer_type'].unique() #Categorical
categorical_df['sales_channel'].unique() #Categorical
categorical_df['vehicle_class'].unique() #Categorical
categorical_df['vehicle_size'].unique() #Categorical

# effective_to_date is removed from the categorical frame; the column is parsed
# with pd.to_datetime on the separately loaded customer_df further down.

categorical_df = categorical_df.drop(['effective_to_date'], axis = 1)

# customer is dropped as well: per the author it doesn't carry any useful information.

categorical_df = categorical_df.drop(['customer'], axis = 1)

categorical_df['policy_type'].unique() #Categorical

'''policy_type has the following unique values:['Corporate Auto', 'Personal Auto', 'Special Auto']

   policy has the following unique values:['Corporate L3', 'Personal L3', 'Corporate L2', 'Personal L1',
    'Special L2', 'Corporate L1', 'Personal L2', 'Special L1','Special L3']
    
It appears that the values in policy are the same as in policy_type but broken down into subtypes, more detailed information.

I would drop policy if in need to drop one of the two''' 

# Variables I would choose to one-hot encode
# NOTE(review): the encoding cell later in this file ordinal-encodes coverage,
# location_code and vehicle_size, and one-hot encodes the remaining four.
categorical_df['response'].unique()
categorical_df['coverage'].unique()
categorical_df['gender'].unique()
categorical_df['location_code'].unique()
categorical_df['marital_status'].unique()
categorical_df['policy_type'].unique()
categorical_df['vehicle_size'].unique()

# In my opinion these columns have too many values to one-hot encode, will drop for now
# NOTE(review): nothing is dropped here; the later encoding cell one-hot encodes all
# of these except employmentstatus, which it ordinal-encodes.
categorical_df['state'].unique()
categorical_df['education'].unique()
categorical_df['employmentstatus'].unique()
categorical_df['policy'].unique()
categorical_df['renew_offer_type'].unique()
categorical_df['sales_channel'].unique()
categorical_df['vehicle_class'].unique() # last expression: this is the value the cell displays



array(['Two-Door Car', 'Four-Door Car', 'SUV', 'Luxury SUV', 'Sports Car',
       'Luxury Car'], dtype=object)

In [120]:
# Separate load of the CSV (same file name, taken from the numerical-data lab folder)
# used for the numeric side of the analysis. Hard-coded absolute Windows path.
customer_df = pd.read_csv(r"C:\Users\filip\OneDrive\Desktop\IRONHACK\Labs\Week5\lab-cleaning-numerical-data\files_for_lab\we_fn_use_c_marketing_customer_value_analysis.csv")

# Quick look at the raw frame (bare expressions; not displayed because they are
# not the last statement of the cell).
customer_df.head()
customer_df.shape
customer_df.dtypes

# Standardise the headers: lower-case, with spaces replaced by underscores.
cols = [header.lower().replace(' ', '_') for header in customer_df.columns]
customer_df.columns = cols

# Parse effective_to_date from text into datetime values.
customer_df['effective_to_date'] = customer_df['effective_to_date'].pipe(pd.to_datetime)

# Numeric working frame: every column whose dtype is not object
# (this includes the freshly parsed datetime column).
numerical_df = customer_df.select_dtypes(exclude='object')

# Creating a function to differentiate between continuous and discrete variables

def decision(data, threshold=250):
    """Split the columns of ``data`` into continuous and discrete by cardinality.

    A column is treated as discrete when its number of distinct values
    (``Series.nunique()``) is less than or equal to ``threshold``; every other
    column is treated as continuous. Column order is preserved in both lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified.
    threshold : int, default 250
        Largest number of distinct values a column may have and still be
        considered discrete.

    Returns
    -------
    tuple[list, list]
        ``(continuous_cols, discrete_cols)`` as lists of column labels.
    """
    # Evaluate the cardinality test once per column, then partition on the flag.
    is_discrete = {
        name: data[name].nunique() <= threshold
        for name in data.columns
    }
    continuous_cols = [name for name, flag in is_discrete.items() if not flag]
    discrete_cols = [name for name, flag in is_discrete.items() if flag]
    return continuous_cols, discrete_cols


# Classify the numeric columns with the default threshold and report both groups.
continuous_columns, discrete_columns = decision(numerical_df)
print("Continuous columns:", continuous_columns, sep="\n")
print("\nDiscrete columns:", discrete_columns, sep="\n")

# One sub-frame per group, selected from numerical_df by column list.
continuous_df, discrete_df = numerical_df[continuous_columns], numerical_df[discrete_columns]


Continuous columns:
['customer_lifetime_value', 'income', 'total_claim_amount']

Discrete columns:
['effective_to_date', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies']


In [121]:
# Author's rationale: customer_lifetime_value is dropped because of its high
# correlation with monthly_premium_auto, which affects the target total_claim_amount.
# total_claim_amount is highly correlated with monthly_premium_auto and moderately
# correlated with income.
# NOTE(review): continuous_df was sliced from numerical_df before this drop, so it
# still contains customer_lifetime_value (the column shows up in final_df's output).
# Re-running this cell on its own raises KeyError because the column is already gone.
numerical_df = numerical_df.drop(columns=['customer_lifetime_value'])

In [122]:
# Put the numeric and categorical frames side by side (column-wise concat, aligned
# on the row index). This rebinds `df`, which previously held the raw CSV load.
df = pd.concat([numerical_df, categorical_df], axis='columns')

# Dropping outliers

def outliers_drop(data, columns, threshold=1.5):
    """Return a copy of ``data`` without the rows that are IQR outliers.

    For each column in ``columns`` the quartiles are computed on the original
    (unfiltered) ``data``; a row is kept only if its value lies inside
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (bounds inclusive) for
    every listed column. Rows whose value is NaN in a listed column are
    removed as well, because the range test is False for NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of column labels
        Numeric columns to test for outliers.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``data``; the original index labels are preserved.
    """
    # Fix: work on the `data` argument. The previous body read the module-level
    # name `df`, so the parameter was ignored and the function only worked when
    # a global `df` happened to exist.
    df_c = data.copy()
    for column in columns:
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        # Filters accumulate: each column narrows the rows kept so far.
        df_c = df_c[df_c[column].between(lower_bound, upper_bound)]

    return df_c


# Remove IQR outliers on the target and on income, then renumber the rows from 0.
clean_df = (
    outliers_drop(df, ['total_claim_amount', 'income'], threshold=1.5)
    .reset_index(drop=True)
)

# Independent working copy for the wrangling steps.
df_clean = clean_df.copy()

# Min-max scale the continuous variables into [0, 1].
# NOTE(review): the scaler is fitted on continuous_df, which was sliced from
# numerical_df before outlier removal; clean_df / df_clean are not referenced
# again in this cell, so the outlier filtering does not reach final_df.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_continuous = pd.DataFrame(
    scaler.fit_transform(continuous_df),
    columns=continuous_df.columns,
)

# Encode the categorical variables.

# Nominal variables -> one-hot indicators. drop_first=True omits one level per
# variable; dtype=int keeps the indicators as 0/1 integers on every pandas
# version (newer releases default to bool).
cat_to_encode = categorical_df[['response', 'education', 'gender', 'state', 'marital_status', 'policy_type', 'policy', 'renew_offer_type', 'sales_channel', 'vehicle_class']]

cat_encoded = pd.get_dummies(cat_to_encode, drop_first=True, dtype=int)

# Remaining variables -> integer codes given by the explicit mappings below.
coverage_ordinal = {'Basic': 1, 'Extended': 2, 'Premium': 3}

employmentstatus_ordinal = {'Employed': 1, 'Unemployed': 2, 'Medical Leave': 3, 'Disabled': 4, 'Retired': 5}

location_code_ordinal = {'Suburban': 1, 'Rural': 2, 'Urban': 3}

vehicle_size_ordinal = {'Small': 1, 'Medsize': 2, 'Large': 3}

ordinal_maps = {
    'coverage': coverage_ordinal,
    'employmentstatus': employmentstatus_ordinal,
    'location_code': location_code_ordinal,
    'vehicle_size': vehicle_size_ordinal,
}

# Build cat_ordinal as a fresh frame instead of assigning into a slice of
# categorical_df (which is chained assignment and leaves categorical_df's
# relationship to the result unclear). Any category missing from a mapping
# becomes NaN via Series.map; pd.to_numeric keeps the result numeric.
cat_ordinal = pd.DataFrame({
    column: pd.to_numeric(categorical_df[column].map(mapping))
    for column, mapping in ordinal_maps.items()
})

# Time variable: effective_to_date was already parsed to datetime when
# customer_df was loaded. It is not turned into model features here; it is
# excluded so that only numeric columns reach the scaler.
# NOTE(review): the lab asks for day / week / month integers derived from this
# column; that is still to be done if the date should contribute to the model.

# Previously the column was converted to a timestamp and min-max scaled, only
# to be dropped on the very next line. That dead work is removed.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
discrete_df = discrete_df.drop(columns=['effective_to_date'], errors='ignore')

# Min-max scale the remaining discrete numeric columns into [0, 1].
scaler = MinMaxScaler()
normalized_discrete = pd.DataFrame(
    scaler.fit_transform(discrete_df),
    columns=discrete_df.columns,
)

# Assemble the model-ready frame: scaled continuous, ordinal codes, scaled
# discrete, and one-hot indicators, aligned column-wise on the row index.
final_df = pd.concat([normalized_continuous, cat_ordinal, normalized_discrete, cat_encoded], axis = 1)

In [123]:
# Display the assembled feature frame (rendered as the cell output; long frames are shown truncated).
final_df

Unnamed: 0,customer_lifetime_value,income,total_claim_amount,coverage,employmentstatus,location_code,vehicle_size,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,...,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car
0,0.010629,0.562847,0.132974,1,1,1,2,0.033755,0.914286,0.050505,...,0,0,0,0,0,0,0,0,0,1
1,0.062406,0.000000,0.391051,2,2,1,2,0.139241,0.371429,0.424242,...,1,0,0,0,0,0,0,0,0,0
2,0.134960,0.487763,0.195764,3,1,1,2,0.198312,0.514286,0.383838,...,0,0,0,0,0,0,0,0,0,1
3,0.070589,0.000000,0.183117,1,2,1,2,0.189873,0.514286,0.656566,...,0,0,0,1,0,0,0,1,0,0
4,0.011245,0.438443,0.047710,1,1,2,2,0.050633,0.342857,0.444444,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,0.264137,0.719547,0.068485,1,1,3,2,0.050633,0.514286,0.898990,...,0,0,0,0,1,0,0,0,0,0
9130,0.014719,0.216081,0.131034,2,1,1,2,0.075949,0.400000,0.282828,...,0,0,1,0,0,0,0,0,0,0
9131,0.076951,0.000000,0.273297,2,2,1,2,0.101266,0.257143,0.373737,...,0,0,1,0,0,0,0,0,0,0
9132,0.069098,0.219452,0.238876,2,1,1,3,0.147679,0.971429,0.030303,...,1,0,1,0,0,0,0,0,0,0
