# 0. Imports

In [23]:
import pandas as pd
import numpy as np
import sys
import seaborn as sns

from pathlib import Path

from scipy import stats
from sklearn import preprocessing as pp

# 1. Data Preparation

In [24]:
# load data
# The CSV is read from <parent of the current working directory>/data/processed/df_eda.csv.
# The whole location is assembled with pathlib instead of appending '/' + filename
# by hand, so the separator is always the platform's own. `path` stays a plain
# string, as before.
path = str(Path.cwd().parents[0] / "data" / "processed" / "df_eda.csv")

df = pd.read_csv(path)

In [25]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,driving_license,vehicle_age,vehicle_damage,previously_insured,annual_premium,vintage,response
0,7,Male,23,11,152,Yes,New,Yes,No,23367.0,249,No
1,13,Female,41,15,14,Yes,Average,No,Yes,31409.0,221,No
2,18,Female,25,35,152,Yes,New,No,Yes,46622.0,299,No
3,31,Female,26,8,160,Yes,New,No,No,2630.0,136,No
4,39,Male,45,8,124,Yes,Average,Yes,No,42297.0,264,No


In [26]:
# show the column labels of the loaded frame
df.columns

Index(['id', 'gender', 'age', 'region_code', 'policy_sales_channel',
       'driving_license', 'vehicle_age', 'vehicle_damage',
       'previously_insured', 'annual_premium', 'vintage', 'response'],
      dtype='object')

## Standardization
Applied only to attributes with an approximately normal distribution.

In [27]:
# annual_premium: standardized with StandardScaler (outliers are not removed at this stage)
ss = pp.StandardScaler()

# the scaler is given a 2-D array, hence the double-bracket column selection
premium_values = df[['annual_premium']].to_numpy()
df['annual_premium'] = ss.fit_transform(premium_values)

## Rescaling

In [28]:
# one MinMaxScaler per column, each kept in its own variable so the fitted
# scaler can be reused later on the test data
mms_age = pp.MinMaxScaler()
mms_vintage = pp.MinMaxScaler()

# fit each scaler on its column and overwrite the column with the rescaled values
for column, scaler in (('age', mms_age), ('vintage', mms_vintage)):
    df[column] = scaler.fit_transform(df[[column]].values)

## Encoding
Applied only to categorical attributes whose nature (e.g. binary, ordinal) we want to maintain.

In [29]:
# response: 'Yes' becomes 1, every other value becomes 0
is_positive = df['response'] == 'Yes'
df['response'] = is_positive.astype('int64')

In [31]:
# region_code - Frequency Encoding | Target Encoding (we choose this) | Weighted Target Encoding
# Encoded value of a region = mean of the (already 0/1) response within that region.
target_encode_region_code = df.groupby('region_code')['response'].mean()

# Replace every region code with its encoded value. Plain column assignment is
# used instead of df.loc[:, col] = ...: the mapped values are floats, and writing
# them through .loc into a column of another dtype is a deprecated silent upcast
# in recent pandas.
df['region_code'] = df['region_code'].map(target_encode_region_code)
df[['region_code']].head(10)

Unnamed: 0,region_code
0,0.11276
1,0.071987
2,0.124604
3,0.096142
4,0.096142
5,0.187163
6,0.1236
7,0.187163
8,0.071987
9,0.123362


In [18]:
# driving_license - Label Encoding
# vehicle_damage - Label Encoding
# previously_insured - Label Encoding
# NOTE(review): no code in this cell applies the three label encodings listed
# above - confirm they are handled elsewhere in the notebook.

# vehicle_age - One Hot Encoding (we choose this) | Order Encoding
# Replaces the vehicle_age column with one indicator column per category,
# each named with the 'vehicle_age' prefix.
df = pd.get_dummies(df, prefix='vehicle_age', columns=['vehicle_age'])

# policy_sales_channel - Target Encoding | Frequency Encoding (we choose this)
# Frequency Encoding gives more weight to more frequent values of the categorical attribute
# Encoded value of a channel = share of rows that use that channel.
fe_policy_sales_channel = df.groupby('policy_sales_channel').size() / len(df)

# Replace every channel with its relative frequency. Plain column assignment is
# used instead of df.loc[:, col] = ...: the mapped values are floats, and writing
# them through .loc into a column of another dtype is a deprecated silent upcast
# in recent pandas.
df['policy_sales_channel'] = df['policy_sales_channel'].map(fe_policy_sales_channel)
