# 0.0 IMPORTS

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

## 0.1 Loading Dataset

In [2]:
# Read the raw training set (path is relative to the notebook's folder) and preview it.
raw_train_csv = '../data/raw/train.csv'
df_raw = pd.read_csv(raw_train_csv)
df_raw.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


## 0.2 Helper Functions

In [13]:
# Reserved for helper functions (none defined yet).

# 1.0 DATA DESCRIPTION

In [3]:
# Stage checkpoint: work on a copy so that df_raw stays as loaded from disk.
df1 = df_raw.copy()

In [17]:
# Normalise every column label to lower case (e.g. 'Region_Code' -> 'region_code').
cols_new = list(map(str.lower, df1.columns))
df1.columns = cols_new

In [18]:
# Show the column labels to confirm the lower-casing above took effect.
df1.columns

Index(['id', 'gender', 'age', 'driving_license', 'region_code',
       'previously_insured', 'vehicle_age', 'vehicle_damage', 'annual_premium',
       'policy_sales_channel', 'vintage', 'response'],
      dtype='object')

## 1.1 Data Dimension

In [19]:
# Report the frame's dimensions: shape is (rows, columns).
n_rows, n_cols = df1.shape
display(f'Number of rows: {n_rows}')
display(f'Number of columns: {n_cols}')

'Number of rows: 381109'

'Number of columns: 12'

## 1.2 Data Types

In [20]:
# Per-column dtypes; `object` columns are the ones holding text categories.
df1.dtypes

id                        int64
gender                   object
age                       int64
driving_license           int64
region_code             float64
previously_insured        int64
vehicle_age              object
vehicle_damage           object
annual_premium          float64
policy_sales_channel    float64
vintage                   int64
response                  int64
dtype: object

## 1.3 Check NA

In [25]:
# info() prints the non-null count per column; a count equal to the number of
# entries in the RangeIndex line means that column has no missing values.
df1.info()
# Alternative view with explicit NA counts: df1.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   gender                381109 non-null  object 
 2   age                   381109 non-null  int64  
 3   driving_license       381109 non-null  int64  
 4   region_code           381109 non-null  float64
 5   previously_insured    381109 non-null  int64  
 6   vehicle_age           381109 non-null  object 
 7   vehicle_damage        381109 non-null  object 
 8   annual_premium        381109 non-null  float64
 9   policy_sales_channel  381109 non-null  float64
 10  vintage               381109 non-null  int64  
 11  response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


## 1.4 Data Descriptive

In [26]:
# Separate numerical and categorical features by dtype:
# numeric columns go to num_attributes, every other column to cat_attributes.
num_attributes = df1.select_dtypes(include=np.number)
cat_attributes = df1.select_dtypes(exclude=np.number)

In [30]:
# Column counts: numerical subset first, categorical subset second.
for subset in (num_attributes, cat_attributes):
    print(subset.shape[1])

9
3


In [79]:
# Descriptive statistics for the numerical features, one row per attribute.
#
# Every statistic uses pandas' vectorized column reducers. The previous
# `apply(min)` / `apply(max)` pushed each column through Python's built-ins
# element by element, which is slow on large frames and order-dependent when
# NaN is present. Pandas reducers skip NaN by default; num_attributes is
# expected to be NaN-free here (see the "Check NA" section).

# Central Tendency - Mean, Median
ct1 = num_attributes.mean()
ct2 = num_attributes.median()

# Dispersion - std, min, max, range, skew, kurtosis
d1 = num_attributes.std(ddof=0)  # population std: the convention np.std uses by default
d2 = num_attributes.min()
d3 = num_attributes.max()
d4 = d3 - d2                     # range = max - min
d5 = num_attributes.skew()
d6 = num_attributes.kurtosis()

# Concat: each Series becomes one named column; the attribute names (the shared
# index) are moved into a regular 'attributes' column.
describe_matrix = (
    pd.concat(
        [d2, d3, d4, ct1, ct2, d1, d5, d6],
        axis=1,
        keys=['min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis'],
    )
    .rename_axis('attributes')
    .reset_index()
)
describe_matrix

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1.0,381109.0,381108.0,190555.0,190555.0,110016.69187,9.443274e-16,-1.2
1,age,20.0,85.0,65.0,38.822584,36.0,15.511591,0.672539,-0.565655
2,driving_license,0.0,1.0,1.0,0.997869,1.0,0.046109,-21.59518,464.354302
3,region_code,0.0,52.0,52.0,26.388807,28.0,13.229871,-0.1152664,-0.867857
4,previously_insured,0.0,1.0,1.0,0.45821,0.0,0.498251,0.1677471,-1.971871
5,annual_premium,2630.0,540165.0,537535.0,30564.389581,31669.0,17213.132474,1.766087,34.004569
6,policy_sales_channel,1.0,163.0,162.0,112.034295,133.0,54.203924,-0.9000081,-0.97081
7,vintage,10.0,299.0,289.0,154.347397,154.0,83.671194,0.003029517,-1.200688
8,response,0.0,1.0,1.0,0.122563,0.0,0.327935,2.301906,3.298788


# 2.0 FEATURE ENGINEERING

In [80]:
# Stage checkpoint: feature engineering edits df2 and leaves df1 untouched.
df2 = df1.copy()

In [81]:
# Preview the first rows before any feature is recoded.
df2.head()

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


Preprocess 'vehicle_age' (replace the labels containing '<' and '>' signs with plain identifiers) and 'vehicle_damage' (encode 'Yes'/'No' as 1/0).

In [82]:
# List the distinct vehicle_age labels that need recoding.
df2['vehicle_age'].unique()

array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object)

We have 3 categories: over 2 years, between 1-2 years and below 1 year.

In [83]:
# Recode the two categorical vehicle features.
#
# Both columns are read from df1 (the untouched input of this stage) instead of
# from df2 itself, so re-running this cell always gives the same result. The
# previous nested conditional fell through to 'below_1_year' for anything it did
# not recognise, which includes the labels it had already rewritten, so a second
# run silently turned every row into 'below_1_year'.
vehicle_age_labels = {
    '> 2 Years': 'over_2_years',
    '1-2 Year': 'between_1-2_years',
    '< 1 Year': 'below_1_year',
}
vehicle_damage_flags = {'Yes': 1, 'No': 0}

# .map with an explicit dict yields NaN for any label missing from the dict,
# and produces an integer column directly instead of relying on `replace`
# implicitly downcasting an object column.
df2['vehicle_age'] = df1['vehicle_age'].map(vehicle_age_labels)
df2['vehicle_damage'] = df1['vehicle_damage'].map(vehicle_damage_flags)

# Fail loudly if the raw data contains a category the mappings do not cover.
assert df2[['vehicle_age', 'vehicle_damage']].notna().all().all(), \
    'unmapped value found in vehicle_age or vehicle_damage'

In [84]:
# Preview the first rows again to check the recoded vehicle_age / vehicle_damage columns.
df2.head()

Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,1,Male,44,1,28.0,0,over_2_years,1,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,between_1-2_years,0,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,over_2_years,1,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,below_1_year,0,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,below_1_year,0,27496.0,152.0,39,0


# 3.0 DATA FILTERING

In [87]:
# Stage checkpoint: any filtering would edit df3 and leave df2 untouched.
df3 = df2.copy()

We did not observe anything odd in the descriptive statistics above, so we skip filtering for now.

# 4.0 EXPLORATORY DATA ANALYSIS

In [88]:
# Stage checkpoint: the exploratory analysis works on df4 and leaves df3 untouched.
df4 = df3.copy()

## 4.1. Univariate Analysis

In [89]:
# Column labels available for analysis; the univariate subsections below follow this order.
df4.columns

Index(['id', 'gender', 'age', 'driving_license', 'region_code',
       'previously_insured', 'vehicle_age', 'vehicle_damage', 'annual_premium',
       'policy_sales_channel', 'vintage', 'response'],
      dtype='object')

### Gender

### Age

### Driving_license

### Region_code

### Previously_insured

### Vehicle_age

### Vehicle_damage

### Annual_premium

### Policy_sales_channel

### Vintage

### Response

## 4.2. Bivariate Analysis

## 4.3. Multivariate Analysis

# 5.0 DATA PREPARATION

In [7]:
# Stage checkpoint: data preparation works on df5 and leaves df4 untouched.
df5 = df4.copy()

# 6.0 FEATURE SELECTION

In [8]:
# Stage checkpoint: feature selection works on df6 and leaves df5 untouched.
df6 = df5.copy()

# 7.0 MODEL EVALUATION

In [9]:
# Stage checkpoint: this section works on df7 and leaves df6 untouched.
df7 = df6.copy()

# 8.0 HYPERPARAMETER FINE-TUNING

In [10]:
# Stage checkpoint: this section works on df8 and leaves df7 untouched.
df8 = df7.copy()

# 9.0 MODEL SELECTION

In [11]:
# Stage checkpoint: this section works on df9 and leaves df8 untouched.
df9 = df8.copy()

# 10.0 PRODUCTION DEPLOYMENT 

In [12]:
# Stage checkpoint: this section works on df10 and leaves df9 untouched.
df10 = df9.copy()