In [17]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer

from tqdm.notebook import tqdm

import warnings
# Install a global "ignore" filter: no warning of any category is shown for
# the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by pandas / scikit-learn — consider narrowing the filter to specific
# categories while developing.
warnings.filterwarnings('ignore')

In [2]:
# Read the three input CSVs in order: the labelled training rows, the
# unlabelled test rows, and sample_submission.csv (bound to `ss`).
raw_train, raw_test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called bare; wrapping it in print() only appends a stray "None" line.
raw_train.info()
# Last expression of the cell: preview the first three training rows.
raw_train.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB
None


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1


In [4]:
# Print the test set's (rows, columns) tuple, then render its first five rows
# as the cell output.
print(raw_test.shape)
raw_test.head(n=5)

(127037, 11)


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297


In [7]:
# Stack train and test into a single frame so the preprocessing cells that
# follow are applied to both at once. The test file has no `Response` column,
# so that column is NaN for the test rows after concatenation.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index
# that .reset_index(drop=True) produced.
df = pd.concat([raw_train, raw_test], ignore_index=True)
df.shape

(508146, 12)

In [8]:
# Remove the `id` and `Driving_License` columns from the working frame.
# Rebinding `df` (instead of inplace=True) avoids mutating the frame in place,
# and errors='ignore' lets this cell be re-run without a KeyError once the
# columns are already gone.
df = df.drop(columns=['id', 'Driving_License'], errors='ignore')

In [11]:
# Encode the three vehicle-age buckets as ordinal integers, from the youngest
# bucket (0) to the oldest (2). Series.map yields NaN for any label that is
# not a key of the mapping.
vehicle_age_map = {
    label: rank
    for rank, label in enumerate(['< 1 Year', '1-2 Year', '> 2 Years'])
}

df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)

In [13]:
# Replace the string categories in these columns with integer codes, fitting
# a separate LabelEncoder on each column.
label_cols = ('Gender', 'Vehicle_Damage')
for col in label_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

In [15]:
# Both code columns are loaded as float64 (e.g. 28.0, 26.0); store them as
# integers instead.
code_cols = ['Region_Code', 'Policy_Sales_Channel']
df[code_cols] = df[code_cols].astype(int)

In [22]:
# Replace Annual_Premium with its PowerTransformer output (default settings).
# The column is passed as a one-column DataFrame and the single output column
# is taken back out as a 1-D array.
# NOTE(review): the transformer is fitted on the combined train + test rows —
# confirm that letting test-set statistics shape the transform is intended.
pt = PowerTransformer()
df['Annual_Premium'] = pt.fit_transform(df[['Annual_Premium']])[:, 0]

In [24]:
# Binary flag: 1 when Age is at most 46, otherwise 0. A vectorised comparison
# replaces the per-row Python lambda and produces the same 0/1 integers.
# NOTE(review): the column is named 'age < 46' but the test is inclusive
# (Age <= 46); both are kept as they were so downstream files do not change —
# confirm which boundary is intended.
df['age < 46'] = (df['Age'] <= 46).astype('int64')

In [25]:
df.head()

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,age < 46
0,1,44,28,0,2,1,0.613041,26,217,1.0,1
1,1,76,3,0,1,0,0.242264,26,183,0.0,0
2,1,47,28,0,2,1,0.498947,26,27,1.0,0
3,1,21,11,1,0,0,-0.031897,152,203,0.0,1
4,0,29,41,1,0,0,-0.095969,152,39,0.0,1


In [26]:
# Split the combined frame back into its train and test parts. Rows that came
# from the test file are the ones whose target is missing.
target_col = 'Response'
mask = df[target_col].isna()

# .copy() gives `train` its own data, so the column assignment below writes to
# an independent frame rather than to a slice of `df` (the chained-assignment
# pattern pandas reports as SettingWithCopyWarning).
train = df.loc[~mask].copy()
# The target became float when NaN rows were present; restore integer labels.
train[target_col] = train[target_col].astype(int)

# Test rows carry no target, so the all-NaN column is removed.
test = df.loc[mask].drop(columns=target_col)

In [27]:
# Persist both preprocessed frames as CSV; index=False keeps the row index out
# of the written files.
train.to_csv(path_or_buf='../data/preprocessed_train1.csv', index=False)
test.to_csv(path_or_buf='../data/preprocessed_test1.csv', index=False)