# Imports

In [136]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML
from IPython.display import Image
from scipy import stats
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Helper functions

In [137]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


# Load data

In [138]:
# Mount Google Drive into the Colab filesystem; every path read or written
# below lives under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [139]:
# Read the df3.csv file from the mounted Drive into the working frame `df4`.
df3_csv_path = '/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/data/df3.csv'
df4 = pd.read_csv(df3_csv_path)

In [140]:
# Preview the first rows of the loaded frame.
df4.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response
0,1,Male,44,28.0,26.0,0,40454.0,217,> 2 Years,1,1
1,2,Male,76,3.0,26.0,0,33536.0,183,1-2 Year,0,0
2,3,Male,47,28.0,26.0,0,38294.0,27,> 2 Years,1,1
3,4,Male,21,11.0,152.0,1,28619.0,203,< 1 Year,0,0
4,5,Female,29,41.0,152.0,1,27496.0,39,< 1 Year,0,0


In [141]:
# Inspect column dtypes to see which features are still non-numeric.
df4.dtypes

id                        int64
gender                   object
age                       int64
region_code             float64
policy_sales_channel    float64
previously_insured        int64
annual_premium          float64
vintage                   int64
vehicle_age              object
vehicle_damage            int64
response                  int64
dtype: object

# Train test split

In [142]:
# Hold out 30% of the rows as `val`. The split is stratified on the target
# column and uses a fixed seed so it is repeatable.
train, val = train_test_split(
    df4,
    test_size=0.3,
    random_state=42,
    stratify=df4['response'],
)

In [143]:
# Persist the held-out split before any rescaling or encoding is applied to it.
test_raw_csv_path = '/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/data/test_raw.csv'
val.to_csv(test_raw_csv_path, index=False)


In [144]:
# From here on `df4` is an independent (deep) copy of the training split, so
# the column assignments below do not write back into `train`.
df4 = train.copy(deep=True)

# Rescaling

In [145]:
import pickle

In [146]:
# Fit one scaler per numeric feature on the training frame (`df4`), overwrite
# the column with its scaled values, and pickle the fitted scaler so the same
# transformation can be reapplied later.
se = StandardScaler()
mms_age = MinMaxScaler()
mms_vintage = MinMaxScaler()

# Min-max scaler
df4['age'] = mms_age.fit_transform(df4[['age']].values)
# `with` guarantees the pickle file is flushed and closed; the previous bare
# open(...) left the handle for the garbage collector to close.
with open('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/features/age_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(mms_age, scaler_file)

df4['vintage'] = mms_vintage.fit_transform(df4[['vintage']].values)
with open('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/features/vintage_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(mms_vintage, scaler_file)

# Standard scaler
df4['annual_premium'] = se.fit_transform(df4[['annual_premium']].values)
with open('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/features/annual_premium_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(se, scaler_file)

In [147]:
# Apply the already-fitted scalers to the validation split: transform only,
# no refitting. 'age' and 'vintage' use the min-max scalers, 'annual_premium'
# uses the standard scaler.
for col_name, fitted_scaler in (
    ('age', mms_age),
    ('vintage', mms_vintage),
    ('annual_premium', se),
):
    val[col_name] = fitted_scaler.transform(val[[col_name]].values)


# Encoding

In [148]:
def _encode_gender(label):
    """Return 1 for 'Female', 0 for 'Male'; any other value is passed through unchanged."""
    if label == 'Female':
        return 1
    if label == 'Male':
        return 0
    return label


df4['gender'] = df4['gender'].apply(_encode_gender)

# vehicle_age ordinal scale: older vehicle -> larger code. Labels that are
# not keys of the dict come out of `.map` as NaN.
vehicle_age_dict = {'< 1 Year':1, '1-2 Year':2, '> 2 Years':3}
df4['vehicle_age'] = df4['vehicle_age'].map(vehicle_age_dict)

In [149]:
# Binary gender encoding for the validation split; values other than
# 'Female' / 'Male' are left as they are.
gender_codes = {'Female': 1, 'Male': 0}
val['gender'] = val['gender'].apply(lambda g: gender_codes.get(g, g))

# vehicle_age ordinal scale: older vehicle -> larger code.
vehicle_age_dict = {'< 1 Year':1, '1-2 Year':2, '> 2 Years':3}
val['vehicle_age'] = val['vehicle_age'].map(vehicle_age_dict)

Para um próximo ciclo, region_code e policy_sales_channel: target encoding ou frequency encoding

In [150]:
# Preview the training frame after rescaling and encoding.
df4.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response
122122,122366,0,0.523077,28.0,26.0,0,0.095128,0.394464,2,1,0
355885,356644,1,0.046154,28.0,26.0,1,-1.622718,0.391003,1,0,0
267531,268103,0,0.046154,22.0,152.0,0,0.024423,0.384083,1,1,0
194775,195185,1,0.138462,24.0,26.0,1,0.039005,0.584775,2,0,0
13806,13836,1,0.107692,28.0,152.0,1,0.162406,0.238754,1,0,0


In [151]:
# Row/column count of the processed training frame.
df4.shape

(266207, 11)

In [152]:
# Preview the validation frame after rescaling and encoding.
val.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,previously_insured,annual_premium,vintage,vehicle_age,vehicle_damage,response
195905,196319,0,0.461538,33.0,124.0,0,0.59727,0.49827,3,1,0
376569,377373,0,0.307692,8.0,124.0,1,-1.622718,0.467128,2,0,0
96494,96687,1,0.692308,41.0,109.0,0,-0.245443,0.816609,2,1,0
302880,303533,0,0.615385,28.0,124.0,0,1.287244,0.446367,2,1,0
255686,256233,0,0.246154,36.0,26.0,0,0.500537,0.217993,2,1,1


In [153]:
# Row/column count of the processed validation frame.
val.shape

(114090, 11)

In [154]:
# Writing the processed frames to disk is currently disabled. Uncomment the
# two lines below to save `df4` as train.csv and `val` as test.csv.
#df4.to_csv('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/data/train.csv',index=False)
#val.to_csv('/content/drive/MyDrive/health_insurance_cross_sell-main/health_insurance_cross_sell/data/test.csv',index=False)