# Demo: classification for Homesite customer quote-conversion analysis

`
The goal is to predict how likely a customer is to purchase the insurance quote offered by Homesite.
`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gc

In [None]:
# Text-mining APIs from Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,Dropout,LSTM,Bidirectional,Embedding,PReLU,MaxPooling1D,Conv1D
from keras.layers import GlobalMaxPool1D,Dense,Flatten
from keras.layers import Conv1D, MaxPooling1D,concatenate
from keras.models import Model
from keras.optimizers import Adam
import keras.backend as K
from keras_tqdm import TQDMNotebookCallback

In [None]:
# Read the training data from train.csv (path relative to the working
# directory) and report its (rows, columns) shape. Loading the test set is
# currently disabled.
train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')
print(train.shape)

In [None]:
# Preview the first rows of the freshly loaded frame.
train.head()

In [None]:
# Show the dtype pandas inferred for every column.
train.dtypes

In [None]:
# Fraction of missing values per column, largest first.
missing_counts = train.isnull().sum()
null_data = (missing_counts / len(train)).sort_values(ascending=False)
null_data

In [None]:
# Narrow the missing-value report down to the columns that actually contain
# missing data; these are the ones that need to be filled.
has_missing = null_data > 0
cols_with_missing = null_data[has_missing]
cols_with_missing.index

In [None]:
# Impute the columns that contain missing values with sentinel values:
#   -1        for the columns filled with a number,
#   'unknown' for the columns filled with a category label.
# Every column is filled from itself. In particular PersonalField7 must be
# filled from PersonalField7 (not PropertyField7), otherwise its real values
# are overwritten by another column's data.
sentinel_fill_cols = ['PropertyField29', 'PersonalField84']
unknown_fill_cols = [
    'PropertyField38',
    'PropertyField36',
    'PersonalField7',
    'PropertyField3',
    'PropertyField34',
    'PropertyField32',
    'PropertyField4',
]

for col in sentinel_fill_cols:
    train[col] = train[col].fillna(-1)

for col in unknown_fill_cols:
    train[col] = train[col].fillna('unknown')


In [None]:
# Sum of QuoteConversion_Flag divided by the row count; for a 0/1 flag this is
# the share of converted quotes (a quick look at class balance).
train['QuoteConversion_Flag'].sum()/len(train)

In [None]:
# Number of rows whose QuoteConversion_Flag equals 1.
len(train[train['QuoteConversion_Flag'] == 1])

In [None]:
# Parse the quote date once, store it back as a datetime column, and derive
# calendar features from it (dayofweek: Monday=0 ... Sunday=6).
quote_date = pd.to_datetime(train['Original_Quote_Date'])
train['Original_Quote_Date'] = quote_date
for part in ('year', 'month', 'day', 'dayofweek'):
    train[part] = getattr(quote_date.dt, part)

In [None]:
# Preview the frame with the year/month/day/dayofweek columns added.
train.head()

In [None]:
# The raw date column is no longer needed once its parts have been extracted.
train = train.drop(columns=['Original_Quote_Date'])

In [None]:
# Print name, dtype and number of distinct values for every column.
for name, series in train.items():
    print(name, series.dtypes, series.nunique())

In [None]:
# Bucket the columns by dtype in a single pass:
#   cat_cols - object dtype (to be one-hot encoded)
#   num_cols - any float dtype
#   others   - everything else
cat_cols, num_cols, others = [], [], []
for name, dtype in train.dtypes.items():
    if dtype == 'object':
        cat_cols.append(name)
    elif 'float' in str(dtype):
        num_cols.append(name)
    else:
        others.append(name)
print(len(cat_cols), len(num_cols), len(others), len(train.columns))

In [None]:
# List the object-dtype columns selected for encoding.
cat_cols

In [None]:
# One-hot encode every categorical column in one call. The untouched columns
# keep their order, and the indicator columns are appended after them, named
# "<column>_<value>" and grouped in cat_cols order.
train = pd.get_dummies(train, columns=cat_cols)

In [None]:
# Check the frame's size and contents after one-hot encoding.
print(train.shape)
train.head()

In [None]:
# Remove QuoteNumber so it is not passed to the model as a feature.
train = train.drop(columns=['QuoteNumber'])

In [None]:
# Hold out 20% of the rows for validation; shuffled with a fixed seed so the
# split is reproducible.
train, valid = train_test_split(
    train,
    test_size=0.2,
    random_state=788,
    shuffle=True,
)

In [None]:
# Sizes of the training and validation partitions.
train.shape, valid.shape

In [None]:
# Separate the target column from the features for both partitions.
target_col = 'QuoteConversion_Flag'
train_X, train_y = train.drop(columns=[target_col]), train[target_col]
valid_X, valid_y = valid.drop(columns=[target_col]), valid[target_col]

In [None]:
# Preview the training features (target column removed).
train_X.head()

In [None]:
# Benchmark model: a depth-limited random forest with a fixed seed, with
# verbose progress output during fitting.
rf_params = dict(n_estimators=200, max_depth=10, random_state=2019, verbose=2)
model = RandomForestClassifier(**rf_params)
model.fit(train_X, train_y)

In [None]:
# Predict class labels for the validation features.
pred_y = model.predict(valid_X)

In [None]:
# First 20 predicted labels.
pred_y[:20]

In [None]:
# First 20 true labels, for a side-by-side look against the predictions.
valid_y[:20].values

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score of the predicted labels against the true validation labels.
f1_score(valid_y,pred_y)