## import libraries

In [1]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# get prudential & test csv files as a DataFrame
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# preview the data
train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,2,1,D3,10,0.076923,2,1,1,0.641791,0.581818,...,0,0,0,0,0,0,0,0,0,8
1,5,1,A1,26,0.076923,2,3,1,0.059701,0.6,...,0,0,0,0,0,0,0,0,0,4
2,6,1,E1,26,0.076923,2,3,1,0.029851,0.745455,...,0,0,0,0,0,0,0,0,0,8
3,7,1,D4,10,0.487179,2,3,1,0.164179,0.672727,...,0,0,0,0,0,0,0,0,0,8
4,8,1,D2,26,0.230769,2,3,1,0.41791,0.654545,...,0,0,0,0,0,0,0,0,0,8


In [3]:
#append the two datasets

all_data = train.append(test,sort = True)

## TIDY UP THE RESPONSE VARIABLE

In [4]:
#change the data type of Response

all_data['Response'].fillna(-1,inplace = True)

all_data['Response'] = all_data['Response'].astype(int)

In [5]:
all_data['Response'].isnull().sum()

0

In [6]:
#summary of the response variable

all_data['Response'].value_counts(),all_data['Response'].isnull().sum()

(-1    19765
  8    19489
  6    11233
  7     8027
  2     6552
  1     6207
  5     5432
  4     1428
  3     1013
 Name: Response, dtype: int64, 0)

In [7]:
#remove the ID column

all_data.set_index('Id',inplace = True)

## WHAT DATA TYPES DO WE HAVE?

In [14]:
data_types = all_data.dtypes

floats = data_types[data_types == 'float64'].index.values.tolist()
ints = data_types[data_types == 'int64'].index.values.tolist()

In [15]:
data_types.value_counts()

int64      110
float64     18
int32        1
dtype: int64

In [16]:
data_types[data_types == 'object'].index.values.tolist()

[]

### FIX THE OBJECT DATA TYPE

In [11]:
#parse out the product info field

all_data['Product_Info_2_char'] = all_data['Product_Info_2'].str[0]
all_data['Product_Info_2_num'] = all_data['Product_Info_2'].str[1]

In [12]:
all_data['Product_Info_2_char'] = pd.factorize(all_data['Product_Info_2_char'])[0]
all_data['Product_Info_2_num'] = pd.factorize(all_data['Product_Info_2_num'])[0]
all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]

In [13]:
all_data['Product_Info_2'].value_counts()

0     18753
3     14071
5      9140
7      8611
4      8344
2      3711
1      3219
6      3072
9      2733
8      1823
10     1564
14     1446
11     1009
15      437
13      377
12      291
17      263
16      197
18       85
Name: Product_Info_2, dtype: int64

## CARRY ON WITH PIPELINE

In [22]:
# perform test and train split. IGNORE THE TEST DATA FOR NOW AS THIS IS FOR THE COMPETITION

col = 'Response'

new_train = all_data[all_data[col] > 0]

new_X = new_train.loc[:,all_data.columns != col].copy()
new_y = new_train.loc[:,all_data.columns == col].copy()

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(new_X,new_y,test_size = 0.25,random_state = 42)

In [29]:
X_train.dtypes.value_counts()

int64      110
float64     18
dtype: int64

In [99]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

pipeline =  Pipeline([
                ('imputer', Imputer(strategy='median')),
                ('standard', StandardScaler()),
                ('classifier', GradientBoostingClassifier(random_state = 42))
            ])

In [100]:
pipeline.fit(X_train,y_train.values.ravel())

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('standard', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance',...        presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False))])

In [101]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'imputer', 'standard', 'classifier', 'imputer__axis', 'imputer__copy', 'imputer__missing_values', 'imputer__strategy', 'imputer__verbose', 'standard__copy', 'standard__with_mean', 'standard__with_std', 'classifier__criterion', 'classifier__init', 'classifier__learning_rate', 'classifier__loss', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__presort', 'classifier__random_state', 'classifier__subsample', 'classifier__verbose', 'classifier__warm_start'])

In [102]:
from sklearn.model_selection import GridSearchCV

params = dict(classifier__max_depth = [3,5,7]
             ,classifier__min_samples_leaf = [3,5])

clf = GridSearchCV(pipeline,params,cv = 5)

In [103]:
clf.fit(X_train,y_train.values.ravel())

KeyboardInterrupt: 

In [44]:
clf.best_params_

{'classifier__max_depth': 7, 'classifier__min_samples_leaf': 7}

In [57]:
clf.refit

y_test_preds = clf.predict(X_test)

### assess the performance of the model

In [62]:
from sklearn.metrics import accuracy_score,confusion_matrix,cohen_kappa_score

accuracy_score(y_test,y_test_preds)

0.49090664151960123

In [63]:
print(confusion_matrix(y_test,y_test_preds))

[[ 153  212   13   29  101  418  119  563]
 [  87  294    9   21  131  514   91  501]
 [  36   16   46   67   25   28    2   32]
 [  22    5   11  100    1   37   10  173]
 [  24   89    0    0  514  479   20  263]
 [  39   74    0    5   43 1301  135 1205]
 [  11    2    0    1   11  681  320  942]
 [   1    0    0    0   11  209   39 4560]]


In [64]:
cohen_kappa_score(y_test,y_test_preds)

0.3239197116491541

## see how this does on submission

In [67]:
test.set_index('Id',inplace = True)

In [68]:
test.head()

Unnamed: 0_level_0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,D3,26,0.487179,2,3,1,0.61194,0.781818,0.338912,...,0,0,0,0,0,0,0,0,0,0
3,1,A2,26,0.076923,2,3,1,0.626866,0.727273,0.311715,...,0,0,0,0,0,0,0,0,0,0
4,1,D3,26,0.144667,2,3,1,0.58209,0.709091,0.320084,...,0,0,0,0,0,0,0,0,0,0
9,1,A1,26,0.151709,2,1,1,0.522388,0.654545,0.267782,...,0,0,0,0,0,0,0,0,1,1
12,1,A1,26,0.076923,2,3,1,0.298507,0.672727,0.246862,...,0,0,0,0,0,0,0,0,0,0


In [70]:
#just need to clean the Product_Info_2 field

test['Product_Info_2_char'] = test['Product_Info_2'].str[0]
test['Product_Info_2_num'] = test['Product_Info_2'].str[1]

In [71]:
test['Product_Info_2_char'] = pd.factorize(test['Product_Info_2_char'])[0]
test['Product_Info_2_num'] = pd.factorize(test['Product_Info_2_num'])[0]
test['Product_Info_2'] = pd.factorize(test['Product_Info_2'])[0]

In [73]:
submission_vals = clf.predict(test)

In [92]:
submissions = pd.DataFrame(data = submission_vals,index=test.index,columns = ['Response'])

In [94]:
submissions['Response'].value_counts()

2    15914
1     3615
5      136
7       54
6       32
3       14
Name: Response, dtype: int64

In [97]:
submissions.to_csv('~\Desktop\Prudential_submission.csv')