## import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# get prudential & test csv files as a DataFrame
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# preview the data
train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,2,1,D3,10,0.076923,2,1,1,0.641791,0.581818,...,0,0,0,0,0,0,0,0,0,8
1,5,1,A1,26,0.076923,2,3,1,0.059701,0.6,...,0,0,0,0,0,0,0,0,0,4
2,6,1,E1,26,0.076923,2,3,1,0.029851,0.745455,...,0,0,0,0,0,0,0,0,0,8
3,7,1,D4,10,0.487179,2,3,1,0.164179,0.672727,...,0,0,0,0,0,0,0,0,0,8
4,8,1,D2,26,0.230769,2,3,1,0.41791,0.654545,...,0,0,0,0,0,0,0,0,0,8


In [4]:
train.set_index('Id',inplace  = True)
test.set_index('Id',inplace  = True)

In [5]:
datatypes = train.dtypes
missing_vals = train.isnull().sum()

In [6]:
types_missing = pd.concat([datatypes,missing_vals],axis = 1)

types_missing.columns = ['type','missing']

### tidy up object data type

In [8]:
types_missing[types_missing['type'] == 'object']

Unnamed: 0,type,missing
Product_Info_2,object,0


In [10]:
train['Product_Info_2_char'] = train['Product_Info_2'].str[0]
train['Product_Info_2_num'] = train['Product_Info_2'].str[1]

train['Product_Info_2'] = pd.factorize(train['Product_Info_2'])[0]
train['Product_Info_2_char'] = pd.factorize(train['Product_Info_2_char'])[0]
train['Product_Info_2_num'] = pd.factorize(train['Product_Info_2_num'])[0]

In [11]:
test['Product_Info_2_char'] = test['Product_Info_2'].str[0]
test['Product_Info_2_num'] = test['Product_Info_2'].str[1]

test['Product_Info_2'] = pd.factorize(test['Product_Info_2'])[0]
test['Product_Info_2_char'] = pd.factorize(test['Product_Info_2_char'])[0]
test['Product_Info_2_num'] = pd.factorize(test['Product_Info_2_num'])[0]

### look at missing data

In [12]:
types_missing[types_missing['missing'] > 0].sort_values(by = 'missing')

Unnamed: 0,type,missing
Employment_Info_1,float64,19
Employment_Info_4,float64,6779
Medical_History_1,float64,8889
Employment_Info_6,float64,10854
Family_Hist_4,float64,19184
Insurance_History_5,float64,25396
Family_Hist_2,float64,28656
Family_Hist_3,float64,34241
Family_Hist_5,float64,41811
Medical_History_15,float64,44596


In [13]:
fifty_pc = train.shape[0]//2

todrop = types_missing[types_missing['missing'] > fifty_pc].index.values.tolist()

In [14]:
valid_types = types_missing.drop(todrop,axis = 0)

In [17]:
valid_types.head()

Unnamed: 0,type,missing
Product_Info_1,int64,0
Product_Info_2,object,0
Product_Info_3,int64,0
Product_Info_4,float64,0
Product_Info_5,int64,0


## Missing Values and scale

In [33]:
from sklearn.preprocessing import Imputer,StandardScaler

imputer = Imputer()
scaler = StandardScaler()

#impute first to get rid of values

In [36]:
X = imputer.fit_transform(train[valid_types.index.values.tolist()[:-1]])
X = scaler.fit_transform(X)
X

array([[-0.16452547, -1.13405144, -2.84173109, ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -0.85241674,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -0.57078205,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       ...,
       [-0.16452547, -0.57078205,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -0.00751266, -2.84173109, ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547,  0.27412203,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598]])

In [40]:
y = train['Response']

### train the model

In [41]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33,random_state = 42)

In [119]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(100,100,100,100),max_iter=500,alpha=0.01
                    ,solver='adam',learning_rate = 'adaptive',verbose=True,random_state=21,tol=0.000000000001
                   ,early_stopping = True,validation_fraction = 0.1)

In [120]:
clf.fit(X_train,y_train)

Iteration 1, loss = 1.56883641
Validation score: 0.468711
Iteration 2, loss = 1.39562201
Validation score: 0.490324
Iteration 3, loss = 1.34309990
Validation score: 0.487811
Iteration 4, loss = 1.31351599
Validation score: 0.492335
Iteration 5, loss = 1.28700079
Validation score: 0.485046
Iteration 6, loss = 1.26329745
Validation score: 0.499623
Iteration 7, loss = 1.24087224
Validation score: 0.489319
Iteration 8, loss = 1.22362708
Validation score: 0.477256
Iteration 9, loss = 1.20074135
Validation score: 0.490073
Validation score did not improve more than tol=0.000000 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 100), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=21, shuffle=True,
       solver='adam', tol=1e-12, validation_fraction=0.1, verbose=True,
       warm_start=False)

In [115]:
y_pred = clf.predict(X_test)

In [116]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

accuracy_score(y_test,y_pred)

0.4113084302918963

In [104]:
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          1       0.38      0.27      0.32      2071
          2       0.38      0.25      0.30      2146
          3       0.24      0.05      0.08       332
          4       0.24      0.17      0.20       451
          5       0.49      0.32      0.39      1816
          6       0.41      0.43      0.42      3715
          7       0.39      0.37      0.38      2580
          8       0.64      0.87      0.74      6485

avg / total       0.48      0.51      0.48     19596



In [105]:
y_train_pred = clf.predict(X_train)

print(classification_report(y_train_pred,y_train))

             precision    recall  f1-score   support

          1       0.38      0.51      0.43      3078
          2       0.33      0.52      0.41      2842
          3       0.09      0.45      0.16       143
          4       0.23      0.37      0.28       599
          5       0.38      0.56      0.45      2433
          6       0.51      0.49      0.50      7837
          7       0.45      0.49      0.47      5042
          8       0.90      0.66      0.76     17811

avg / total       0.64      0.57      0.59     39785



## submission

In [106]:
test_X = imputer.transform(test[valid_types.index.values.tolist()[:-1]])
test_X = scaler.transform(test_X)
test_X

array([[-0.16452547, -1.13405144,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -0.85241674,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -1.13405144,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       ...,
       [-0.16452547, -0.85241674,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547,  0.27412203,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598],
       [-0.16452547, -1.13405144,  0.312319  , ..., -0.09252145,
        -0.14251185, -0.24007598]])

In [107]:
test_predictions = clf.predict(test_X)

In [108]:
submission = pd.DataFrame(data = test_predictions, index = test.index, columns = ['Response'])

submission['Response'].value_counts(normalize = True)

8    0.445434
6    0.197015
7    0.128712
1    0.078877
2    0.067898
5    0.063243
4    0.015178
3    0.003643
Name: Response, dtype: float64

In [99]:
submission.to_csv('~\Desktop\Prudential_submission.csv')