## import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# get prudential & test csv files as a DataFrame
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# preview the data
train.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,2,1,D3,10,0.076923,2,1,1,0.641791,0.581818,...,0,0,0,0,0,0,0,0,0,8
1,5,1,A1,26,0.076923,2,3,1,0.059701,0.6,...,0,0,0,0,0,0,0,0,0,4
2,6,1,E1,26,0.076923,2,3,1,0.029851,0.745455,...,0,0,0,0,0,0,0,0,0,8
3,7,1,D4,10,0.487179,2,3,1,0.164179,0.672727,...,0,0,0,0,0,0,0,0,0,8
4,8,1,D2,26,0.230769,2,3,1,0.41791,0.654545,...,0,0,0,0,0,0,0,0,0,8


In [3]:
train.set_index('Id',inplace  = True)
test.set_index('Id',inplace  = True)

In [4]:
datatypes = train.dtypes
missing_vals = train.isnull().sum()

In [5]:
types_missing = pd.concat([datatypes,missing_vals],axis = 1)

types_missing.columns = ['type','missing']

### tidy up object data type

In [6]:
types_missing[types_missing['type'] == 'object']

Unnamed: 0,type,missing
Product_Info_2,object,0


In [7]:
train['Product_Info_2_char'] = train['Product_Info_2'].str[0]
train['Product_Info_2_num'] = train['Product_Info_2'].str[1]

train['Product_Info_2'] = pd.factorize(train['Product_Info_2'])[0]
train['Product_Info_2_char'] = pd.factorize(train['Product_Info_2_char'])[0]
train['Product_Info_2_num'] = pd.factorize(train['Product_Info_2_num'])[0]

In [8]:
test['Product_Info_2_char'] = test['Product_Info_2'].str[0]
test['Product_Info_2_num'] = test['Product_Info_2'].str[1]

test['Product_Info_2'] = pd.factorize(test['Product_Info_2'])[0]
test['Product_Info_2_char'] = pd.factorize(test['Product_Info_2_char'])[0]
test['Product_Info_2_num'] = pd.factorize(test['Product_Info_2_num'])[0]

### look at missing data

In [9]:
types_missing[types_missing['missing'] > 0].sort_values(by = 'missing')

Unnamed: 0,type,missing
Employment_Info_1,float64,19
Employment_Info_4,float64,6779
Medical_History_1,float64,8889
Employment_Info_6,float64,10854
Family_Hist_4,float64,19184
Insurance_History_5,float64,25396
Family_Hist_2,float64,28656
Family_Hist_3,float64,34241
Family_Hist_5,float64,41811
Medical_History_15,float64,44596


In [10]:
fifty_pc = train.shape[0]//2

todrop = types_missing[types_missing['missing'] > fifty_pc].index.values.tolist()

In [11]:
valid_types = types_missing.drop(todrop,axis = 0)

### impute medians for floats

In [12]:
from sklearn.preprocessing import Imputer,StandardScaler
from sklearn.pipeline import Pipeline,FeatureUnion

In [13]:
train.dtypes.value_counts()

int64      111
float64     18
dtype: int64

In [14]:
floats = valid_types[valid_types['type'] == 'float64'].index.values.tolist()

In [15]:
pipeline = Pipeline([
    ('imputer',Imputer(strategy = 'median'))
    ,('scaler',StandardScaler())
])

In [16]:
floats_train = pipeline.fit(train[floats])

In [17]:
floats_train

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])

### deal with floats

In [18]:
ints = valid_types[valid_types['type'] == 'int64'].index.values.tolist()

In [19]:
ints.remove('Response')

In [20]:
ints_train = pipeline.named_steps['imputer'].fit(train[ints])

In [21]:
ints_train

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

### feature union

In [22]:
features = FeatureUnion([
    ('floats',floats_train)
    ,('ints',ints_train)
])

In [23]:
feature_processing = Pipeline([('feats', features)])
feature_processing.fit(train[floats + ints])

Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('floats', Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('ints', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0))],
       transformer_weights=None))])

## build a model

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score,GridSearchCV, train_test_split

In [33]:
X_train,X_test,y_train,y_test = train_test_split(train.loc[:,train.columns != 'Response']
                                                 ,train.loc[:,train.columns == 'Response']
                                                ,test_size = 0.33)

In [34]:
model = Pipeline([
    
    ('feats',features)
    ,('clf',SVC())
])

In [42]:
param_grid={'estimator__clf__C':[0.05,1]}

model_CV = GridSearchCV(model,param_grid,cv=10)

In [43]:
model_CV.fit(X_train,y_train.values.ravel())

ValueError: Invalid parameter estimator for estimator Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=1,
       transformer_list=[('floats', Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('ints', Imputer(axis=0,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [30]:
model_CV.refit

test_pred = model_CV.predict(X_test)
accuracy_score(y_test,test_pred)

0.5076035925699123

## Bring in test data and submit

In [None]:
predictions = model_CV.predict(test)

In [None]:
submission = pd.DataFrame(data = predictions,index=test.index,columns = ['Response'])

In [None]:
submission['Response'].value_counts()

In [None]:
submission.to_csv('~\Desktop\Prudential_submission.csv')