## 1. Big picture

Predict the likelihood that a driver will file a claim.

## 2. Get the data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
# Load the sample submission file (not referenced again in the cells shown in this notebook).
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
# Load the labelled training set; it is kept as-is and copied into `df` before any cleaning.
train = pd.read_csv('train.csv')

In [4]:
# Load the test set that the final predictions are made for.
test = pd.read_csv('test.csv')

In [5]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [6]:
# List every column name in the training set.
train.columns

Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],


## 3. Explore the data

In [7]:
# Work on a copy so the raw `train` frame is not modified by the cleaning steps below.
df = train.copy()

### Differing columns

In [8]:
# Columns that exist in the test set but not in the training set (original column order kept).
test.columns[~test.columns.isin(train.columns)].tolist()

[]

In [9]:
# Columns that exist in the working training frame but not in the test set.
df.columns[~df.columns.isin(test.columns)].tolist()

['target']

In [10]:
#train.hist(bins=20,figsize=(20,20))

In [11]:
#df.corr()['target'].sort_values(ascending=False)

From description: "Values of -1 indicate that the feature was missing from the observation."

In [12]:
# The data description encodes "missing" as -1; turn that sentinel into a real NaN.
df = df.replace({-1: np.nan})

In [13]:
# Same -1 -> NaN conversion for the test set so both frames mark missing values the same way.
test = test.replace({-1: np.nan})

In [14]:
# Missing-value count per test column, largest first; columns without gaps are left out.
test_missing_counts = test.isnull().sum()
test_missing_counts[test_missing_counts > 0].sort_values(ascending=False)

ps_car_03_cat    616911
ps_car_05_cat    400359
ps_reg_03        161684
ps_car_14         63805
ps_car_07_cat     17331
ps_ind_05_cat      8710
ps_car_09_cat       877
ps_ind_02_cat       307
ps_car_01_cat       160
ps_ind_04_cat       145
ps_car_02_cat         5
ps_car_11             1
dtype: int64

In [15]:
# Missing-value count per training column, largest first; columns without gaps are left out.
train_missing_counts = df.isnull().sum()
train_missing_counts[train_missing_counts > 0].sort_values(ascending=False)

ps_car_03_cat    411231
ps_car_05_cat    266551
ps_reg_03        107772
ps_car_14         42620
ps_car_07_cat     11489
ps_ind_05_cat      5809
ps_car_09_cat       569
ps_ind_02_cat       216
ps_car_01_cat       107
ps_ind_04_cat        83
ps_car_11             5
ps_car_02_cat         5
ps_car_12             1
dtype: int64

## 4. Prepare the data

#### Missing data 1: drop columns with 20%+ missing data

In [16]:
# Columns where more than 20% of the training rows are missing.
missing_per_column = df.isnull().sum()
missing_per_column[missing_per_column > .2 * len(df)]

ps_car_03_cat    411231
ps_car_05_cat    266551
dtype: int64

In [17]:
# Remove the two columns that exceeded the 20% missing threshold, from both frames,
# so the training and test feature sets stay aligned.
mostly_missing_columns = ['ps_car_03_cat', 'ps_car_05_cat']
df = df.drop(columns=mostly_missing_columns)
test = test.drop(columns=mostly_missing_columns)

#### Missing data 2: impute median values

In [18]:
# Fill remaining gaps with per-column medians learned from the training frame only;
# the same training medians are applied to the test set.
# NOTE(review): this also imputes the `_cat` columns with a median of their codes - confirm
# that is intended rather than a most-frequent-value fill.
training_medians = df.median()
df = df.fillna(training_medians)
test = test.fillna(training_medians)

#### Deal with non-numeric features

In [19]:
# Distinct column dtypes in the training frame; if only numeric dtypes appear, no encoding step is needed.
set(df.dtypes)

{dtype('int64'), dtype('float64')}

#### Separate ids, features, and labels

In [20]:
# Split the training frame by column name: row identifier, prediction target, and
# everything else as model features (`id` and `target` are the first two columns).
ids = df['id']
labels = df['target']
features = df.drop(columns=['id', 'target'])

In [21]:
# The test frame has no target column: keep the identifier and treat the rest as features.
test_ids = test['id']
test_features = test.drop(columns=['id'])

## 5. Short-list promising models

In [22]:
"""
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
"""

'\nfrom sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.neighbors import KNeighborsClassifier\n'

In [23]:
"""
#Ensemble
rfc = RandomForestClassifier()
ada_boost = AdaBoostClassifier()
bag = BaggingClassifier()
extra_trees = ExtraTreesClassifier()
grad_boost = GradientBoostingClassifier(n_estimators=1)

#SVM
svc = SVC()

#Neigbors
knn = KNeighborsClassifier()

classifiers = [rfc,ada_boost,bag,extra_trees,grad_boost,svc,knn]
classifiers = [grad_boost]
"""

'\n#Ensemble\nrfc = RandomForestClassifier()\nada_boost = AdaBoostClassifier()\nbag = BaggingClassifier()\nextra_trees = ExtraTreesClassifier()\ngrad_boost = GradientBoostingClassifier(n_estimators=1)\n\n#SVM\nsvc = SVC()\n\n#Neigbors\nknn = KNeighborsClassifier()\n\nclassifiers = [rfc,ada_boost,bag,extra_trees,grad_boost,svc,knn]\nclassifiers = [grad_boost]\n'

In [24]:
def run_estimators(prepared_features,labels,cross_validation=5,estimators=None):
    """Fit each candidate estimator, cross-validate it, and print a comparison.

    For every estimator the function fits on the full data, computes the mean
    `cross_val_score` over `cross_validation` folds, and measures the score on
    the data it was fitted on (training score). The estimator with the highest
    mean CV score is reported in a final summary. Results are printed only;
    nothing is returned.

    Parameters
    ----------
    prepared_features : array-like
        Feature matrix passed to `fit`, `score` and `cross_val_score`.
    labels : array-like
        Target values aligned with `prepared_features`.
    cross_validation : int, default 5
        Forwarded as `cv` to `cross_val_score`.
    estimators : iterable of estimators, optional
        Candidates to evaluate. When omitted, the notebook-level `classifiers`
        list is used, which must then be defined before the call.
    """
    if estimators is None:
        # Backward-compatible default: fall back to the notebook-level list.
        estimators = classifiers

    highest_score=0
    best_classifier=""
    best_classifier_training_score=0

    for clf in estimators:
        clf.fit(prepared_features,labels)

        # cross_val_score works on clones, so the fitted `clf` above is left untouched.
        scores = cross_val_score(clf, prepared_features, labels, cv=cross_validation)

        training_score=clf.score(prepared_features,labels)
        cv_score_mean = scores.mean()

        # Model selection is driven by the cross-validated score, not the training score.
        if cv_score_mean>highest_score:
            highest_score=cv_score_mean
            best_classifier=clf
            best_classifier_training_score=training_score

        print("\n{}:\n\n\tTraining Score: {} \n\tCV Scores: {}".format(clf,training_score,cv_score_mean))

    # Argument order matches the labels: training score first, then the CV score.
    print('\n\n----------------\nBest estimator:{}\n\n\tTraining Score: {} \n\tCV Scores: {}'.format(best_classifier,best_classifier_training_score,highest_score))

In [25]:
#run_estimators(features,labels,cross_validation=3)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
# Baseline gradient-boosting model. `random_state` is fixed (same seed as the train/test
# split later in the notebook) so repeated runs build the same trees; `verbose=5` prints
# per-iteration training progress.
grad = GradientBoostingClassifier(verbose=5,n_estimators=100,random_state=42)

In [None]:


# Train the baseline model on every training row (no hold-out set is kept here).
grad.fit(features,labels)

In [None]:
# Class-probability matrix for the test rows: one column per class in `grad.classes_`.
pred = grad.predict_proba(test_features)

# Keep the second column only - presumably P(target == 1); confirm against `grad.classes_`.
grad_pred = pred[:,1]

In [None]:
import xgboost as xgb

In [None]:
"""
xg = xgb.XGBClassifier()
xg.fit(features,labels)
xg.score(features,labels)
"""

In [None]:
# XGBoost reads the learning task from the `objective` key. The earlier `obj` key is not a
# booster parameter, so it was ignored and the booster trained with its default objective
# (which is why negative predictions appeared). `binary:logistic` fits the binary `target`
# label and makes `predict` return a probability in [0, 1]; its default evaluation metric is
# one that should be minimised, which is consistent with `maximize=False` in the training call.
params = {'objective':'binary:logistic'}

In [None]:
# Hold out 20% of the training rows as a validation set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features.values,labels.values, test_size=0.2, random_state=42)

# Wrap both partitions in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train,label=y_train)
dtest = xgb.DMatrix(X_test,label=y_test)

# Datasets whose metric is reported during training, each paired with its display name.
watchlist = [(dtrain,'dtrain'),(dtest,'dtest')]

In [None]:
# Train for at most 200 boosting rounds, reporting the watchlist metric every 10 rounds.
# Training stops early after 40 rounds without improvement; `maximize=False` means a lower
# metric value counts as better.
# NOTE(review): early stopping is expected to track the last watchlist entry (`dtest`) - confirm
# against the installed xgboost version.
xgb_clf = xgb.train(params=params,
                 dtrain=dtrain,
                 num_boost_round=200,
                 evals=watchlist,
                 early_stopping_rounds=40,
                 maximize=False,
                verbose_eval=10)

In [None]:
# The booster predicts on DMatrix input, so wrap the test feature values (no labels available).
d_test_features = xgb.DMatrix(test_features.values)

In [None]:
# Predictions of the XGBoost booster for the test rows (this rebinds `pred` from the earlier cell).
pred = xgb_clf.predict(d_test_features)

# NOTE(review): only the lower bound is clipped, and `pred` is not used by the submission cell
# shown below (which writes `grad_pred`) - confirm whether these predictions are still needed.
pred = pred.clip(0) #remove values less than 0 and replace them with 0

## 6. Fine-tune the system

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [35]:


#rand_search.fit(passengers_prepared,passengers_labels)

In [38]:
# Exhaustive grid for GridSearchCV: 3 depths x 4 feature counts x 2 split thresholds = 24 candidates.
param_grid = [
    dict(
        max_depth=[1, 3, 10],
        max_features=[2, 4, 6, 8],
        min_samples_split=[2, 4],
    )
]

In [39]:
# Candidate values that RandomizedSearchCV samples from for each hyper-parameter.
param_dist = dict(
    max_depth=[3, 5, 10],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
    min_samples_split=[2, 50, 100],
)

In [40]:
# Randomized hyper-parameter search around `grad` with 5-fold CV. `random_state` pins which
# candidates are sampled from `param_dist`, so a re-run evaluates the same settings.
# NOTE(review): no `scoring` is given, so the estimator's default score is used - confirm that
# metric suits a task that submits probabilities.
rand_search = RandomizedSearchCV(grad,param_dist,cv=5,verbose=3,random_state=42)

In [41]:
# Run the randomized search on the full training data. The recorded run reports
# 10 candidates x 5 folds = 50 fits and was interrupted manually (KeyboardInterrupt).
rand_search.fit(features,labels)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] min_samples_split=2, max_depth=3, learning_rate=0.3 .............
      Iter       Train Loss   Remaining Time 
         1           0.3108            2.43m
         2           0.3092            2.52m
         3           0.3083            2.47m
         4           0.3076            2.46m
         5           0.3070            2.41m
         6           0.3065            2.41m
         7           0.3061            2.32m
         8           0.3058            2.32m
         9           0.3055            2.27m
        10           0.3052            2.26m
        11           0.3050            2.24m
        12           0.3048            2.23m
        13           0.3047            2.21m
        14           0.3045            2.18m
        15           0.3043            2.13m
        16           0.3041            2.10m


KeyboardInterrupt: 

In [None]:
# Exhaustive search over `param_grid` with 5-fold CV, built around `grad`, the
# GradientBoostingClassifier instantiated earlier in the notebook. The name `grad_boost`
# only appears inside a commented-out cell, so using it here raised a NameError on a fresh kernel.
grid_search = GridSearchCV(grad,param_grid,cv=5,verbose=3)

In [None]:
# Fit every parameter combination in the grid on the full training data.
grid_search.fit(features,labels)

In [None]:
# Parameter combination the grid search selected as best.
grid_search.best_params_

In [None]:
# Cross-validated score achieved by the best parameter combination.
grid_search.best_score_

In [None]:
# Keep the estimator configured with the best parameters found by the grid search.
# NOTE(review): `best_grad` is not used by the submission cell shown below - confirm whether it should be.
best_grad = grid_search.best_estimator_

## 7. Create output csv

In [None]:
# Assemble the submission in the id/target layout and write it without the index column.
# NOTE(review): the predictions come from `grad_pred` (the base `grad` model) while the file
# name mentions a randomized search - confirm this is the intended model.
submission_columns = {'id': test_ids, 'target': grad_pred}
sub = pd.DataFrame(submission_columns)

sub.to_csv('Grad Boost 3 - Randomized search.csv', index=False)