### Input data

In [1]:
import pandas as pd
import numpy as np

#### Set target, id and features

In [30]:
len(test)

892816

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train_id=train['id']
train_target = train['target']
train = train.drop(['id','target'],axis=1)

In [4]:
test_id=test['id']
test = test.drop('id',axis=1)

### Deal with missing data

#### Replace -1 values with nan
From description: "Values of -1 indicate that the feature was missing from the observation."

In [7]:
train = train.replace(to_replace=-1,value=np.nan)
test = test.replace(to_replace=-1,value=np.nan)

#### Missing data 1: drop columns with 20%+ missing data

In [8]:
#Columns whereGreater than 20% missing
incomplete_columns = list(train.isnull().sum()[train.isnull().sum()>len(train)*.2].index)

train = train.drop(incomplete_columns,axis=1)
test = test.drop(incomplete_columns, axis=1)

#### Missing data 2: impute median values

In [9]:
train = train.fillna(train.median())
test = test.fillna(test.median())

### Convert categorical features to binary values

In [10]:
cat_feat = [col for col in train.columns if col.endswith('cat')]

In [11]:
for df in [train,test]:
    for column in cat_feat:
        dummies = pd.get_dummies(df[column],drop_first=True)
        df = pd.concat([df,dummies],axis=1)
        df = df.drop([column],axis=1)

### Scaling data

In [51]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()   #Initalize scaler estimator

scaler.fit(train) #Remember to only fit scaler to training datab

StandardScaler(copy=True, with_mean=True, with_std=True)

In [52]:
train = pd.DataFrame(scaler.transform(train),columns=train.columns)
test = pd.DataFrame(scaler.transform(test),columns=test.columns)

### PCA

In [59]:
from sklearn.decomposition import PCA
pca = PCA(n_components=.95)

pca.fit(train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [60]:
pca.n_components_

46

In [58]:
train.shape

(595212, 55)

In [63]:
pca_train = pca.transform(train)

Not going with PCA at this time

### Machine Learning

In [12]:
#from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
#X_train, X_valid, y_train, y_valid = train_test_split(train, train_target, test_size=0.2, random_state=42)

In [92]:
grad = GradientBoostingClassifier(verbose=2,n_estimators=500,max_depth=6,max_features=5,
                                  min_samples_leaf=50,min_samples_split=70)

In [93]:
grad.fit(train,train_target)

      Iter       Train Loss   Remaining Time 
         1           0.3118           13.70m
         2           0.3111           12.21m
         3           0.3104           13.34m
         4           0.3097           13.58m
         5           0.3092           13.29m
         6           0.3086           13.05m
         7           0.3082           13.10m
         8           0.3077           13.31m
         9           0.3072           13.37m
        10           0.3068           13.42m
        11           0.3064           13.21m
        12           0.3061           13.13m
        13           0.3057           12.96m
        14           0.3055           12.74m
        15           0.3052           12.57m
        16           0.3050           12.39m
        17           0.3048           12.22m
        18           0.3046           12.17m
        19           0.3044           12.00m
        20           0.3041           11.91m
        21           0.3039           11.88m
        2

       183           0.2921            6.03m
       184           0.2921            6.02m
       185           0.2921            5.99m
       186           0.2920            5.96m
       187           0.2920            5.94m
       188           0.2919            5.92m
       189           0.2918            5.90m
       190           0.2918            5.88m
       191           0.2918            5.85m
       192           0.2917            5.83m
       193           0.2917            5.81m
       194           0.2916            5.79m
       195           0.2916            5.77m
       196           0.2915            5.74m
       197           0.2915            5.72m
       198           0.2914            5.69m
       199           0.2914            5.68m
       200           0.2913            5.66m
       201           0.2912            5.64m
       202           0.2912            5.62m
       203           0.2911            5.60m
       204           0.2911            5.58m
       205

       366           0.2838            2.45m
       367           0.2837            2.43m
       368           0.2837            2.41m
       369           0.2837            2.39m
       370           0.2836            2.37m
       371           0.2835            2.35m
       372           0.2835            2.33m
       373           0.2835            2.32m
       374           0.2834            2.30m
       375           0.2834            2.28m
       376           0.2833            2.26m
       377           0.2833            2.24m
       378           0.2833            2.23m
       379           0.2832            2.21m
       380           0.2831            2.19m
       381           0.2831            2.17m
       382           0.2830            2.16m
       383           0.2830            2.14m
       384           0.2829            2.12m
       385           0.2829            2.10m
       386           0.2828            2.08m
       387           0.2828            2.07m
       388

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=5, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=50,
              min_samples_split=70, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=1.0, verbose=2, warm_start=False)

In [94]:
pred.shape

(892816, 2)

In [95]:
pred = grad.predict_proba(test)

grad_pred = pred[:,1]

### Random search for best hyperparameters

In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [73]:
#"learning_rate":[0.1,0.2,0.3,0.4],

#param_dist = {"max_depth": [3,5,10],
 #             "max_features":[2,4,6,8],          
#             "min_samples_split":[2,50,100]}

param_dist = {"max_depth": [6,7],
              "max_features":[4,5,6],
              "min_samples_leaf":[1,10,30,50,100],
             "min_samples_split":[40,50,60,70]}

best_params = {
    "max_depth":6,
    "max_features":5,
    "min_samples_leaf":50,
    "min_samples_split":70
}

In [77]:
rand_search = RandomizedSearchCV(grad,param_dist,cv=3,verbose=1,n_iter=3,n_jobs=-1)

In [78]:
rand_search.fit(train,train_target)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
         1           0.3116           25.34s
      Iter       Train Loss   Remaining Time 
         1           0.3116           30.24s
         1           0.3117           28.73s
         1           0.3118           20.95s
         2           0.3105           22.57s
         2           0.3106           25.59s
         2           0.3111           17.38s
         2           0.3106           24.09s
         3           0.3097           19.85s
         3           0.3102           15.76s
         3           0.3098           22.10s
         3           0.3098           20.84s
         4           0.3090           16.93s
         4           0.3096           13.75s
         4           0.3090           18.57s
         4           0.3091           17.60s
         5           0.3082          

[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.3min finished


      Iter       Train Loss   Remaining Time 
         1           0.3120           10.68s
         2           0.3112            9.85s
         3           0.3106            8.53s
         4           0.3098            7.44s
         5           0.3091            6.06s
         6           0.3085            4.92s
         7           0.3080            3.74s
         8           0.3075            2.67s
         9           0.3072            1.36s
        10           0.3068            0.00s


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=20, min_weight_fraction_leaf=0.0,
              n_estimators=10, presort='auto', random_state=None,
              subsample=1.0, verbose=5, warm_start=False),
          fit_params={}, iid=True, n_iter=3, n_jobs=-1,
          param_distributions={'max_depth': [4, 5, 6], 'max_features': [5, 6, 7], 'min_samples_leaf': [1, 10, 50, 100, 1000], 'min_samples_split': [40, 50, 60, 70]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [89]:
best_pred = rand_search.best_estimator_.predict_proba(test)

best_pred = best_pred[:,1]

In [81]:
rand_search.best_score_

0.96355248214081701

In [79]:
rand_search.best_estimator_.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': 5,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 50,
 'min_samples_split': 70,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 5,
 'warm_start': False}

## 7. Create output csv

In [96]:
sub = pd.DataFrame({'id':test_id,'target':grad_pred})

sub.to_csv('Grad Boost 9.csv',index=False)