# predictive modeling

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
train1 = train.fillna(train.mean())
test1 = test.fillna(test.min())

In [7]:
x_train = train1.iloc[:,:-2]
y_train = train1['loanstatus']
x_test = test1.iloc[:,:-2]
y_test = test1['loanstatus']

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV



## logistic regression

In [103]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
score = logisticRegr.score(x_test, y_test)
print(score)

0.7974285923


In [104]:
y_pred=logisticRegr.predict(x_test)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_pred, y_test)

0.3987725579016585

## Random Forest

In [95]:
# Initialize classifier object
clf = RandomForestClassifier()
# Specify hyperparameter space to optimize
param_dist = {"max_depth": [3, None],
              "n_estimators": [100, 300, 600],
              "max_features": [1, 3, 5],
              "min_samples_split": [2, 4, 7],
              "min_samples_leaf": [1, 3, 5],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# Run randomized gridsearch
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)


In [96]:
best_model = random_search.fit(x_train, y_train)

In [114]:
print best_model.score(x_test,y_test)
#y_pred=best_model.predict(x_test)
#from sklearn.metrics import roc_auc_score
#print roc_auc_score(y_pred, y_test)

0.797574695011


## xgboost

In [115]:
import xgboost as xgb
clf2 = xgb.XGBClassifier()

param_grid = {
        'silent': [False],
        'max_depth': [6, 10, 15],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'gamma': [0, 0.25, 0.5, 1.0],
        'n_estimators': [100]}

rs_clf2 = RandomizedSearchCV(clf2, param_grid, n_iter=20,
                            n_jobs=1, verbose=2, cv=2,
                            refit=True, random_state=42)
rs_clf2.fit(x_train, y_train)

best_score = rs_clf2.best_score_
best_params = rs_clf2.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] silent=False, learning_rate=0.2, n_estimators=100, subsample=0.9, max_depth=10, gamma=0.5 
[12:12:29] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 840 extra nodes, 6 pruned nodes, max_depth=10
[12:12:31] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 798 extra nodes, 8 pruned nodes, max_depth=10
[12:12:32] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 808 extra nodes, 22 pruned nodes, max_depth=10
[12:12:34] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 792 extra nodes, 14 pruned nodes, max_depth=10
[12:12:35] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 922 extra nodes, 28 pruned nodes, max_depth=10
[12:12:37] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 874 extra nodes, 36 pruned nodes, max_depth=10
[12:12:39] C:\dev\libs\xgboost\src\

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[12:13:49] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 862 extra nodes, 12 pruned nodes, max_depth=10
[12:13:49] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 890 extra nodes, 14 pruned nodes, max_depth=10
[12:13:50] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 854 extra nodes, 18 pruned nodes, max_depth=10
[12:13:51] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 882 extra nodes, 24 pruned nodes, max_depth=10
[12:13:52] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 888 extra nodes, 30 pruned nodes, max_depth=10
[12:13:52] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 932 extra nodes, 18 pruned nodes, max_depth=10
[12:13:54] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 918 extra nodes, 36 pruned nodes, max_depth=10
[12:13:57] C:\dev\libs\xgboost\src\tree\updater_prune.c

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 125.2min finished


[14:17:45] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[14:17:46] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 0 pruned nodes, max_depth=6
[14:17:48] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[14:17:52] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[14:17:54] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 122 extra nodes, 0 pruned nodes, max_depth=6
[14:17:58] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=6
[14:18:00] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 118 extra nodes, 2 pruned nodes, max_depth=6
[14:18:03] C:\dev\libs\xgboost\src\tree\updater_prune.cc:74: tree pru

In [119]:
rs_clf2.score(x_test, y_test)

0.79757469501059242