In [20]:
# Import all required libraries
from __future__ import division # For python 2.*

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

np.random.seed(0)
%matplotlib inline

X = np.genfromtxt('data/X_train.txt', delimiter=None) 
Y = np.genfromtxt('data/Y_train.txt', delimiter=None) 
X,Y = ml.shuffleData(X,Y)
Xtr, Xva, Ytr, Yva = ml.splitData(X,Y,0.75)
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

In [31]:
# Bagging: fit one decision tree per bootstrap draw of the training split.
n_bags = 100
tree_params = dict(minParent=2**6, maxDepth=25, nFeatures=6)

bags = []   # becomes the `learners` list handed to the ensemble
for _ in range(n_bags):
    # Draw a bootstrap sample from the training split
    # (presumably sampled with replacement at the original size -- TODO confirm).
    Xi, Yi = ml.bootstrapData(Xtr, Ytr)

    # Fit a tree on this draw only and keep it for the ensemble.
    bags.append(ml.dtree.treeClassify(Xi, Yi, **tree_params))

In [32]:
# Report how each individual tree scores on its own, before any averaging.
auc_row = "{0:>15}: {1:.4f}"
for idx, member in enumerate(bags):
    print(idx)
    print(auc_row.format('Train AUC', member.auc(Xtr, Ytr)))
    print(auc_row.format('Validation AUC', member.auc(Xva, Yva)))

0
      Train AUC: 0.7853
 Validation AUC: 0.6734
1
      Train AUC: 0.7835
 Validation AUC: 0.6684
2
      Train AUC: 0.7885
 Validation AUC: 0.6755
3
      Train AUC: 0.7869
 Validation AUC: 0.6758
4
      Train AUC: 0.7832
 Validation AUC: 0.6695
5
      Train AUC: 0.7889
 Validation AUC: 0.6742
6
      Train AUC: 0.7825
 Validation AUC: 0.6675
7
      Train AUC: 0.7840
 Validation AUC: 0.6744
8
      Train AUC: 0.7828
 Validation AUC: 0.6706
9
      Train AUC: 0.7837
 Validation AUC: 0.6718
10
      Train AUC: 0.7919
 Validation AUC: 0.6790
11
      Train AUC: 0.7826
 Validation AUC: 0.6596
12
      Train AUC: 0.7819
 Validation AUC: 0.6702
13
      Train AUC: 0.7783
 Validation AUC: 0.6645
14
      Train AUC: 0.7850
 Validation AUC: 0.6678
15
      Train AUC: 0.7846
 Validation AUC: 0.6705
16
      Train AUC: 0.7775
 Validation AUC: 0.6570
17
      Train AUC: 0.7864
 Validation AUC: 0.6744
18
      Train AUC: 0.7877
 Validation AUC: 0.6773
19
      Train AUC: 0.7882
 Validation AU

In [33]:
class BaggedTree(ml.base.classifier):
    """Ensemble that averages the soft predictions of a list of learners."""

    def __init__(self, learners):
        """Keep a reference to the learners that make up the ensemble.

        Only `learners` is stored here; the base-class constructor is not
        called, and this notebook assigns `classes` on the instance after
        construction.
        """
        self.learners = learners

    def predictSoft(self, X):
        """Return the mean, taken across learners, of each learner's predictSoft(X)."""
        # One soft-prediction array per learner, averaged along the learner axis.
        per_learner = [member.predictSoft(X) for member in self.learners]
        return np.mean(per_learner, axis=0)

In [34]:
# Wrap the trained trees in the averaging ensemble; `classes` is not set by
# BaggedTree's constructor, so assign it here from the labels.
bt = BaggedTree(bags)
bt.classes = np.unique(Y)

# Averaged soft predictions on the test features.
probs = bt.predictSoft(Xte)
print(probs)

# AUC of the ensemble on the training and validation splits.
for label, feats, target in (('Train AUC', Xtr, Ytr), ('Validation AUC', Xva, Yva)):
    print("{0:>15}: {1:.4f}".format(label, bt.auc(feats, target)))

[[ 0.27020089  0.72979911]
 [ 0.54913187  0.45086813]
 [ 0.73793351  0.26206649]
 ..., 
 [ 0.77125683  0.22874317]
 [ 0.87310097  0.12689903]
 [ 0.83103246  0.16896754]]
      Train AUC: 0.9129
 Validation AUC: 0.7615


In [35]:
# Refit the bagged trees, this time drawing from all labelled data (X, Y)
# rather than only the training split.
# NOTE: `bags` is rebound to a brand-new list here, so an ensemble that was
# built from the previous list keeps its old trees; wrap this new list in a
# BaggedTree before using it for predictions.
n_bags = 100
tree_params = dict(minParent=2**6, maxDepth=25, nFeatures=6)

bags = []   # becomes the `learners` list handed to the ensemble
for _ in range(n_bags):
    # Draw a bootstrap sample from the full labelled set
    # (presumably sampled with replacement at the original size -- TODO confirm).
    Xi, Yi = ml.bootstrapData(X, Y)

    # Fit a tree on this draw only and keep it for the ensemble.
    bags.append(ml.dtree.treeClassify(Xi, Yi, **tree_params))

In [36]:
# Reload the test features so this cell does not depend on an earlier Xte.
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

# Build the submission ensemble from the *current* `bags` list. The earlier
# `bt` object still references the list of trees it was constructed with
# (those fitted on the training split), because retraining rebinds `bags`
# to a new list instead of updating `bt`. Predicting with `bt` would
# therefore ignore the trees retrained on all of the labelled data.
bt_full = BaggedTree(bags)
bt_full.classes = np.unique(Y)

# Two columns per test row: the row index, then column 1 of the averaged soft
# predictions (presumably the class-1 probability -- confirm class ordering).
Yte = np.vstack((np.arange(Xte.shape[0]), bt_full.predictSoft(Xte)[:,1])).T
# Output a file with two columns, a row ID and a confidence in class 1:
# (fmt is a single multi-format string, so savetxt ignores `delimiter`.)
np.savetxt('Y_submit.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')