Team: Sameer's Angels

Members:  
Patrick Jose (20947156)  
Christian Poon (79555434)  
Tri Hoang (15681623)

In [2]:
# Import all required libraries
from __future__ import division # For python 2.*

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

np.random.seed(0)
%matplotlib inline

In [3]:
X = np.genfromtxt("data/X_train.txt",delimiter=None)
Y = np.genfromtxt("data/Y_train.txt",delimiter=None)
X,Y = ml.shuffleData(X,Y)

Xtr,Xva,Ytr,Yva = ml.splitData(X,Y)
Xtr, Ytr = ml.shuffleData(Xtr, Ytr)

Xte = np.genfromtxt('data/X_test.txt',delimiter=None)

#XtrS, params = ml.rescale(Xtr) # Normalize the features
#XvS, _ = ml.rescale(Xva, params) # Normalize the features

# Random Forest

In [4]:
n_bags = 20
bags = []   # self.learners
for l in range(n_bags):
    # Each boosted data is the size of the original data. 
    Xi, Yi = ml.bootstrapData(Xtr, Ytr, X.shape[0] // 2)

    # Train the model on that draw
    tree = ml.dtree.treeClassify(Xi, Yi, minParent=2**6, maxDepth=25, nFeatures=6)
    bags.append(tree)

In [5]:
for l in range(n_bags):
    print(l)
    print("{0:>15}: {1:.4f}".format('Train AUC', bags[l].auc(Xtr, Ytr)))
    print("{0:>15}: {1:.4f}".format('Validation AUC', bags[l].auc(Xva, Yva)))

0
      Train AUC: 0.7489
 Validation AUC: 0.6680
1
      Train AUC: 0.7580
 Validation AUC: 0.6743
2
      Train AUC: 0.7587
 Validation AUC: 0.6777
3
      Train AUC: 0.7525
 Validation AUC: 0.6674
4
      Train AUC: 0.7541
 Validation AUC: 0.6729
5
      Train AUC: 0.7516
 Validation AUC: 0.6703
6
      Train AUC: 0.7528
 Validation AUC: 0.6719
7
      Train AUC: 0.7569
 Validation AUC: 0.6737
8
      Train AUC: 0.7565
 Validation AUC: 0.6749
9
      Train AUC: 0.7510
 Validation AUC: 0.6730
10
      Train AUC: 0.7524
 Validation AUC: 0.6754
11
      Train AUC: 0.7578
 Validation AUC: 0.6780
12
      Train AUC: 0.7590
 Validation AUC: 0.6778
13
      Train AUC: 0.7552
 Validation AUC: 0.6743
14
      Train AUC: 0.7512
 Validation AUC: 0.6719
15
      Train AUC: 0.7506
 Validation AUC: 0.6657
16
      Train AUC: 0.7593
 Validation AUC: 0.6819
17
      Train AUC: 0.7626
 Validation AUC: 0.6804
18
      Train AUC: 0.7591
 Validation AUC: 0.6755
19
      Train AUC: 0.7579
 Validation AU

In [6]:
class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners
    
    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)

In [7]:
bt = BaggedTree(bags)
bt.classes = np.unique(Y)

probs = bt.predictSoft(Xte)

In [8]:
print(probs)
print("{0:>15}: {1:.4f}".format('Train AUC', bt.auc(Xtr, Ytr)))
print("{0:>15}: {1:.4f}".format('Validation AUC', bt.auc(Xva, Yva)))

[[ 0.25345816  0.74654184]
 [ 0.56905768  0.43094232]
 [ 0.80761937  0.19238063]
 ..., 
 [ 0.77019973  0.22980027]
 [ 0.77559865  0.22440135]
 [ 0.77336018  0.22663982]]
      Train AUC: 0.8692
 Validation AUC: 0.7592


# Linear Classifier

Used polynomial feature expansion to transform the dataset from 14 features to 119 features.

In [9]:
XtrSP = ml.transforms.fpoly(XtrS, 2, False)

# Rescale the data matrix so that the features have similar ranges / variance
XtrSP, XtrSP_params = ml.transforms.rescale(XtrSP)
# "params" returns the transformation parameters (shift & scale)

XvSP = ml.transforms.fpoly(XvS, 2, False)

XvSP, XvSP_params = ml.transforms.rescale(XvSP)

XteP = ml.transforms.fpoly(Xte, 2, False)

XteP, XteP_params = ml.transforms.rescale(XteP)

In [20]:
XtrSP.shape

(160000, 119)

In [10]:
linear = ml.linearC.linearClassify()

linear.train(XtrSP, Ytr, reg=0.1, initStep=0.01, stopTol=1e-6, stopIter=100)

In [11]:
probs = linear.predictSoft(XteP)
print(probs)
print("{0:>15}: {1:.4f}".format('Train AUC', linear.auc(XtrSP, Ytr)))
print("{0:>15}: {1:.4f}".format('Validation AUC', linear.auc(XvSP, Yva)))

[[ 0.65037406  0.34962594]
 [ 0.5861603   0.4138397 ]
 [ 0.64398876  0.35601124]
 ..., 
 [ 0.68422899  0.31577101]
 [ 0.64840227  0.35159773]
 [ 0.66142157  0.33857843]]
      Train AUC: 0.6586
 Validation AUC: 0.6541


In [12]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

clf = AdaBoostClassifier(n_estimators=100)

clf.fit(Xtr, Ytr)

ada_x_train = clf.predict(Xtr)
ada_x_val = clf.predict(Xva)

In [34]:
probabilities = clf.staged_predict_proba(Xva)

[[ 0.49013187  0.50986813]
 [ 0.72193886  0.27806114]
 [ 0.72193886  0.27806114]
 ..., 
 [ 0.49013187  0.50986813]
 [ 0.49013187  0.50986813]
 [ 0.72193886  0.27806114]]
[[ 0.48299413  0.51700587]
 [ 0.63932589  0.36067411]
 [ 0.60557248  0.39442752]
 ..., 
 [ 0.48299413  0.51700587]
 [ 0.48299413  0.51700587]
 [ 0.60557248  0.39442752]]
[[ 0.4854282   0.5145718 ]
 [ 0.59114219  0.40885781]
 [ 0.56780193  0.43219807]
 ..., 
 [ 0.4854282   0.5145718 ]
 [ 0.4854282   0.5145718 ]
 [ 0.56780193  0.43219807]]
[[ 0.47782912  0.52217088]
 [ 0.57544402  0.42455598]
 [ 0.53982858  0.46017142]
 ..., 
 [ 0.47782912  0.52217088]
 [ 0.49596345  0.50403655]
 [ 0.55780235  0.44219765]]
[[ 0.48474681  0.51525319]
 [ 0.56297384  0.43702616]
 [ 0.53436666  0.46563334]
 ..., 
 [ 0.48474681  0.51525319]
 [ 0.49926111  0.50073889]
 [ 0.52782732  0.47217268]]
[[ 0.47806717  0.52193283]
 [ 0.55601543  0.44398457]
 [ 0.53213296  0.46786704]
 ..., 
 [ 0.47806717  0.52193283]
 [ 0.49015355  0.50984645]
 [ 0.526

 [ 0.50285764  0.49714236]]
[[ 0.4991404   0.5008596 ]
 [ 0.50540281  0.49459719]
 [ 0.50152786  0.49847214]
 ..., 
 [ 0.49921457  0.50078543]
 [ 0.49882074  0.50117926]
 [ 0.50309859  0.49690141]]
[[ 0.49927243  0.50072757]
 [ 0.50543543  0.49456457]
 [ 0.50162199  0.49837801]
 ..., 
 [ 0.49910997  0.50089003]
 [ 0.49872239  0.50127761]
 [ 0.50293234  0.49706766]]
[[ 0.49923363  0.50076637]
 [ 0.50530035  0.49469965]
 [ 0.50154648  0.49845352]
 ..., 
 [ 0.49907372  0.50092628]
 [ 0.49869219  0.50130781]
 [ 0.50283637  0.49716363]]
[[ 0.49929815  0.50070185]
 [ 0.50527153  0.49472847]
 [ 0.50157542  0.49842458]
 ..., 
 [ 0.49879184  0.50120816]
 [ 0.49841618  0.50158382]
 [ 0.50284546  0.49715454]]
[[ 0.49941691  0.50058309]
 [ 0.50506453  0.49493547]
 [ 0.5014244   0.4985756 ]
 ..., 
 [ 0.49891827  0.50108173]
 [ 0.49854831  0.50145169]
 [ 0.5026752   0.4973248 ]]
[[ 0.49929781  0.50070219]
 [ 0.50508638  0.49491362]
 [ 0.50150058  0.49849942]
 ..., 
 [ 0.49880661  0.50119339]
 [ 0.49

In [13]:
probs = clf.predict_proba(Xte)

print("{0:>15}: {1:.4f}".format('Train AUC', metrics.roc_auc_score(ada_x_train, Ytr)))
print("{0:>15}: {1:.4f}".format('Validation AUC', metrics.roc_auc_score(ada_x_val, Yva)))

      Train AUC: 0.6664
 Validation AUC: 0.6617


In [14]:
class Ensemble(ml.base.classifier):
    def __init__(self, learners):
        self.learners = learners
    
    def predictSoft(self,X):
        BTprobs = self.learners[0].predictSoft(Xte)
        LCprobs = self.learners[1].predictSoft(XteP) 
        ADprobs = self.learners[2].predict_proba(Xte) 
        BTprobs = BTprobs * 0.4
        LCprobs = LCprobs * 0.3
        ADprobs = ADprobs * 0.3
        return BTprobs + LCprobs + ADprobs

(160000,)

In [17]:
ensemble = Ensemble([bt, linear, clf])
ensemble.classes = np.unique(Y)

print("{0:>15}: {1:.4f}".format('Train AUC', ensemble.auc(Xtr, Ytr)))

IndexError: index 161349 is out of bounds for axis 1 with size 160000

# Ensemble

In [None]:
Yte = np.vstack((np.arange(Xte.shape[0]), ensemble.predictSoft(Xte)[:,1])).T
np.savetxt('Y_submit.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

# Linear classifier Y_submit

In [23]:
Yte = np.vstack((np.arange(XteP.shape[0]), linear.predictSoft(XteP)[:,1])).T
np.savetxt('Y_submit.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

# Adaboosted Decision Stump Y_submit

In [41]:
list(clf.staged_predict_proba(Xte))

[array([[ 0.72193886,  0.27806114],
        [ 0.72193886,  0.27806114],
        [ 0.72193886,  0.27806114],
        ..., 
        [ 0.72193886,  0.27806114],
        [ 0.72193886,  0.27806114],
        [ 0.72193886,  0.27806114]]), array([[ 0.60557248,  0.39442752],
        [ 0.60557248,  0.39442752],
        [ 0.63932589,  0.36067411],
        ..., 
        [ 0.63932589,  0.36067411],
        [ 0.63932589,  0.36067411],
        [ 0.63932589,  0.36067411]]), array([[ 0.56780193,  0.43219807],
        [ 0.56780193,  0.43219807],
        [ 0.59114219,  0.40885781],
        ..., 
        [ 0.59114219,  0.40885781],
        [ 0.59114219,  0.40885781],
        [ 0.59114219,  0.40885781]]), array([[ 0.55780235,  0.44219765],
        [ 0.55780235,  0.44219765],
        [ 0.57544402,  0.42455598],
        ..., 
        [ 0.57544402,  0.42455598],
        [ 0.57544402,  0.42455598],
        [ 0.57544402,  0.42455598]]), array([[ 0.52782732,  0.47217268],
        [ 0.54878427,  0.45121573],
    

In [None]:
for i in range(clf.staged_predict_proba(Xte)):
    Yhat = learner[i].predict(X)
    e = wts.dot( Y != Yhat ) # compute weighted error rate
    alpha[i] = 0.5 * np.log( (1-e)/e )
    wts *= np.exp( -alpha[i] * Y * Yhat ) # update weights
    wts /= wts.sum() # and normalize them

In [80]:
predict = np.zeros((Xtr.shape[0], ))

#my_array = [[] for x in range(Xte.shape[0])]
for i, el in enumerate(clf.staged_predict_proba(Xtr)):
    print(el.shape)
    predict += el # compute contribution of each
predict = np.sign(predict) 

(160000, 2)


ValueError: operands could not be broadcast together with shapes (160000,) (160000,2) (160000,) 

In [61]:
my_array

[array([ 0.27806114,  0.27806114,  0.27806114, ...,  0.27806114,
         0.27806114,  0.27806114]),
 array([ 0.39442752,  0.39442752,  0.36067411, ...,  0.36067411,
         0.36067411,  0.36067411]),
 array([ 0.43219807,  0.43219807,  0.40885781, ...,  0.40885781,
         0.40885781,  0.40885781]),
 array([ 0.44219765,  0.44219765,  0.42455598, ...,  0.42455598,
         0.42455598,  0.42455598]),
 array([ 0.47217268,  0.45121573,  0.43702616, ...,  0.43702616,
         0.43702616,  0.43702616]),
 array([ 0.47331876,  0.45583887,  0.44398457, ...,  0.44398457,
         0.44398457,  0.44398457]),
 array([ 0.47869944,  0.46369067,  0.45349709, ...,  0.45349709,
         0.45349709,  0.45349709]),
 array([ 0.47998176,  0.46684238,  0.45791218, ...,  0.45791218,
         0.45791218,  0.45791218]),
 array([ 0.48444231,  0.47275105,  0.46479899, ...,  0.46479899,
         0.46479899,  0.46479899]),
 array([ 0.48505261,  0.47452829,  0.46736764, ...,  0.46736764,
         0.46736764,  0.46

In [63]:
Yte = np.vstack((np.arange(Xte.shape[0]), my_array)).T
np.savetxt('AdaBoost/Y_submitTEST.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

TypeError: Mismatch between array dtype ('object') and format specifier ('%d, %.2f')

In [141]:
print(ensemble.predictSoft(Xte))
ensemble_train_auc = (bt.auc(Xtr, Ytr) + linear.auc(XtrSP, Ytr) + metrics.roc_auc_score(ada_x_train, Ytr)) / 3
ensemble_validation_auc = (bt.auc(Xva, Yva) + linear.auc(XvSP, Yva) + metrics.roc_auc_score(ada_x_val, Yva)) / 3
print(ensemble_train_auc)
print(ensemble_validation_auc)

0.724598604147
0.726019655098
