In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkContext

# Master URL "local[3]": run Spark locally, in-process, with 3 worker threads.
sc = SparkContext(master="local[3]")

In [2]:
# Load the train/test split (features and labels) from the HIGGS pickle file.
# Opened in binary mode and inside a `with` block so the handle is always closed.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("../experiments/higgs/data/higgs-data-1p.pickle", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Distribute the training examples and their labels as RDDs.
X = sc.parallelize(X_train)
# label + label - 1 maps a 0 label to -1 and a 1 label to +1.
signed_labels = list(label + label - 1 for label in y_train)
y = sc.parallelize(signed_labels)

In [4]:
# Number of boosting rounds: passed as T to both training calls, and used by
# test_margin as the upper bound of its max_index sweep.
num_round = 10

In [5]:
# Train with pyboost's run_adtree_adaboost for num_round rounds; quiet=False
# yields the per-iteration log shown below. The result is later passed to
# test_all / test_margin as their `nodes` argument.
adaboost = pb.run_adtree_adaboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Min score: 0.972018075261
Purity (farther from 1.0 is better): (1.3256888724673979, 0.49843948824910711)
Predicts (farther from 0.0 is better): (0.140966114042018, -0.3481365423065515)
Split node: 0 (left)
Split index and value: 25 1.06971287727 

=== Iteration 2 ===
Min score: 0.984263898491
Purity (farther from 1.0 is better): (0.51704332792388952, 1.280339226858884)
Predicts (farther from 0.0 is better): (-0.32981430077804191, 0.12356253190158577)
Split node: 1 (left)
Split index and value: 25 0.597122132778 

=== Iteration 3 ===
Min score: 0.992182479221
Purity (farther from 1.0 is better): (0.54947391953859592, 1.2505038484132525)
Predicts (farther from 0.0 is better): (-0.2993969842164928, 0.11177327441520166)
Split node: 2 (right)
Split index and value: 26 0.825517624617 

=== Iteration 4 ===
Min score: 0.985875407516
Purity (farther from 1.0 is better): (1.2937265066665697, 0.58689332099643554)
Predicts (farther from 0.0 is better): (0.12876340938418007, -0.

In [8]:
# Train with pyboost's run_adtree_logitboost on the same RDDs and round count;
# quiet=False yields the per-iteration log shown below. The result is later
# passed to test_all / test_margin as their `nodes` argument.
logitboost = pb.run_adtree_logitboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.99457455462809874, 0.9992027170680694, 0.99747774785329568, 0.99282811139918703, 0.99833348281764678, 0.99478083120169081, 0.9975311819470537, 0.99915739875018506, 0.9988432535755889, 0.99854960103612211, 0.99686044608937319, 0.99852474065850283, 0.99812489063190235, 0.99835175348835514, 0.99816670201893964, 0.9978074465036797, 0.99795604769916924, 0.99815642182942999, 0.99845926936942275, 0.99595550120273113, 0.99863804882348628, 0.99753338562355875, 0.99505392510955126, 0.9978816138310791, 0.99633779687694057, 0.97461538211441812, 0.99285170333338257, 0.98194760143874382]
Min score: 0.974615382114
Purity (farther from 1.0 is better): (1.4032316420944626, 0.52411631746729659)
Predicts (farther from 0.0 is better): (0.16938894616741476, -0.32302081969778318)
Split node: 0 (left)
Split index and value: 25 1.0768276453 

=== Iteration 2 ===
Score (sorted by index): [0.99346193239160474, 0.99489841397974343, 0.99329898018130547, 0.9930084521

# Performance

In [6]:
from pyboost.adtree import run_tree
from pyboost.utils import safe_comp

def test_error(nodes, y, X, max_index=None):    
    error = 0
    margin = 0.0
    for tX, ty in zip(X, y):
        ty = ty + ty - 1
        _m = run_tree(0, nodes, tX, max_index=max_index)
        predict = safe_comp(_m)
        if safe_comp(predict, ty):
            error += 1
        margin += _m * ty
    return 1.0 - float(error) / len(y), margin / len(y)


def test_all(nodes):
    train_accuracy, train_margin = test_error(nodes, y_train, X_train)
    test_accuracy, test_margin = test_error(nodes, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(nodes):
    for max_index in range(num_round + 1):
        accuracy, margin = test_error(nodes, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [7]:
# Report final train/test accuracy and margin for the AdaBoost model, then
# training accuracy/margin for each max_index from 0 to num_round.
test_all(adaboost)
test_margin(adaboost)

Train accuracy: 0.66237 	Train margin: 0.18171254852
Test accuracy: 0.6554 	Test margin: 0.17577454326
0 0.52977 0.00354920956848
1 0.61094 0.0516427946144
2 0.63479 0.0798423159376
3 0.63479 0.0968911992297
4 0.63678 0.111258865365
5 0.63816 0.124004737682
6 0.65063 0.144300736904
7 0.65887 0.156041353027
8 0.65741 0.167961583287
9 0.65784 0.175234811317
10 0.66237 0.18171254852


## Performance of LogitBoost

In [12]:
# Report final train/test accuracy and margin for the LogitBoost model, then
# training accuracy/margin for each max_index from 0 to num_round.
test_all(logitboost)
test_margin(logitboost)

Train accuracy: 0.64391 	Train margin: 0.277308308117
Test accuracy: 0.6407 	Test margin: 0.266371187466
0 0.52977 0.00354920956848
1 0.61089 0.0535789076303
2 0.61089 0.0950319156935
3 0.61089 0.131287217591
4 0.61089 0.170891795839
5 0.61089 0.19305184883
6 0.62958 0.199758238275
7 0.63312 0.222920250926
8 0.63885 0.250397959361
9 0.63885 0.268250095384
10 0.64391 0.277308308117
