In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkContext

# Spark context used by the cells below to build the training RDDs.
# The "local[2]" master URL runs Spark in-process with two worker threads.
sc = SparkContext(master="local[2]")

In [2]:
# Load the pre-split data (train/test features and labels) from the pickle.
# The file is opened in binary mode and closed as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
with open("../experiments/higgs/data/higgs-data-1p.pickle", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Distribute the training split as two RDDs: one of examples, one of labels.
# Each label is remapped with `label + label - 1` before being shipped
# (presumably 0/1 labels become -1/+1 -- confirm against the data file).
X = sc.parallelize(X_train)
y = sc.parallelize(list(label + label - 1 for label in y_train))

In [4]:
# Number of boosting rounds: passed as T to both training runs and used as
# the upper bound of the max_index sweep in test_margin.
num_round = 10

In [5]:
# Train with AdaBoost for num_round rounds on the label/feature RDDs.
# quiet=False is passed; the captured output below shows per-iteration diagnostics.
adaboost = pb.run_adtree_adaboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [3565.851769174149, 3576.7447313493267, 3589.5854898250582, 3489.7813351581408, 3382.8264530034321, 3425.5181503697604, 3572.9517209166233, 3572.8811903002133, 3577.4576283893639, 3571.1574461935097, 3578.6230792035631, 3575.1811142935221, 3575.1065082653749, 3572.2581093757908, 3575.4454422819381, 3583.7935208380895, 3577.3008231710537, 3584.2932570775556, 3608.6814818379862, 3630.4462535618331, 3442.5697701492545, 3578.171600133378, 3541.8853577211294, 3572.8058200786691, 3559.5045722074692, 3467.84662464334, 3549.8770110700843, 3536.3637270314985]
Purity (farther from 1.0 is better): (1.0000180579568339, 0.99971425139671377)
Predicts (farther from 0.0 is better): (9.0288968954552238e-06, -0.00014289471859868809)
Split node: 0 (left)
Split index and value: 4 1.53461372852 

=== Iteration 2 ===
Score (sorted by index): [3562.8375415415544, 3578.7660652279824, 3579.6375297389277, 3562.4452522175898, 3586.3838655957411, 3498.0741692678939, 33

In [6]:
# Train with LogitBoost for num_round rounds on the same label/feature RDDs.
# quiet=False is passed; the captured output below shows per-iteration diagnostics.
logitboost = pb.run_adtree_logitboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [1782.7246526737249, 1786.4459353459956, 1785.4820259900782, 1776.3027762531883, 1784.8324370759465, 1786.834881933044, 1788.7358119793853, 1789.8430891937285, 1788.6661776328697, 1794.5871873816309, 1752.3752869427485, 1692.15637570839, 1717.5718474418311, 1788.3024527406751, 1785.236495942265, 1786.2622044976395, 1785.1055507596457, 1785.9359568906164, 1788.4701980690886, 1788.3573718129098, 1786.8061170006067, 1786.3627274983933, 1784.3948592820457, 1787.042860259093, 1788.6901582403398, 1768.1900922537191, 1801.8637344711087, 1701.3739320502882]
Purity (farther from 1.0 is better): (1.0590471281534208, 1.0663387949660452)
Predicts (farther from 0.0 is better): (0.028684784067326625, 0.032115547083704878)
Split node: 0 (left)
Split index and value: 11 0.598560869694 

=== Iteration 2 ===
Score (sorted by index): [889.88620116719449, 895.51896963363174, 893.32523302765458, 891.56899768236406, 900.4648145934766, 904.82697571447375, 859.7638

# Performance

In [7]:
from pyboost.utils import safe_comp

def test_error(root, y, X, max_index=None):    
    error = 0
    margin = 0.0
    for tX, ty in zip(X, y):
        ty = ty + ty - 1
        _m = root.run(tX, max_index=max_index)
        predict = safe_comp(_m)
        if safe_comp(predict, ty):
            error += 1
        margin += _m * ty
    return 1.0 - float(error) / len(y), margin / len(y)


def test_all(root):
    train_accuracy, train_margin = test_error(root, y_train, X_train)
    test_accuracy, test_margin = test_error(root, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(root):
    for max_index in range(num_round):
        accuracy, margin = test_error(root, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [8]:
# Take the first element of the AdaBoost result as the model root and
# report its train/test accuracy and margin.
root = adaboost[0]
test_all(root)

Train accuracy: 0.61165 	Train margin: 0.092772066083
Test accuracy: 0.608 	Test margin: 0.0892120812628


In [8]:
# NOTE(review): this cell repeats the previous one verbatim (same In [8]
# counter) but its saved output differs, so one of the two outputs is
# presumably left over from a different run -- confirm and drop the stale copy.
root = adaboost[0]
test_all(root)

Train accuracy: 0.58736 	Train margin: 0.0375677263994
Test accuracy: 0.5905 	Test margin: 0.037376399982


In [9]:
# Training accuracy and mean margin of the AdaBoost root for each
# max_index in range(num_round).
test_margin(root)

0 0.52977 0.00354920956848
1 0.56824 0.0187618353665
2 0.59188 0.0562917062673
3 0.59188 0.0562904209758
4 0.60463 0.0614735045212
5 0.60395 0.062107909046
6 0.60377 0.062658409172
7 0.60377 0.062658113404
8 0.61154 0.0753407894486
9 0.61165 0.0880812215383


## Performance of LogitBoost

In [10]:
# Take the first element of the LogitBoost result as the model root and
# report its train/test accuracy and margin.
root = logitboost[0]
test_all(root)

Train accuracy: 0.56817 	Train margin: 0.0243883806137
Test accuracy: 0.5718 	Test margin: 0.0255453557149


In [9]:
# NOTE(review): this cell repeats the previous one verbatim, carries an
# out-of-order execution counter (In [9] after In [10]) and a different saved
# output -- presumably left over from another run; confirm and drop the stale copy.
root = logitboost[0]
test_all(root)

Train accuracy: 0.62048 	Train margin: 0.0818729495842
Test accuracy: 0.6215 	Test margin: 0.0813688262755


In [11]:
# Training accuracy and mean margin of the LogitBoost root for each
# max_index in range(num_round).
test_margin(root)

0 0.52977 0.00354920956848
1 0.52977 0.00594987799229
2 0.52977 0.0060245020481
3 0.52977 0.00619948519181
4 0.56824 0.0220209260741
5 0.56824 0.0222631843959
6 0.56824 0.0221350991257
7 0.56824 0.0220740165613
8 0.56817 0.0245168084998
9 0.56817 0.0244557794548
