In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkConf, SparkContext

# Reuse the already-running context when this cell is executed again; constructing
# a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[2]"))

In [2]:
# Load the pre-split data set (train/test features and labels) from disk.
# The `with` block closes the file handle as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../experiments/higgs/data/higgs-data-1p.pickle") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Hand the training features to Spark unchanged.
X = sc.parallelize(X_train)
# Remap every training label with `label + label - 1` (0 -> -1, 1 -> +1) before
# distributing it.  Presumably the raw labels are 0/1 -- confirm against the data.
y = sc.parallelize(list(label + label - 1 for label in y_train))

In [6]:
# Number of boosting rounds: passed as T to both training calls and used as the
# upper bound of the max_index sweep in test_margin.
# NOTE(review): the saved test_margin outputs list indices 0-9, which suggests
# they were produced with a larger value than 2 -- confirm before trusting them.
num_round = 2

In [7]:
# Train with the AdaBoost runner on the Spark RDDs (labels `y` are passed before
# features `X`), using T=num_round.  With quiet=False the call prints the
# per-iteration log shown below.
adaboost = pb.run_adtree_adaboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.033813494310875568, 0.034383807182310946, 0.03584334822223581, 0.035682624369004032, 0.035832752187687333, 0.03569841404789309, 0.035834842328205191, 0.03581408321754289, 0.035877584949230261, 0.035796852232092335, 0.03581502473318067, 0.035899307689725726, 0.035832401118309265, 0.035906605673839083, 0.036148764630287017, 0.036379670737751876, 0.034498888378732009, 0.03577956473330858, 0.035778137350230485, 0.035770657203779935, 0.035763558792010042, 0.035760654984204325, 0.035723862452132299, 0.035892326092748034, 0.035581206558438494, 0.035010527311792217, 0.035755297158179156, 0.034542731887765903]
Purity (farther from 1.0 is better): (1.0107805254144979, 0.16448228407180704)
Predicts (farther from 0.0 is better): (0.0053614149195382573, -0.90247620509420723)
Split node: 0 (left)
Split index and value: 0 3.08589935303 

=== Iteration 2 ===
Score (sorted by index): [0.035863830418964751, 0.035828475791179988, 0.035884249588414414, 0.035

In [6]:
# Train with the LogitBoost runner on the same RDDs and the same T=num_round, so
# the two results can be compared.  With quiet=False the call prints the
# per-iteration log shown below.
logitboost = pb.run_adtree_logitboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.035681736065408046, 0.035771104825925944, 0.035713051437891212, 0.035588603908622327, 0.035837921324759464, 0.035808844689506424, 0.035849672396584181, 0.035798415830882564, 0.035975289660446046, 0.035143430398527895, 0.033861731031651555, 0.034419536817600245, 0.035804143483434608, 0.035701358828897321, 0.035798136836782027, 0.03575394002603946, 0.035811593638018328, 0.035777935819035697, 0.035790427709872202, 0.035773850306521117, 0.035833000719704425, 0.035871485841252167, 0.035507942676397723, 0.035916553434073455, 0.036025265956819208, 0.035538069132805869, 0.034308164707695481, 0.035375190275914382]
Purity (farther from 1.0 is better): (0.95490126954189525, 1.0769314697610783)
Predicts (farther from 0.0 is better): (-0.023073663262203996, 0.037057882736463314)
Split node: 0 (left)
Split index and value: 10 -1.15781712532 

=== Iteration 2 ===
Score (sorted by index): [0.035565519087329384, 0.035769210326777393, 0.035807893793526155,

# Performance

In [7]:
from pyboost.utils import safe_comp

def test_error(root, y, X, max_index=None):    
    error = 0
    margin = 0.0
    for tX, ty in zip(X, y):
        ty = ty + ty - 1
        _m = root.run(tX, max_index=max_index)
        predict = safe_comp(_m)
        if safe_comp(predict, ty):
            error += 1
        margin += _m * ty
    return 1.0 - float(error) / len(y), margin / len(y)


def test_all(root):
    train_accuracy, train_margin = test_error(root, y_train, X_train)
    test_accuracy, test_margin = test_error(root, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(root):
    for max_index in range(num_round):
        accuracy, margin = test_error(root, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [8]:
# The first element of the AdaBoost result is the object whose .run(...) method
# the evaluation helpers call.  `root` is rebound later for LogitBoost, so run
# the cells in order.
root = adaboost[0]
test_all(root)

Train accuracy: 0.6077 	Train margin: 0.055289514773
Test accuracy: 0.6053 	Test margin: 0.0532228308711


In [9]:
# Index the AdaBoost result directly instead of reading the shared `root` name,
# which a later cell rebinds to the LogitBoost model; this keeps the sweep
# correct even when cells are re-run out of order.
test_margin(adaboost[0])

0 0.52977 0.00354920956848
1 0.52977 0.00547927962474
2 0.54087 0.00716078329135
3 0.54073 0.00716646109766
4 0.54089 0.00722870184687
5 0.54099 0.00725743407153
6 0.60802 0.0521567216513
7 0.60802 0.0521832190692
8 0.60805 0.0546423678676
9 0.60771 0.0552871127252


## Performance of LogitBoost

In [10]:
# The first element of the LogitBoost result is the object whose .run(...) method
# the evaluation helpers call.  This rebinds `root`, which previously referred to
# the AdaBoost model.
root = logitboost[0]
test_all(root)

Train accuracy: 0.6073 	Train margin: 0.0841431034995
Test accuracy: 0.6032 	Test margin: 0.0792539368424


In [11]:
# Index the LogitBoost result directly instead of reading the shared `root` name,
# which an earlier cell binds to the AdaBoost model; this keeps the sweep
# correct even when cells are re-run out of order.
test_margin(logitboost[0])

0 0.52977 0.00354920956848
1 0.52977 0.00570692935435
2 0.52977 0.0155873651293
3 0.52977 0.0205804643478
4 0.52977 0.0226580324018
5 0.52977 0.0234525919514
6 0.52977 0.0239918682151
7 0.52977 0.0242313503385
8 0.52977 0.0243536160235
9 0.6073 0.0723778530759
