In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkConf, SparkContext

# Reuse the running SparkContext when this cell is executed again; calling the
# SparkContext constructor a second time in the same kernel raises because only
# one context may be active. The master is still a local 2-thread one.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[2]"))

In [2]:
# NOTE(review): pickle can execute arbitrary code while loading -- only point
# this at a data file you trust.
# Open in binary mode and close the handle deterministically instead of leaking
# the file object returned by a bare open().
with open("../experiments/higgs/data/higgs-data-1p.pickle", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Hand the training features to Spark.
X = sc.parallelize(X_train)
# Re-code every training label as label + label - 1 (maps 0 -> -1 and 1 -> +1)
# before handing the labels to Spark.
y = sc.parallelize(list(label + label - 1 for label in y_train))

In [4]:
# Passed as T to both pyboost training runs and used as the upper bound of the
# max_index sweep in test_margin.
num_round = 10

In [5]:
# Train with pyboost's run_adtree_adaboost on the label/feature RDDs.
# T is presumably the number of boosting iterations -- confirm in pyboost.
# With quiet=False the run printed the per-iteration log shown below.
adaboost = pb.run_adtree_adaboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.034291472710074669, 0.035838493333371371, 0.035786932145015354, 0.035638990711589727, 0.035769502246153051, 0.035681198830662106, 0.035839238856249601, 0.035846395320328343, 0.035806773638592762, 0.035805981199485033, 0.035910090001833275, 0.035793151447080944, 0.035934294701582964, 0.036113736531559901, 0.036354519565657342, 0.034490106267436357, 0.035876817907725886, 0.035791971276231072, 0.035814011607367621, 0.035751005250258164, 0.035807521456132259, 0.035861022991627356, 0.035683884857959133, 0.035832426980113574, 0.035774520204820734, 0.035011183948477492, 0.034859529885475311, 0.033363577017772213]
Purity (farther from 1.0 is better): (1.2768373971341336, 0.64229072760219508)
Predicts (farther from 0.0 is better): (0.12219311850992583, -0.22135711553196086)
Split node: 0 (left)
Split index and value: 27 0.964190125465 

=== Iteration 2 ===
Score (sorted by index): [0.035504559615011623, 0.035696455556026574, 0.035860874163414935, 

In [6]:
# Train with pyboost's run_adtree_logitboost on the same label/feature RDDs.
# T is presumably the number of boosting iterations -- confirm in pyboost.
# With quiet=False the run printed the per-iteration log shown below.
logitboost = pb.run_adtree_logitboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.035661735698383809, 0.035888266718936204, 0.03580018268964285, 0.035730652538289123, 0.036110853003618187, 0.036262049634507902, 0.034459292936591328, 0.035848987638750753, 0.035792397771473597, 0.035782471192723826, 0.035697541610875981, 0.035780388120869901, 0.035801979453222103, 0.035838296225483929, 0.035822453390807293, 0.035814994201609066, 0.035922945584170897, 0.035136340090956945, 0.033885012369925638, 0.03437853369300297, 0.035816045577176693, 0.03574426913143925, 0.0356690862838751, 0.03575317273408378, 0.035720461658614602, 0.034952729734464782, 0.035606740761393772, 0.035188884617031381]
Purity (farther from 1.0 is better): (0.93811593847940422, 1.0665496923214026)
Predicts (farther from 0.0 is better): (-0.031940867911787338, 0.03221442583770525)
Split node: 0 (left)
Split index and value: 18 -1.84599709511 

=== Iteration 2 ===
Score (sorted by index): [0.035551697770151817, 0.035817491552722291, 0.035788238311571907, 0.035

# Performance

In [7]:
from pyboost.utils import safe_comp

def test_error(root, y, X, max_index=None):    
    error = 0
    margin = 0.0
    for tX, ty in zip(X, y):
        ty = ty + ty - 1
        _m = root.run(tX, max_index=max_index)
        predict = safe_comp(_m)
        if safe_comp(predict, ty):
            error += 1
        margin += _m * ty
    return 1.0 - float(error) / len(y), margin / len(y)


def test_all(root):
    train_accuracy, train_margin = test_error(root, y_train, X_train)
    test_accuracy, test_margin = test_error(root, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(root):
    for max_index in range(num_round):
        accuracy, margin = test_error(root, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [8]:
# Use the first element of the AdaBoost result as the model root and print its
# train/test accuracy and mean margin.
root = adaboost[0]
test_all(root)

Train accuracy: 0.63498 	Train margin: 0.105914429769
Test accuracy: 0.6297 	Test margin: 0.104557587972


In [10]:
# Per-max_index training accuracy/margin for whatever `root` currently holds
# (the AdaBoost root when the cells are run in order).
test_margin(root)

0 0.52977 0.00354920956848
1 0.58647 0.0303038861147
2 0.58647 0.0305738240523
3 0.58647 0.0338858581377
4 0.58647 0.0385544808866
5 0.60236 0.0774346478012
6 0.60236 0.0738613681354
7 0.62873 0.102816263005
8 0.62888 0.101764521871
9 0.63171 0.104907639513


## Performance of LogitBoost

In [11]:
# Rebind `root` to the first element of the LogitBoost result and print its
# train/test accuracy and mean margin.
root = logitboost[0]
test_all(root)

Train accuracy: 0.61198 	Train margin: 0.131058759741
Test accuracy: 0.6098 	Test margin: 0.125913335665


In [12]:
# Per-max_index training accuracy/margin for whatever `root` currently holds
# (the LogitBoost root when the cells are run in order).
test_margin(root)

0 0.52977 0.00354920956848
1 0.52977 0.00547238890636
2 0.60797 0.0537612775667
3 0.60797 0.0702106558294
4 0.60797 0.0776015215904
5 0.60797 0.0874733296176
6 0.60797 0.116288067137
7 0.60797 0.130081056569
8 0.61186 0.131151331678
9 0.61198 0.131156323456
