In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkContext

# Create the Spark context shared by the cells below. "local[2]" is the master
# URL handed to Spark (presumably local mode with two worker threads — confirm
# against the Spark documentation).
sc = SparkContext(master="local[2]")

In [2]:
# Load the pre-split data: the pickle holds a 4-tuple
# (X_train, X_test, y_train, y_test).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
# The file is opened in binary mode (pickle data is bytes) and inside a `with`
# block so the handle is closed as soon as loading finishes, instead of being
# left open for the lifetime of the kernel.
with open("../experiments/higgs/data/higgs-data-1p.pickle", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Distribute the training features and labels through the Spark context.
X = sc.parallelize(X_train)
# Each label is mapped to label + label - 1 (i.e. 2*label - 1), so a 0/1 label
# becomes -1/+1 (assumes y_train holds 0/1 labels — TODO confirm).
# The comprehension variable is deliberately NOT called `y`: in Python 2 a list
# comprehension leaks its loop variable into the enclosing namespace, which
# would briefly bind `y` to a single label before it is rebound to the RDD.
y = sc.parallelize([label + label - 1 for label in y_train])

In [4]:
# Number of rounds: passed as T to both boosting runs below and used as the
# loop bound in test_margin.
num_round = 10

In [5]:
# Run pyboost's ADTree AdaBoost routine on the label/feature RDDs for
# num_round rounds. quiet=False is passed; the per-iteration log it produced is
# shown in the cell output.
adaboost = pb.run_adtree_adaboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.035073381051992453, 0.033886876651154696, 0.034431093094229717, 0.035577364378594115, 0.035795864140282019, 0.035747422211385542, 0.035778854972746701, 0.035822595572504057, 0.035867295154751719, 0.035834878774180803, 0.035766499244184527, 0.035830004871849508, 0.035901379173081296, 0.035810015119475597, 0.035949466005034625, 0.036133391064628512, 0.036397432535441046, 0.034487835128538084, 0.035853561415687921, 0.035770546402386458, 0.035811892586235052, 0.035732727009976067, 0.035591090295171275, 0.035837827226705352, 0.035806581624570386, 0.035029968723788454, 0.035568810643906379, 0.035629598045425107]
Purity (farther from 1.0 is better): (0.69681658713118133, 1.0006914618718552)
Predicts (farther from 0.0 is better): (-0.1806165245019461, 0.00034561146111922175)
Split node: 0 (left)
Split index and value: 1 -2.30933403969 

=== Iteration 2 ===
Score (sorted by index): [0.03561954365445389, 0.035752066796703834, 0.035787313406377398, 

In [6]:
# Run pyboost's ADTree LogitBoost routine on the same label/feature RDDs for
# num_round rounds. quiet=False is passed; the per-iteration log it produced is
# shown in the cell output.
logitboost = pb.run_adtree_logitboost(y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.035717859374825363, 0.035792417299955942, 0.035773104851018284, 0.035589193583073879, 0.035760089983325853, 0.035800210056666504, 0.035838296225483929, 0.035860481127550352, 0.035836901063425318, 0.035955531718611322, 0.035109793302665365, 0.033903274617938517, 0.034412487436619565, 0.035829613637107842, 0.03576818552278254, 0.035788736152340316, 0.035765561964729735, 0.035782199603843323, 0.035832974506124875, 0.035830713970631645, 0.035799633735578498, 0.035790750185410163, 0.035751322873898879, 0.035804377015704124, 0.035837381527957034, 0.035426651540582429, 0.036101321302710467, 0.034087953379577839]
Purity (farther from 1.0 is better): (1.0590471281532357, 1.0663387949663856)
Predicts (farther from 0.0 is better): (0.028684784067239195, 0.032115547083864486)
Split node: 0 (left)
Split index and value: 11 0.598560869694 

=== Iteration 2 ===
Score (sorted by index): [0.035670732103512612, 0.035785165758545137, 0.035801919770476531, 0

# Performance

In [7]:
from pyboost.utils import safe_comp

def test_error(root, y, X, max_index=None):
    """Score ``root`` on the samples in ``X`` against the labels in ``y``.

    Despite the name, the first returned value is the accuracy
    (1 - error rate), not the error rate.

    Args:
        root: object exposing ``run(sample, max_index=...)`` that returns a
            numeric score for one sample.
        y: sequence of labels; each label is mapped to ``label + label - 1``
            before use (so 0/1 becomes -1/+1 — assumes 0/1 labels, confirm).
        X: sequence of samples, paired with ``y`` by position.
        max_index: forwarded unchanged to ``root.run``.

    Returns:
        A tuple ``(accuracy, mean_margin)`` where ``accuracy`` is
        ``1.0 - mistakes / len(y)`` and ``mean_margin`` is the sum of
        ``score * signed_label`` divided by ``len(y)``.
    """
    mistakes = 0
    margin_total = 0.0
    for sample, label in zip(X, y):
        signed_label = label + label - 1
        score = root.run(sample, max_index=max_index)
        # safe_comp comes from pyboost.utils. With one argument it is presumably
        # a sign-like comparison against zero, and with two arguments a truthy
        # result presumably means "different" — confirm in pyboost.utils.
        if safe_comp(safe_comp(score), signed_label):
            mistakes += 1
        margin_total += score * signed_label
    accuracy = 1.0 - float(mistakes) / len(y)
    return accuracy, margin_total / len(y)


def test_all(root):
    train_accuracy, train_margin = test_error(root, y_train, X_train)
    test_accuracy, test_margin = test_error(root, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(root):
    for max_index in range(num_round):
        accuracy, margin = test_error(root, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [8]:
# Take the first element of the AdaBoost result as the model root and report
# its train/test accuracy and margin.
root = adaboost[0]
test_all(root)

Train accuracy: 0.6246 	Train margin: 0.0789087824895
Test accuracy: 0.6265 	Test margin: 0.077965974383


In [9]:
# Per-max_index training accuracy/margin. `root` is whatever the most recently
# executed cell assigned; it must be the AdaBoost root for the output below.
test_margin(root)

0 0.52977 0.00354920956848
1 0.53 0.00361140856615
2 0.60512 0.0496783956979
3 0.60533 0.0668664219161
4 0.60531 0.0678749506508
5 0.60531 0.0679002789284
6 0.60531 0.0678996717374
7 0.60531 0.0678990661547
8 0.60531 0.0678984396604
9 0.62077 0.0729109834044


## Performance of LogitBoost

In [10]:
# Take the first element of the LogitBoost result as the model root and report
# its train/test accuracy and margin. This rebinds `root`, replacing the
# AdaBoost root used in the earlier cells.
root = logitboost[0]
test_all(root)

Train accuracy: 0.60724 	Train margin: 0.0652522713828
Test accuracy: 0.6016 	Test margin: 0.0611882294666


In [12]:
# Per-max_index training accuracy/margin. `root` is whatever the most recently
# executed cell assigned; it must be the LogitBoost root for the output below,
# so re-run the previous cell first if the AdaBoost cells ran more recently.
test_margin(root)

0 0.52977 0.00354920956848
1 0.52977 0.00532640302478
2 0.52977 0.00574392252183
3 0.52977 0.00593737428426
4 0.52977 0.00595466358865
5 0.60724 0.0497868658974
6 0.60724 0.0489116802182
7 0.60724 0.0486700041608
8 0.60724 0.0484584697278
9 0.60724 0.0481982798356
