In [1]:
import pickle

from pyboost import controllers as pb
from pyspark import SparkContext

# Create the Spark context used by every later cell. The "local[2]" master URL
# asks Spark to run locally with two worker threads (see the Spark master-URL docs).
sc = SparkContext(master="local[2]")

In [2]:
# Load the pre-split (train/test) HIGGS sample from disk.
# The file is opened in binary mode and inside a `with` block so the handle is
# always closed, even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on data files you produced yourself or otherwise trust.
with open("../experiments/higgs/data/higgs-data-1p.pickle", "rb") as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
# Distribute the training samples as an RDD.
X = sc.parallelize(X_train)
# Distribute the training labels, each remapped via `label + label - 1`
# (i.e. 0 -> -1 and 1 -> +1; presumably the raw labels are 0/1 -- TODO confirm).
signed_labels = [label + label - 1 for label in y_train]
y = sc.parallelize(signed_labels)

In [4]:
# Round count shared by the whole notebook: passed as T to both
# pb.run_adtree_* training calls and used as the upper bound of the
# max_index sweep in test_margin.
num_round = 10

In [5]:
# Train with pb.run_adtree_adaboost for T=num_round rounds on the label RDD `y`
# and the sample RDD `X` (note the argument order: labels before samples).
# quiet=False is presumably what enables the per-iteration log shown in the
# cell output -- TODO confirm against pyboost.
# The result is later passed as `nodes` to test_all / test_margin.
adaboost = pb.run_adtree_adaboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.9969926595430324, 0.99863620190558489, 0.99853220747575011, 0.99424008684579512, 0.99869017878872046, 0.99701387208426484, 0.9989750758279522, 0.99840136137359969, 0.99916246113569862, 0.99812604216472278, 0.99844268574043793, 0.9989846029942161, 0.99797038014459105, 0.99846949788627093, 0.99854903050157218, 0.99889548341670109, 0.9995938929889383, 0.99792682451060921, 0.99919928229157173, 0.99824205105278296, 0.99996322431539619, 0.99821602213215299, 0.99494948596449817, 0.99888371916788976, 0.99520284405808679, 0.9797591152634012, 0.99233152757315979, 0.98430478436071411]
Min score: 0.979759115263
Purity (farther from 1.0 is better): (1.3553128049669718, 0.56049457424717108)
Predicts (farther from 0.0 is better): (0.15201614003175204, -0.28946785835903821)
Split node: 0 (left)
Split index and value: 25 0.995424747467 

=== Iteration 2 ===
Score (sorted by index): [0.99679959434815346, 0.99772612263687499, 0.99859289781450455, 0.99734976

In [8]:
# Train with pb.run_adtree_logitboost for T=num_round rounds on the label RDD
# `y` and the sample RDD `X` (note the argument order: labels before samples).
# quiet=False is presumably what enables the per-iteration log shown in the
# cell output -- TODO confirm against pyboost.
# The result is later passed as `nodes` to test_all / test_margin.
logitboost = pb.run_adtree_logitboost(sc, y, X, T=num_round, quiet=False)

=== Iteration 1 ===
Score (sorted by index): [0.99457455462809874, 0.9992027170680694, 0.99747774785329568, 0.99282811139918703, 0.99833348281764678, 0.99478083120169081, 0.9975311819470537, 0.99915739875018506, 0.9988432535755889, 0.99854960103612211, 0.99686044608937319, 0.99852474065850283, 0.99812489063190235, 0.99835175348835514, 0.99816670201893964, 0.9978074465036797, 0.99795604769916924, 0.99815642182942999, 0.99845926936942275, 0.99595550120273113, 0.99863804882348628, 0.99753338562355875, 0.99505392510955126, 0.9978816138310791, 0.99633779687694057, 0.97461538211441812, 0.99285170333338257, 0.98194760143874382]
Min score: 0.974615382114
Purity (farther from 1.0 is better): (1.4032316420944626, 0.52411631746729659)
Predicts (farther from 0.0 is better): (0.16938894616741476, -0.32302081969778318)
Split node: 0 (left)
Split index and value: 25 1.0768276453 

=== Iteration 2 ===
Score (sorted by index): [0.99346193239160474, 0.99489841397974343, 0.99329898018130547, 0.9930084521

# Performance

In [10]:
from pyboost.adtree import run_tree
from pyboost.utils import safe_comp

def test_error(nodes, y, X, max_index=None):
    """Score a trained model on a labelled dataset.

    Despite the name, the first returned value is the accuracy
    (1 - misclassification rate), not the error.

    Args:
        nodes: model handed straight to run_tree (in this notebook, the value
            returned by one of the pb.run_adtree_* calls).
        y: raw labels; each one is remapped via ``label + label - 1``
            (0 -> -1, 1 -> +1) before being compared with the prediction.
        X: samples, iterated in lock-step with ``y``.
        max_index: forwarded unchanged to run_tree; presumably restricts how
            much of the model is evaluated -- TODO confirm against pyboost.

    Returns:
        A ``(accuracy, mean_margin)`` tuple, where the margin of one sample is
        the raw run_tree score multiplied by its signed label, and both values
        are normalised by ``len(y)``.
    """
    mistakes = 0
    margin_sum = 0.0
    for sample, raw_label in zip(X, y):
        signed_label = raw_label + raw_label - 1
        score = run_tree(0, nodes, sample, max_index=max_index)
        # safe_comp(score) reduces the raw score to a prediction; a truthy
        # safe_comp(prediction, label) is counted as a misclassification.
        if safe_comp(safe_comp(score), signed_label):
            mistakes += 1
        margin_sum += score * signed_label
    return 1.0 - float(mistakes) / len(y), margin_sum / len(y)


def test_all(nodes):
    train_accuracy, train_margin = test_error(nodes, y_train, X_train)
    test_accuracy, test_margin = test_error(nodes, y_test, X_test)
    print "Train accuracy:", train_accuracy, '\t',
    print "Train margin:", train_margin
    print "Test accuracy:", test_accuracy, '\t',
    print "Test margin:", test_margin


def test_margin(nodes):
    for max_index in range(num_round + 1):
        accuracy, margin = test_error(nodes, y_train, X_train, max_index=max_index)
        print max_index, accuracy, margin

## Performance of AdaBoost

In [None]:
# Report train/test accuracy and margin for the AdaBoost model, then the
# training-set accuracy/margin for each max_index from 0 to num_round.
test_all(adaboost)
test_margin(adaboost)

Train accuracy: 0.66026 	Train margin: 0.17679498566
Test accuracy: 0.6531 	Test margin: 0.168592628857
0 0.52977 0.00354920956848
1 0.60724 0.0468020159013
2 0.62567 0.0809744648444
3 0.62723 0.0981741554166


## Performance of LogitBoost

In [None]:
# Report train/test accuracy and margin for the LogitBoost model, then the
# training-set accuracy/margin for each max_index from 0 to num_round.
test_all(logitboost)
test_margin(logitboost)