In [1]:
#general
import copy
import random

#math & plot
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

#sklearn
import sklearn.metrics as metrics
from sklearn.externals import joblib
from sklearn import cross_validation as cv

#EF applier
import _matrixnetapplier_untouched as mnet

#persistence
import cPickle
from StringIO import StringIO

#debug purposes
%load_ext autoreload
%autoreload 2

In [2]:
#extract model
# NOTE(security): unpickling executes arbitrary code -- only load formula files from a trusted source.
# The file is opened in binary mode so the pickle bytes reach cPickle unmodified,
# whatever the platform or pickle protocol.
with open('../formula/MSLR10k_ef.mx', 'rb') as f:
    formula = mnet.MatrixnetClassifier(StringIO(cPickle.load(f)))

# The first item produced by iterate_trees() unpacks into the tree depth,
# the reported number of trees and an iterator over the trees themselves.
depth, nTrees, itr = next(formula.iterate_trees())
trees = list(itr)
# The number of trees actually read should equal the count reported by the formula.
print('%s == %s' % (len(trees), nTrees))

9997 == 9997


In [3]:
def load_csv(name):
    """Load one dataset split stored as three comma-separated text files.

    Reads ``<name>.qids.csv``, ``<name>.labels.csv`` and
    ``<name>.features.csv`` (in that order) with ``np.loadtxt`` and prints a
    short progress message before and after reading.

    Parameters
    ----------
    name : str
        Path prefix shared by the three files (everything before
        ``.qids.csv`` / ``.labels.csv`` / ``.features.csv``).

    Returns
    -------
    tuple
        ``(features, qids, labels)`` -- the three loaded numpy arrays.
    """
    #cannot install h5py with obsolete dependencies, upgrading them might cause OS instability
    # Single-argument print(...) emits the same text under Python 2 and Python 3.
    print("reading from %s" % (name,))
    qids = np.loadtxt(name+".qids.csv",delimiter=',')
    labels = np.loadtxt(name+".labels.csv",delimiter=',')
    features = np.loadtxt(name+".features.csv",delimiter=',')
    print("done")
    return features, qids, labels

In [4]:
# Load the MSLR10 train and test splits; load_csv returns (features, qids, labels) for each.
Xtr,Qtr,Ytr = load_csv("../data/MSLR10/mslr_train")
Xts,Qts,Yts = load_csv("../data/MSLR10/mslr_test")

reading from ../data/MSLR10/mslr_train
done
reading from ../data/MSLR10/mslr_test
done


In [5]:
from factory import RegressionFactory
# Wrap the features and labels of each split in a RegressionFactory; the query ids are not passed in.
trainFactory = RegressionFactory(Xtr,Ytr)
testFactory = RegressionFactory(Xts,Yts)

In [6]:
#from formula import PrunedFormula as pf
class PrunedFormula(list):
    """A plain list of trees that additionally carries a ``bias`` value.

    Evaluation is delegated to a factory object: ``predict`` forwards to
    ``factory.predict`` and ``staged_predict`` to ``factory.apply_separately``.
    """

    def __init__(self, trees, bias=0):
        """Store ``bias`` and fill the underlying list with ``trees``."""
        self.bias = bias
        super(PrunedFormula, self).__init__(trees)

    def __repr__(self):
        """Return the bias followed by the regular list representation."""
        return ' '.join((str(self.bias), list.__repr__(self)))

    def predict(self, factory):
        """Return whatever ``factory.predict`` yields for this formula."""
        return factory.predict(self)

    def __add__(self, other):
        """Return a new formula with ``other`` appended, keeping this bias.

        ``other`` has to be exactly a ``list``; anything else (including
        another ``PrunedFormula``) trips the assertion.
        """
        assert type(other) is list
        merged = list(self)
        merged.extend(other)
        return PrunedFormula(merged, self.bias)

    def staged_predict(self, factory):
        """Return whatever ``factory.apply_separately`` yields for this formula."""
        return factory.apply_separately(self)

#training errors

In [7]:
# Full ensemble with an explicit bias of 0.0, evaluated on the training factory.
trees_full = PrunedFormula(trees,bias = 0.0)
pred_zero = trees_full.predict(trainFactory)

# Mean training label: computed once and reused for the printout instead of
# calling np.average a second time, so the stored and the printed value cannot diverge.
avg_labels = np.average(trainFactory.labels)
print('avg of labels =  %s' % (avg_labels,))


avg of labels =  0.665306906714


In [8]:
# Compare the in-sample MSE after shifting pred_zero by the mean label against leaving it unshifted.
print 'in-sample error with that as bias:', metrics.mean_squared_error(trainFactory.labels, pred_zero + avg_labels)
print 'in-sample error with zero bias',  metrics.mean_squared_error(trainFactory.labels, pred_zero + 0.0 )

in-sample error with that as bias: 0.949492985327
in-sample error with zero bias 0.506859705662


In [9]:
#sanity check: score the original formula directly on the raw training features;
#the printed MSE is expected to equal the zero-bias in-sample error reported above
pred_vanilla = formula.apply(Xtr)
print metrics.mean_squared_error(Ytr,pred_vanilla)

0.506859705662


In [10]:
#local optimum: probe the in-sample error when the zero-bias predictions are shifted by +0.1, 0.0 and -0.1
# One loop replaces three copy-pasted lines; each tag is the text that follows
# "zero bias" in the printed label, so the output is unchanged.
for bias_tag, bias_shift in (('+0.1', 0.1), ('    ', 0.0), ('-0.1', -0.1)):
    shifted_mse = metrics.mean_squared_error(trainFactory.labels, pred_zero + bias_shift)
    print('in-sample error with zero bias%s %s' % (bias_tag, shifted_mse))

in-sample error with zero bias+0.1 0.516859705593
in-sample error with zero bias     0.506859705662
in-sample error with zero bias-0.1 0.516859705731


In [11]:
#maybe fake trees.. no
# Print the first tree to check whether the ensemble holds placeholder ("fake") trees.
# NOTE(review): the printed tuple looks like (feature indices, thresholds, leaf values) --
# confirm against the matrixnet applier.
print trees[0]

(array([ 10,  40,  54,  85, 109, 114], dtype=int32), array([  3.74050000e+03,   1.38895938e+04,   5.40750008e-03,
         1.80319788e+03,  -4.29138517e+00,  -2.99575043e+01], dtype=float32), array([ 0.00354839,  0.01      ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.00175404,  0.0005208 ,  0.00033333,  0.00041667,
        0.0075    ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0009901 ,  0.        ,  0.        ,
        0.        ,  0.00450704,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.0049011 ,  0.00220257,
        0.00179039,  0.00105263,  0.00859768,  0.00765957,  0.00244444,
        0.002857

In [12]:
# Print the last tree of the ensemble for the same visual inspection as the first one.
print trees[-1]

(array([ 15,  18,  50,  64, 121, 127], dtype=int32), array([  5.24100018e+00,   1.40037956e+01,   2.10645013e-02,
         4.46384996e-02,  -1.47469349e+01,   2.65000000e+01], dtype=float32), array([ -3.69504294e-05,  -3.23407926e-04,   9.80706050e-05,
        -4.19547761e-06,  -2.23478776e-03,   8.38593842e-05,
         3.82297241e-04,  -1.00824729e-04,  -3.26409230e-03,
         4.76749180e-04,   2.76161266e-04,   7.54881292e-05,
        -1.93530043e-03,   1.31143542e-03,   1.16104901e-03,
        -6.52607533e-05,   4.34667704e-05,   7.09076759e-05,
        -4.27774548e-04,  -2.04795704e-06,   1.03463501e-05,
         1.38564279e-04,   6.43983699e-04,   6.73037852e-05,
         8.84098043e-05,   1.94178283e-04,  -2.68877388e-04,
        -5.26592072e-05,   3.46758229e-06,   3.04152572e-05,
        -4.27581505e-04,  -1.45885337e-04,  -9.97970761e-04,
        -6.07318585e-04,   1.20987550e-05,  -2.75745581e-06,
        -2.92360378e-03,  -1.25605376e-03,   1.52252043e-05,
         7.9648