In [1]:
#general
import copy
import random

#math & plot
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

#sklearn
import sklearn.metrics as metrics
from sklearn.externals import joblib
from sklearn import cross_validation as cv

#EF applier
import _matrixnetapplier as mnet

#persistance
import cPickle
from StringIO import StringIO

#debug purposes
%load_ext autoreload
%autoreload 2

# Extracting the trained model

In [2]:
# Load the pickled formula and unpack its trees.
# NOTE(security): cPickle.load can execute arbitrary code -- only load formula files you trust.
# Pickles are binary data, so the file is opened in 'rb'; text mode can corrupt them on some platforms.
with open('../formula/MSLR10k_ef.mx', 'rb') as f:
    formula = mnet.MatrixnetClassifier(StringIO(cPickle.load(f))) #btw he's a regressor, not classifier

# Only the first (depth, nTrees, tree-iterator) triple produced by iterate_trees() is used.
depth, nTrees, itr = next(formula.iterate_trees())
trees = list(itr)
# Sanity check: the number of extracted trees should equal the declared count.
print("%s == %s" % (len(trees), nTrees))

9997 == 9997


# Loading dataset

In [3]:
def save_as_h5(path_to_txt,output_name="mslr"):
    """Parse a ranking dataset in text form and store it in an HDF5 file.

    Every non-empty line is read as whitespace-separated tokens: the first
    token is the integer label, every following token contributes the number
    found after its first ':' (the part before the colon is ignored, values
    are taken positionally). The first of those numbers is the query id, the
    rest are the feature values. Anything after a '#' is treated as a comment.

    Parameters
    ----------
    path_to_txt : str
        Path of the text file to read.
    output_name : str
        Path of the HDF5 file to create (datasets 'qids', 'labels', 'features').

    Returns
    -------
    (features, query, labels) : tuple of numpy arrays
        Rows sorted by query id; rows of the same query keep their file order.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    # h5py is not imported by the notebook's import cell, so bring it in here
    # (this also fails fast, before the slow parsing, if h5py is unavailable).
    import h5py

    print("opening "+path_to_txt)
    labels = []
    features = []
    with open(path_to_txt) as f:
        print("extracting...")
        for line in f:
            # Remove a trailing comment. Splitting (instead of slicing at find('#') - 1)
            # leaves lines WITHOUT a comment intact rather than chopping their last characters.
            ls = line.split('#', 1)[0].split()
            if not ls:
                continue  # blank or comment-only line
            labels.append(int(ls[0]))
            features.append([float(x.split(':', 1)[-1]) for x in ls[1:]])
    if not labels:
        raise ValueError("no data rows found in " + path_to_txt)
    print("converting & sorting...")
    labels = np.asarray(labels, dtype=np.int32)
    features = np.asarray(features)
    query = features[:, 0].astype(int)  # first parsed column is the query id
    features = features[:, 1:]
    # stable sort: rows belonging to one query stay in their original order
    sorter = np.argsort(query, kind='mergesort')
    query,labels,features = query[sorter],labels[sorter],features[sorter]
    print("saving...")
    # the context manager closes the file even if writing a dataset fails
    with h5py.File(output_name, 'w') as h5f:
        h5f.create_dataset('qids', data=query)
        h5f.create_dataset('labels', data=labels)
        h5f.create_dataset('features', data=features)
    print("done")
    return features,query,labels

In [4]:
def save_csv(path_to_txt,output_name = "mslr"):
    """Parse a ranking dataset in text form and store it as three CSV files.

    Every non-empty line is read as whitespace-separated tokens: the first
    token is the integer label, every following token contributes the number
    found after its first ':' (the part before the colon is ignored, values
    are taken positionally). The first of those numbers is the query id, the
    rest are the feature values. Anything after a '#' is treated as a comment.

    Parameters
    ----------
    path_to_txt : str
        Path of the text file to read.
    output_name : str
        Prefix of the output files: '<prefix>.qids.csv', '<prefix>.lavels.csv'
        and '<prefix>.features.csv'.

    Returns
    -------
    (features, query, labels) : tuple of numpy arrays
        Rows sorted by query id; rows of the same query keep their file order.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    print("opening "+path_to_txt)
    labels = []
    features = []
    with open(path_to_txt) as f:
        print("extracting...")
        for line in f:
            # Remove a trailing comment. Splitting (instead of slicing at find('#') - 1)
            # leaves lines WITHOUT a comment intact rather than chopping their last characters.
            ls = line.split('#', 1)[0].split()
            if not ls:
                continue  # blank or comment-only line
            labels.append(int(ls[0]))
            features.append([float(x.split(':', 1)[-1]) for x in ls[1:]])
    if not labels:
        raise ValueError("no data rows found in " + path_to_txt)
    print("converting & sorting...")
    labels = np.asarray(labels, dtype=np.int32)
    features = np.asarray(features)
    query = features[:, 0].astype(int)  # first parsed column is the query id
    features = features[:, 1:]
    # stable sort: rows belonging to one query stay in their original order
    sorter = np.argsort(query, kind='mergesort')
    query,labels,features = query[sorter],labels[sorter],features[sorter]
    print("saving...")
    np.savetxt(output_name+".qids.csv",query,delimiter=',')
    # The misspelt ".lavels.csv" suffix is kept on purpose: load_csv reads exactly
    # this name, and files written earlier already use it.
    np.savetxt(output_name+".lavels.csv",labels,delimiter=',')
    np.savetxt(output_name+".features.csv",features,delimiter=',')
    print("done")
    return features,query,labels

In [5]:
%%time
# WARNING: the conversions below can take a long time. There is no need to rerun them once the CSV files exist.
# Uncomment the calls to regenerate the CSV files from the Fold1 text files.
#save_csv("../data/MSLR10/Fold1/train.txt","../data/MSLR10/mslr_train")
#save_csv("../data/MSLR10/Fold1/test.txt","../data/MSLR10/mslr_test")
#save_csv("../data/MSLR10/Fold1/vali.txt","../data/MSLR10/mslr_vali")
print "converted that"

converted that
CPU times: user 32 µs, sys: 7 µs, total: 39 µs
Wall time: 42 µs


In [6]:
# load a dataset stored in an HDF5 file
def load_h5(name):
    """Read the 'features', 'qids' and 'labels' datasets from an HDF5 file.

    Parameters
    ----------
    name : str
        Path of the HDF5 file to read.

    Returns
    -------
    (features, qids, labels) : tuple
        The full contents of the three datasets, read into memory.
    """
    # h5py is not imported by the notebook's import cell, so bring it in here
    import h5py

    print("reading from %s" % (name,))
    # the context manager closes the file even if one of the datasets is missing
    with h5py.File(name,'r') as h5f:
        labels = h5f['labels'][:]
        qids = h5f['qids'][:]
        features = h5f['features'][:]
    print("done")
    return features, qids, labels

In [7]:
def load_csv(name):
    """Load the three CSV files sharing the prefix `name`.

    Reads '<name>.qids.csv', '<name>.lavels.csv' (the misspelt suffix is the
    name the files are actually stored under) and '<name>.features.csv'.

    Parameters
    ----------
    name : str
        Common path prefix of the three files.

    Returns
    -------
    (features, qids, labels) : tuple of numpy arrays
        `features` always has two dimensions and `qids` / `labels` always have
        one, even when the files hold a single row.
    """
    print("reading from %s" % (name,))
    # ndmin stops loadtxt from collapsing one-row files into lower-rank arrays
    # (a 1-D features array or 0-d qids/labels), which would break row indexing.
    qids = np.loadtxt(name+".qids.csv",delimiter=',',ndmin=1)
    labels = np.loadtxt(name+".lavels.csv",delimiter=',',ndmin=1)
    features = np.loadtxt(name+".features.csv",delimiter=',',ndmin=2)
    print("done")
    return features, qids, labels

In [8]:
# Load the pre-converted train and test splits.
# load_csv returns (features, qids, labels), hence X* = features, Q* = query ids, Y* = labels.
Xtr,Qtr,Ytr = load_csv("../data/MSLR10/mslr_train")
Xts,Qts,Yts = load_csv("../data/MSLR10/mslr_test")

reading from ../data/MSLR10/mslr_train
done
reading from ../data/MSLR10/mslr_test
done


In [9]:
from factory import RegressionFactory
# Uncomment the next two lines to build the training factory from the first 1000 rows only.
#s_ind = np.arange(1000)
#trainFactory = RegressionFactory(Xtr[s_ind],Ytr[s_ind])
# The factory is just a data wrapper that can handle splits, predictions, etc.
# It is used to avoid recomputing metadata at each prediction and passing long argument lists.
trainFactory = RegressionFactory(Xtr,Ytr)
testFactory = RegressionFactory(Xts,Yts)

In [10]:
# Report the size of each split and verify that train and test share no query ids.
# Each set is built once and reused; the qid arrays have hundreds of thousands of entries.
train_qids = set(Qtr)
test_qids = set(Qts)
shared_qids = train_qids & test_qids
print("train:  %s qids: %s" % (Xtr.shape, len(train_qids)))
print("test:  %s qids: %s" % (Xts.shape, len(test_qids)))
print("qid intersection: %s (must be 0)" % len(shared_qids))
# Enforce the "must be 0" requirement instead of relying on someone reading the output.
assert not shared_qids, "train and test sets share query ids"

train:  (723412, 136) qids: 6000
test:  (241521, 136) qids: 2000
qid intersection: 0 (must be 0)


# greedy pruning for the whole data

In [11]:
import greedy
from loss_functions import MSELoss

In [12]:
%%time
# Greedy pruning on the whole training set; the result is consumed later via testFactory.predict(res_greedy).
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so res_greedy
# may be undefined unless the cell is run to completion.
res_greedy = greedy.greed_up_features_bfs(trees,trainFactory,
                                          loss = MSELoss,
                                          learning_rate = .003,
                                          learning_rate_decay=1.,# no decay
                                          nTrees =600,
                                          trees_sample_size =500, #chosen from the ensemble at random each iteration
                                          verbose = True,
                                          regularizer=0.0004, #added to gradient walker's leaf denominator
                                          use_joblib=True,
                                          n_jobs=-1,
                                          joblib_method="threads" #every GIL-ly thing is copied anyways
                                          )


iteration # 0  ntrees =  1 
best loss =  806973.9353
learning_rate =  0.003
sample_size 500

iteration # 1  ntrees =  2 
best loss =  804820.873461 
last loss =  804820.873461
learning_rate =  0.003
sample_size 500

iteration # 2  ntrees =  3 
best loss =  802672.885476 
last loss =  802672.885476
learning_rate =  0.003
sample_size 500

iteration # 3  ntrees =  4 
best loss =  800541.538417 
last loss =  800541.538417
learning_rate =  0.003
sample_size 500

iteration # 4  ntrees =  5 
best loss =  798413.542072 
last loss =  798413.542072
learning_rate =  0.003
sample_size 500

iteration # 5  ntrees =  6 
best loss =  796292.841919 
last loss =  796292.841919
learning_rate =  0.003
sample_size 500

iteration # 6  ntrees =  7 
best loss =  794202.781225 
last loss =  794202.781225
learning_rate =  0.003
sample_size 500

iteration # 7  ntrees =  8 
best loss =  792111.511678 
last loss =  792111.511678
learning_rate =  0.003
sample_size 500

iteration # 8  ntrees =  9 
best loss =  7900

KeyboardInterrupt: 

In [None]:
# Baseline test-set predictions: the first 600 trees of the original ensemble ("stupid")
# and all of its trees ("full"). 600 matches nTrees used for the greedy pruning.
y_pred_stupid = greedy.predict(testFactory,trees[:600])
y_pred_full = greedy.predict(testFactory,trees)

In [None]:
# Test-set predictions of the greedily pruned ensemble (requires res_greedy from the greedy pruning cell).
y_pred_greedy = testFactory.predict(res_greedy)

In [None]:
# Test MSE in the order: greedy pruning, first-600-trees baseline, full ensemble.
# The trailing commas keep the three numbers on one output line.
print metrics.mean_squared_error(Yts,y_pred_greedy),
print metrics.mean_squared_error(Yts,y_pred_stupid),
print metrics.mean_squared_error(Yts,y_pred_full)
print "well..."

# Hierarchy

In [None]:
# Usage distribution of the split thresholds found in the ensemble.
# NOTE(review): the meaning of the positional 0.001 argument is not visible here -- confirm against mnet.get_thresholds.
# The original line read "b=n_jobs = 4", which is a SyntaxError; n_jobs is the keyword used by the other joblib-enabled calls.
thresholds = mnet.get_thresholds(trees,formula.feature_ids,0.001,use_joblib = True,n_jobs = 4)

In [None]:
# Scatter column 2 of `thresholds` against the row index, then count the rows whose column-2 value exceeds 150.
# NOTE(review): column 2 is presumably the number of times a threshold is used in the ensemble -- confirm.
plt.scatter(range(len(thresholds)),thresholds[:,2])
print sum(thresholds[:,2] >150)

# Criteria selection

In [None]:
import hierarchy

In [None]:
# Keep only frequently used thresholds and select the split criteria from them.
thresholds_active = thresholds[thresholds[:,2]>100] #at least 100 times used in the original ensemble
print len(thresholds_active)
# NOTE(review): the meaning of the positional arguments 4 and True is not visible here -- confirm against hierarchy.select_criteria.
criteria = hierarchy.select_criteria(trainFactory,thresholds_active,4,True)

In [None]:
# Display the selected split criteria.
criteria

In [None]:
# Disabled experiment: split the training factory by the criteria and inspect the resulting sub-samples.
#split = hierarchy.split_upper(trainFactory,criteria,equalizeWeights=False,split_weights=1.,split_inclusion=.7) 
# At each split, a fraction split_inclusion of the examples from the other half of the sample
# is added to the sub-sample with weight split_weights.
#print [split[i].events.shape[0] for i in split]
#print [sum(split[i].weights) for i in split]

# Hierarchical stuff

In [None]:
%%time
# Train the split (hierarchical) ensembles on the training factory using the selected criteria.
# Note that it equalizes weights, which might be suboptimal.
trees_splitted = hierarchy.train_splitted_boosts(trees, trainFactory,criteria,
                                                 breadth = 1,
                                                 loss = MSELoss,
                                                 learning_rate = 0.005, 
                                                 nTrees_leaf= 600,
                                                 trees_sample_size=500,
                                                 regularizer =0.0004,
                                                 verbose=True,use_joblib = True,n_jobs = -1,
                                                 weights_outside_leaf = 0.75**.25, inclusion_outside_leaf = 0.**.25) # 0.75**.25 is about 0.93; 0.**.25 evaluates to 0.0


In [None]:
# Test-set predictions of the split ensembles trained above.
y_pred_splitted= hierarchy.predict_splitted(testFactory,criteria,trees_splitted)

In [None]:
# Compare the test MSE of all four models.
# w_test is assigned but not used by the metric calls in this cell.
w_test = testFactory.weights
# Yts is rebound to the labels held by the test factory.
Yts = testFactory.labels
print 'spltd\t',metrics.mean_squared_error(Yts,y_pred_splitted)
print 'greedy\t',metrics.mean_squared_error(Yts,y_pred_greedy)
print 'stupid\t',metrics.mean_squared_error(Yts,y_pred_stupid)
print 'full\t',metrics.mean_squared_error(Yts,y_pred_full)
print "well..."

# MSE learning curves

In [None]:
def learning_curve(formula,factory, metric,split_criteria = None):
    """Evaluate `metric` after every boosting step of an ensemble.

    Parameters
    ----------
    formula :
        Without `split_criteria`: an iterable of trees; each tree is one step.
        With `split_criteria`: a mapping code -> sequence of trees; step i
        combines the i-th tree of every code.
    factory :
        Data wrapper exposing `labels`, `weights` and `predict(list_of_trees)`.
    metric : callable
        Called as metric(labels, predictions, sample_weight=weights).
    split_criteria :
        When given, predictions come from
        hierarchy.predict_splitted(factory, split_criteria, trees_of_step).

    Returns
    -------
    list
        Metric values; entry 0 is the score of the all-zero prediction and
        entry k the score after k steps.
    """
    splitted = split_criteria is not None

    if splitted:
        # One step per tree index. The original looped over the mapping's keys and indexed
        # with an undefined `i`. Stop at the shortest list so every code contributes to each step.
        n_steps = min(len(formula[code]) for code in formula) if formula else 0
        steps = range(n_steps)
    else:
        steps = formula

    Ypred = np.zeros(len(factory.labels))
    lcurve = [metric(factory.labels, Ypred,sample_weight = factory.weights)]

    for step in steps:
        if splitted:
            # NOTE(review): each code's single tree is passed as-is, like the original expression;
            # confirm whether predict_splitted expects a list of trees per code instead.
            trees_i = {code:formula[code][step] for code in formula}
            tree_pred = hierarchy.predict_splitted(factory,split_criteria,trees_i)
        else:
            tree_pred = factory.predict([step])

        Ypred += tree_pred  # running sum of the per-step predictions

        lcurve.append(metric(factory.labels, Ypred,sample_weight = factory.weights))

    return lcurve

In [None]:
metric_name = 'MSE'
n_trees = 600
metric = metrics.mean_squared_error
# NOTE(review): presumably the test MSE of the full ensemble copied from an earlier run -- confirm.
full_mse = 0.935834322801

# n_trees is used everywhere instead of repeating the literal 600.
stupid_lcurve = learning_curve(trees[:n_trees],testFactory,metric)
greedy_lcurve = learning_curve(res_greedy[:n_trees],testFactory,metric)
# The split curve needs the per-leaf trees; the original passed res_greedy here, which is
# what the "greedy" curve already uses.
# NOTE(review): assumes trees_splitted is the code -> trees mapping learning_curve expects -- confirm.
splitted_lcurve = learning_curve(trees_splitted,testFactory,metric,criteria)

plt.figure(figsize = [14,14])
plt.plot([1, n_trees],[full_mse, full_mse],label = "full")
for curve_label, lcurve in [("stupid", stupid_lcurve),
                            ("greedy", greedy_lcurve),
                            ("splitted", splitted_lcurve)]:
    # Entry 0 is the score before any tree is added; entry k is the score with k trees.
    # Slicing up to n_trees + 1 keeps the last point, and x is derived from the curve's
    # own length so a shorter curve still plots.
    scores = lcurve[1:n_trees + 1]
    plt.plot(range(1, len(scores) + 1),scores,label = curve_label)
plt.title('learning curves('+metric_name+')')
plt.xlabel('number of trees')
plt.ylabel(metric_name)
plt.legend(loc="lower right")
