### Beetle


> @article{beetle,
  author    = {Rahul Krishna and
               Vivek Nair and
               Pooyan Jamshidi and
               Tim Menzies},
  title     = {Whence to Learn? Transferring Knowledge in Configurable Systems using
               {BEETLE}},
  journal   = {CoRR},
  volume    = {abs/1911.01817},
  year      = {2019},
  url       = {http://arxiv.org/abs/1911.01817},
  archivePrefix = {arXiv},
  eprint    = {1911.01817},
  timestamp = {Mon, 11 Nov 2019 18:38:09 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1911-01817.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

**Beetle** is a transfer learning approach defined by Krishna et al that relies on *source selection*. 
Given a (set of) input(s), the goal is to rank the sources by performance, in order to discover a bellwether input from which we can easily transfer performances (i.e. find the best source). 
Then, we transfer performances from this bellwether input to all inputs of the test set. 
We only apply the discovery phase (i.e. the search of bellwether) on the training set, to avoid introducing any bias in the results. 

#### Libraries

In [3]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# Elasticnet is an hybrid method between ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split, GridSearchCV
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# Support vector machine - support vector regressor
from sklearn.svm import SVR
# decision trees
from sklearn.tree import DecisionTreeRegressor, plot_tree
# mean squared error
from sklearn.metrics import mean_squared_error

# gradient boosting trees
from xgboost import XGBRegressor

# we use it to interact with the file system
import os
# compute time
from time import time

# Neural network high level framework
import keras
# Sequential is a sequence of blocs
# Input deals with the data fed to the network
from keras.models import Sequential,Input,Model
# Dense is a feedforward layer with fully connected nodes
# Dropout allows to keep part of data, and to "drop out" a the rest
# Flatten makes the data "flat", i.e. in one dimension
from keras.layers import Dense, Dropout, Flatten
# Conv -> convolution, MaxPooling is relative to Pooling
# Activation if the function composing the data in output of a layer
from keras.layers import Conv2D, MaxPooling2D, Activation

Using TensorFlow backend.


In [110]:
class Beetle:
    
    def __init__(self):
        #self.pct_test = pct_test
        #self.ratio_exploitation = ratio_exploitation
        
        # the data folder, see the markdown there for additional explanations
        res_dir = "../../../data/ugc/res_ugc/"
        
        # the list of videos names, e.g. Animation_360P-3e40
        # we sort the list so we keep the same ids between two launches
        v_names = sorted(os.listdir(res_dir))

        self.predDimension = "kbs"
        
        # the list of measurements
        listVideo = []

        # we add each dataset in the list, converting the time to the right format
        # third line asserts that the measures are complete
        for v in v_names:
            data = pd.read_table(res_dir+v, delimiter = ',')
            inter = pd.get_dummies(data)
            inter[self.predDimension] = data[self.predDimension]
            listVideo.append(inter)
        
        self.listVideo = listVideo
        
        
        # to sample the source and the target using the same seed
        self.random_state = np.random.randint(0,1000)
        
        self.features = ['cabac', '8x8dct', 'mbtree', 'rc_lookahead', 'analyse', 'me', 'subme', 'mixed_ref', 'me_range', 
                 'qpmax', 'aq-mode', 'trellis','fast_pskip', 'chroma_qp_offset', 'bframes', 'b_pyramid', 
                 'b_adapt', 'direct', 'ref', 'deblock', 'weightb', 'open_gop', 'weightp', 'scenecut']
    
    def mse(self, y_true, y_pred):
        return np.mean((y_true-y_pred)**2)
         
    def find_bellwether(self, index_train):
        
        # GOAL : Find a good input source to transfer performances
        
        # first, we isolate the training set, and separate in two categories
        
        listInputs = [self.listVideo[i] for i in index_train]
        
        # on one side the sources,on the other side the targets
        sources_index, targets_index = train_test_split(index_train, test_size = 50)
        
        #sources = [self.listVideo[i] for i in sources_index]
        #targets = [self.listVideo[i] for i in targets_index]
        
        results_beetle = dict()
        
        # for these 50 target videos, we transfer the performances of all the sources
        # and keep the best source (i.e. the one leading to the best mse)
        # finally, the bellwether is the video of the training set that
        # give the best results for most of the target videos
        
        for index_t in targets_index:
            t = self.listVideo[index_t]
            X_tgt = t.drop([self.predDimension], axis = 1)
            y_tgt = np.array(t[self.predDimension], dtype=float)
            for index_s in sources_index:
                s = self.listVideo[index_s]
                X_src = s.drop([self.predDimension], axis = 1)
                y_src = np.array(s[self.predDimension], dtype=float)
                X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src, 
                                                                                y_src, 
                                                                                train_size=0.7)
                la = RandomForestRegressor()
                la.fit(X_src_train, y_src_train)
                y_tgt_pred = la.predict(X_src)
                results_beetle[(index_t, index_s)] = self.mse(y_tgt_pred, y_tgt)
            
        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return results_beetle
    
    def learn(self, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor):

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)
        
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        source = self.listVideo[459]
        X_src = source.drop([self.predDimension], axis = 1)
        y_src = np.array(source[self.predDimension], dtype=float)
        X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src, 
                                                                            y_src, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)
        
        lf = learning_algorithm()
        lf.fit(X_src_train, y_src_train)
        y_src_pred = np.array(lf.predict(X_src)).reshape(-1,1)
        
        target = self.listVideo[target_id]
        X_tgt = target.drop([self.predDimension], axis = 1)
        y_tgt = np.array(source[self.predDimension], dtype=float)

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return self.mse(y_src_pred, y_tgt)
    
    def predict_conf(self, target_id, train_size, 
                    learning_algorithm = RandomForestRegressor):

        # random state , i.e. a seed to split the source and the target datasets
        # by using the same set of configurations for training and testing
        random_state = np.random.randint(0,1000)
        
        #print(X_src_train.shape)
        # We define the target video, and split it into train-test
        source = self.listVideo[459]
        X_src = source.drop([self.predDimension], axis = 1)
        y_src = np.array(source[self.predDimension], dtype=float)
        X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src, 
                                                                            y_src, 
                                                                            train_size=train_size, 
                                                                            random_state=random_state)
        
        lf = learning_algorithm()
        lf.fit(X_src_train, y_src_train)
        y_src_pred_test = np.array(lf.predict(X_src)).reshape(-1,1)

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 

        # We return the mean average percentage error 
        # between the real values of y_test from target 
        # and the predictions shifted 
        return np.argmin(y_src_pred_test)

In [111]:
beetle = Beetle()

In [112]:
# the data folder, see the markdown there for additional explanations
res_dir = "../../../data/ugc/res_ugc/"

# the list of videos names, e.g. Animation_360P-3e40
# we sort the list so we keep the same ids between two launches
v_names = sorted(os.listdir(res_dir)) 

v_names_train = np.loadtxt("../../../results/raw_data/train_names.csv", dtype= str)
v_names_test = np.loadtxt("../../../results/raw_data/test_names.csv", dtype= str)
index_train = [i for i in range(len(v_names)) if v_names[i] in v_names_train]
index_test = [i for i in range(len(v_names)) if v_names[i] in v_names_test]

train_sizes = np.arange(5,31,5)

#### Find a bellwhether input video

We search for a good source of transfer

In [113]:
#results_mse_beetle = beetle.find_bellwether(index_train)

In [114]:
results_mse_beetle

{(1371, 261): 32026.72760816848,
 (1371, 287): 320050.5104023576,
 (1371, 249): 184835.74623490975,
 (1371, 377): 15753760.742637599,
 (1371, 426): 112675.15815496627,
 (1371, 505): 1165279.6897556633,
 (1371, 475): 23109363.877881587,
 (1371, 410): 272447.0251584397,
 (1371, 91): 36373210.61039268,
 (1371, 20): 7332512.447100236,
 (1371, 903): 2554117382.5365143,
 (1371, 560): 143496781.50428247,
 (1371, 114): 616175.9780099022,
 (1371, 895): 2518136802.4581437,
 (1371, 232): 538623710.4305638,
 (1371, 366): 739211.9410972204,
 (1371, 1074): 5324012.144146831,
 (1371, 1254): 597025161.1266451,
 (1371, 297): 207516174.06544486,
 (1371, 623): 18692919.73912521,
 (1371, 613): 1619704.7821560726,
 (1371, 1057): 580202.530028709,
 (1371, 1167): 124360112.41314265,
 (1371, 768): 124235677.97327493,
 (1371, 876): 83691369.62872286,
 (1371, 551): 1395049.9803319334,
 (1371, 160): 81833332.7864425,
 (1371, 387): 118942.20226204966,
 (1371, 950): 200709.285787804,
 (1371, 406): 87063.9058399579

In [115]:
target_ids = pd.Series([a[0] for a in results_mse_beetle.keys()]).unique()
target_ids

array([1371, 1345,  423,  646, 1303,  793,  797,  171,  102,  433,  839,
       1079, 1202,  431,  383,  723,  564, 1312,  314,  412, 1121,  642,
        927,  692,  815, 1011,  699,  516,   88,  128,  758,  142,  207,
        961,  855,  586, 1267, 1363, 1294,  605,   48, 1314,  728, 1062,
        539,  359,  345, 1151,  543,   61])

In [116]:
source_ids = pd.Series([a[1] for a in results_mse_beetle.keys()]).unique()
source_ids[0:100]

array([ 261,  287,  249,  377,  426,  505,  475,  410,   91,   20,  903,
        560,  114,  895,  232,  366, 1074, 1254,  297,  623,  613, 1057,
       1167,  768,  876,  551,  160,  387,  950,  406,  748,  834,  766,
        143,  408,  166, 1028, 1288,  628,  407, 1389, 1014, 1367,  844,
        474,  849, 1016, 1209,  522,  227,  358, 1000,  512, 1051,  326,
         26,  976, 1243,  203,  558, 1160, 1052,  508,  312,  983,  663,
       1059,  154,  681,  776, 1230, 1017, 1126,  251,  878, 1347,  603,
         84,  477, 1236,  686,  929,  701,  514, 1112,  655,  293,  453,
       1129, 1394,   31, 1140,   95,  703,  390,  676, 1390,  443, 1037,
       1284])

In [117]:
mse_data = pd.DataFrame({"source_ids" : source_ids})
for tid in target_ids:
    mse_data[str(tid)] = [results_mse_beetle[(tid, sid)] for sid in source_ids]
# mse_data.to_csv("../../../results/raw_data/mse_beetle_results.csv")
mse_data

Unnamed: 0,source_ids,1371,1345,423,646,1303,793,797,171,102,...,48,1314,728,1062,539,359,345,1151,543,61
0,261,3.202673e+04,2.262743e+05,1.072165e+07,4.318441e+06,6.526991e+09,3.082167e+05,6.188199e+05,3.085057e+08,3.970492e+07,...,1.614989e+07,3.625737e+09,5.176012e+04,5.174250e+05,1.245013e+06,9.801105e+07,1.030111e+09,1.121817e+08,6.663460e+05,3.497908e+05
1,287,3.200505e+05,9.599532e+05,1.242343e+07,6.127276e+06,6.580498e+09,8.680487e+05,3.887977e+05,3.203447e+08,4.159576e+07,...,1.857686e+07,3.668429e+09,4.242752e+05,8.408055e+05,7.580978e+05,1.049041e+08,1.035491e+09,1.198932e+08,1.088139e+06,2.244201e+05
2,249,1.848357e+05,8.216849e+05,1.353607e+07,6.448925e+06,6.597675e+09,7.564058e+05,1.130988e+05,3.241879e+08,4.490556e+07,...,1.947765e+07,3.679374e+09,3.231118e+05,1.208609e+06,4.473185e+05,1.070320e+08,1.053315e+09,1.215788e+08,1.266719e+06,3.060931e+04
3,377,1.575376e+07,1.232790e+07,2.688073e+06,3.739861e+06,5.932545e+09,1.319532e+07,2.202944e+07,1.908584e+08,1.005446e+07,...,3.340026e+06,3.182949e+09,1.479766e+07,1.141033e+07,2.513985e+07,3.707783e+07,8.392269e+08,4.841957e+07,1.236765e+07,2.025369e+07
4,426,1.126752e+05,5.385922e+05,1.123770e+07,5.116559e+06,6.548042e+09,5.294205e+05,4.804214e+05,3.137778e+08,4.006631e+07,...,1.708838e+07,3.646864e+09,1.800589e+05,5.549259e+05,9.743204e+05,1.007897e+08,1.029250e+09,1.156306e+08,7.363353e+05,2.561176e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,422,1.540944e+05,3.308302e+05,9.652681e+06,4.185377e+06,6.508431e+09,3.752252e+05,8.626531e+05,3.051169e+08,3.715469e+07,...,1.526797e+07,3.617134e+09,1.415058e+05,2.816652e+05,1.503949e+06,9.615813e+07,1.017821e+09,1.105325e+08,5.931820e+05,5.336955e+05
996,398,1.264522e+05,3.210885e+05,9.741333e+06,4.286505e+06,6.509263e+09,3.044249e+05,8.164484e+05,3.054387e+08,3.722345e+07,...,1.524020e+07,3.617816e+09,1.090215e+05,2.896400e+05,1.485820e+06,9.635193e+07,1.019411e+09,1.105147e+08,6.024782e+05,5.116465e+05
997,147,3.205158e+05,1.138603e+06,1.470624e+07,6.759474e+06,6.618403e+09,1.184100e+06,1.726668e+05,3.287956e+08,4.634105e+07,...,2.101606e+07,3.693040e+09,6.010175e+05,1.444886e+06,4.069176e+05,1.085064e+08,1.052731e+09,1.247898e+08,1.149461e+06,7.865283e+04
998,236,1.962606e+05,8.484311e+05,1.347300e+07,6.449112e+06,6.597648e+09,7.812270e+05,1.155923e+05,3.242343e+08,4.456668e+07,...,1.945698e+07,3.680267e+09,3.331611e+05,1.158332e+06,4.399241e+05,1.070619e+08,1.051230e+09,1.216697e+08,1.264013e+06,2.708535e+04


#### Then, we select the best source, minimizing the sum of MSE, i.e. the bellwhether input video

In [118]:
df = mse_data.set_index('source_ids')
for t in target_ids:
    df[str(t)]=df[str(t)]/np.mean(df[str(t)])
source_ids[np.argmin([np.sum(df.iloc[i]) for i in range(len(source_ids))])]

459

#### The bellwether input is the video with the id # 459!

#### Learning algorithm

In [124]:
LAs = [LinearRegression, DecisionTreeRegressor, RandomForestRegressor, XGBRegressor, SVR]
for i in range(5):
    target_id = np.random.randint(0,1000)
    for la in LAs:
        print(la, beetle.learn(target_id = target_id, train_size=20, learning_algorithm=la))

<class 'sklearn.linear_model._base.LinearRegression'> 7287452.903020396
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 7908660.494628688
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 5204935.389192171
<class 'xgboost.sklearn.XGBRegressor'> 5568924.323390234
<class 'sklearn.svm._classes.SVR'> 4157672.1707222858
<class 'sklearn.linear_model._base.LinearRegression'> 7287451.009812243
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 7577384.640367314
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 5941277.766099082
<class 'xgboost.sklearn.XGBRegressor'> 5593892.110586354
<class 'sklearn.svm._classes.SVR'> 4181079.372169836
<class 'sklearn.linear_model._base.LinearRegression'> 7287450.8548344625
<class 'sklearn.tree._classes.DecisionTreeRegressor'> 7601120.123639784
<class 'sklearn.ensemble._forest.RandomForestRegressor'> 4790747.977584629
<class 'xgboost.sklearn.XGBRegressor'> 7487327.475946847
<class 'sklearn.svm._classes.SVR'> 3711599.8254402387
<class 'sk

#### Chosen algorithm :  SVR (however it may depends on the choice of videos)

But we stick to RF because SVR always returns the same config, very strange result.

We predict the configurations for each video of the test set, for 5 configs, 10 configs, ..., 30 configs in the training set.

In [125]:
beetle_confs = dict()
for i in range(len(index_test)):
    it = index_test[i]
    for ts in train_sizes:
        beetle_confs[(i, ts)] = beetle.predict_conf(target_id = it, train_size=ts,
                                      learning_algorithm = RandomForestRegressor)

In [126]:
beetle_confs

{(0, 5): 196,
 (0, 10): 169,
 (0, 15): 195,
 (0, 20): 85,
 (0, 25): 91,
 (0, 30): 91,
 (1, 5): 108,
 (1, 10): 168,
 (1, 15): 62,
 (1, 20): 108,
 (1, 25): 195,
 (1, 30): 169,
 (2, 5): 96,
 (2, 10): 107,
 (2, 15): 101,
 (2, 20): 173,
 (2, 25): 96,
 (2, 30): 169,
 (3, 5): 153,
 (3, 10): 94,
 (3, 15): 96,
 (3, 20): 169,
 (3, 25): 168,
 (3, 30): 173,
 (4, 5): 123,
 (4, 10): 87,
 (4, 15): 91,
 (4, 20): 101,
 (4, 25): 107,
 (4, 30): 200,
 (5, 5): 94,
 (5, 10): 167,
 (5, 15): 91,
 (5, 20): 169,
 (5, 25): 170,
 (5, 30): 168,
 (6, 5): 101,
 (6, 10): 67,
 (6, 15): 24,
 (6, 20): 93,
 (6, 25): 165,
 (6, 30): 168,
 (7, 5): 27,
 (7, 10): 169,
 (7, 15): 100,
 (7, 20): 170,
 (7, 25): 169,
 (7, 30): 168,
 (8, 5): 168,
 (8, 10): 134,
 (8, 15): 102,
 (8, 20): 75,
 (8, 25): 97,
 (8, 30): 189,
 (9, 5): 91,
 (9, 10): 91,
 (9, 15): 93,
 (9, 20): 97,
 (9, 25): 123,
 (9, 30): 169,
 (10, 5): 106,
 (10, 10): 107,
 (10, 15): 101,
 (10, 20): 171,
 (10, 25): 159,
 (10, 30): 169,
 (11, 5): 85,
 (11, 10): 165,
 (11, 1

In [127]:
beetle_data = pd.DataFrame({"id_video" : [i for i in range(len(index_test))]})
for ts in train_sizes:
    beetle_data["conf"+str(ts)] = [beetle_confs[(i, ts)] for i in range(len(index_test))]

In [128]:
beetle_data.set_index("id_video").to_csv("../../../results/raw_data/Beetle_results.csv")