In [1]:
from fynesse import access, assess, address

from functools import partial
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tabulate import tabulate
from tqdm import tqdm

In [2]:
from config import MODELS, MODELS_EXT, EVENTS, SEEDS

## Utilities

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier


def record_evaluation(memo, clf_name, events, n_bins, seed, clf, X_test, y_test, X_test_unseen, y_test_unseen):
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    accuracy_unseen = accuracy_score(y_test_unseen, clf.predict(X_test_unseen))


    memo.append([clf_name, events, n_bins, seed, accuracy, accuracy_unseen])


def evaluate_feature(master_dict, memo, events, n_bins, models=MODELS+MODELS_EXT, train_size=16):

    for test_idx in range(len(SEEDS)):
        X = address.make_features(master_dict, events, n_bins=n_bins, models=models, n_samples=100*len(SEEDS))

        # split based on random seed value
        X, X_test_unseen = address.seed_split(X, seed_idx=test_idx, n_seeds=len(SEEDS), n_samples=100)

        # feature scaling
        scaler = StandardScaler().fit(X)
        X = scaler.transform(X)
        X_test_unseen = scaler.transform(X_test_unseen)

        # add labels
        le = LabelEncoder().fit(models)

        y = list()
        for model in models:
            y += [model] * (100*(len(SEEDS)-1))
        y = le.transform(y)

        y_test_unseen = list()
        for model in models:
            y_test_unseen += [model] * 100
        y_test_unseen = le.transform(y_test_unseen)

        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=len(models)*train_size/len(X), random_state=42, stratify=y)

        # KNN
        knn = KNeighborsClassifier(n_neighbors=10,
                                p=1,
                                weights='uniform')

        knn.fit(X_train, y_train)
        record_evaluation(memo, 'KNN', events, n_bins, SEEDS[test_idx], knn, X_test, y_test, X_test_unseen, y_test_unseen)

        # DT
        dt = DecisionTreeClassifier(criterion='gini',
                                random_state=42,
                                splitter='best')

        dt.fit(X_train, y_train)
        record_evaluation(memo, 'DT', events, n_bins, SEEDS[test_idx], dt, X_test, y_test, X_test_unseen, y_test_unseen)

        # RF
        rf = RandomForestClassifier(criterion='gini',
                                n_estimators=5,
                                random_state=42)

        rf.fit(X_train, y_train)
        record_evaluation(memo, 'RF', events, n_bins, SEEDS[test_idx], rf, X_test, y_test, X_test_unseen, y_test_unseen)

        # MLP
        mlp = MLPClassifier(activation='identity',
                        hidden_layer_sizes=[3], 
                        learning_rate_init=0.01,
                        max_iter=2500, 
                        momentum=0.99,
                        random_state=42,
                        solver='sgd')

        mlp.fit(X_train, y_train)
        record_evaluation(memo, 'MLP', events, n_bins, SEEDS[test_idx], mlp, X_test, y_test, X_test_unseen, y_test_unseen)

        # SVM
        svm = SGDClassifier(loss='hinge',
                        random_state=42)

        svm.fit(X_train, y_train)
        record_evaluation(memo, 'SVM', events, n_bins, SEEDS[test_idx], svm, X_test, y_test, X_test_unseen, y_test_unseen)


## Feature Construction

In [4]:
# load master dictionary for everything

master_dict = assess.eat_pickle(f'./data/pickle/master.pickle')

In [5]:
memo = []

for events in tqdm(combinations(EVENTS, 4)):
    for n_bins in [1, 4, 16, 64, 128]:
        evaluate_feature(master_dict, memo, events, n_bins)

1365it [9:06:18, 24.01s/it] 


In [6]:
df = pd.DataFrame(memo, columns=['events', 'n_bins', 'seed', 'accuracy', 'f1', 'precision', 'recall', 'accuracy_unseen', 'f1_unseen', 'precision_unseen', 'recall_unseen'])
df.to_csv('./data/feature_eval.csv', index=False)

In [7]:
df.sort_values(by='accuracy', ascending=False)['events'].values

array([('L1-dcache-load-misses', 'LLC-load-misses', 'fp_arith_inst_retired.128b_packed_single', 'fp_arith_inst_retired.256b_packed_single'),
       ('L1-dcache-load-misses', 'L1-dcache-loads', 'LLC-store-misses', 'fp_arith_inst_retired.scalar_double'),
       ('L1-dcache-load-misses', 'L1-dcache-loads', 'LLC-store-misses', 'fp_arith_inst_retired.512b_packed_single'),
       ...,
       ('cache-references', 'LLC-load-misses', 'LLC-loads', 'fp_arith_inst_retired.128b_packed_single'),
       ('cache-references', 'LLC-load-misses', 'LLC-loads', 'fp_arith_inst_retired.128b_packed_single'),
       ('L1-dcache-load-misses', 'LLC-load-misses', 'LLC-loads', 'fp_arith_inst_retired.128b_packed_single')],
      dtype=object)

In [11]:
for events in df.loc[(df['accuracy'] == 1) & (df['accuracy_unseen'] == 1), 'events']:
    if 'fp_arith_inst_retired.256b_packed_single' in events and  'fp_arith_inst_retired.512b_packed_single' in events:
        print(events)

('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')
('instructions', 'cache-misses',

### Extension

In [20]:
# load master dictionary for everything

master_dict_ext = assess.eat_pickle(f'./data/pickle/master_ext.pickle')

In [25]:
memo = []

for events in tqdm(list(combinations(EVENTS, 1))):
    for n_bins in [1, 4, 16, 64, 128]:
        evaluate_feature(master_dict_ext, memo, events, n_bins)

100%|██████████| 15/15 [02:45<00:00, 11.01s/it]


In [26]:
df = pd.DataFrame(memo, columns=['clf', 'events', 'n_bins', 'seed', 'accuracy', 'accuracy_unseen'])
df.to_csv('./data/feature_eval_ext_ext.csv', index=False)

In [46]:
df.head()

Unnamed: 0,clf,events,n_bins,seed,accuracy,accuracy_unseen
0,KNN,"('instructions', 'cache-misses', 'cache-refere...",1,0,0.997884,0.998125
1,DT,"('instructions', 'cache-misses', 'cache-refere...",1,0,0.990234,0.995625
2,RF,"('instructions', 'cache-misses', 'cache-refere...",1,0,0.992676,0.9975
3,MLP,"('instructions', 'cache-misses', 'cache-refere...",1,0,0.996908,0.99625
4,SVM,"('instructions', 'cache-misses', 'cache-refere...",1,0,0.625977,0.625


In [51]:
df.drop('clf', axis=1).groupby(['events', 'n_bins']).mean().sort_values(by='accuracy', ascending=False).head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,seed,accuracy,accuracy_unseen
events,n_bins,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"('cache-misses', 'cache-references', 'L1-dcache-loads', 'fp_arith_inst_retired.512b_packed_single')",16,41.4,0.986829,0.977275
"('L1-dcache-loads', 'LLC-load-misses', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.512b_packed_single')",16,41.4,0.986712,0.97995
"('cache-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.512b_packed_single', 'fp_arith_inst_retired.scalar_double')",16,41.4,0.985384,0.9805
"('L1-dcache-loads', 'LLC-load-misses', 'fp_arith_inst_retired.512b_packed_single', 'fp_arith_inst_retired.scalar_double')",16,41.4,0.985247,0.98085
"('L1-dcache-stores', 'LLC-load-misses', 'fp_arith_inst_retired.512b_packed_single', 'fp_arith_inst_retired.scalar_double')",16,41.4,0.985228,0.9812
"('instructions', 'L1-dcache-loads', 'fp_arith_inst_retired.scalar_double', 'fp_arith_inst_retired.scalar_single')",16,41.4,0.985182,0.97105
"('L1-dcache-load-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.512b_packed_single', 'fp_arith_inst_retired.scalar_double')",16,41.4,0.984746,0.977725
"('cache-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.128b_packed_single', 'fp_arith_inst_retired.512b_packed_single')",16,41.4,0.984733,0.977575
"('L1-dcache-loads', 'LLC-load-misses', 'LLC-stores', 'fp_arith_inst_retired.512b_packed_single')",16,41.4,0.984707,0.973875
"('L1-dcache-loads', 'LLC-load-misses', 'fp_arith_inst_retired.512b_packed_single', 'fp_arith_inst_retired.scalar_single')",16,41.4,0.984212,0.975925
