In [1]:
from fynesse import access, assess, address

from functools import partial
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tabulate import tabulate

In [2]:
from config import MODELS, EVENTS, SEEDS

## Utilities

In [3]:
from sklearn.model_selection import GridSearchCV

def print_grid_search_results(cv_results_):
    print(tabulate(np.column_stack([cv_results_['params'], cv_results_['mean_test_score'], cv_results_['rank_test_score']]), headers=['params', 'score', 'rank']))

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier

def evaluate_feature(master_dict, featured_events, n_bins):

    scores, scores_unseen = [], []

    for i, seed in enumerate(SEEDS):
        X = address.make_features(master_dict, featured_events, n_bins=n_bins, models=MODELS, n_samples=500)
        scaler = StandardScaler().fit(X)
        X = scaler.transform(X)

        X, X_test_unseen = address.seed_split(X, seed_idx=i, n_seeds=len(SEEDS))

        le = LabelEncoder().fit(MODELS)

        y = list()
        for model in MODELS:
            y += [model] * 400
        y = le.transform(y)

        y_test_unseen = list()
        for model in MODELS:
            y_test_unseen += [model] * 100
        y_test_unseen = le.transform(y_test_unseen)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

        knn = KNeighborsClassifier(n_neighbors=3, p=2, weights='uniform')
        
        knn.fit(X_train, y_train)

        scores.append(accuracy_score(y_test, knn.predict(X_test)))
        scores_unseen.append(accuracy_score(y_test_unseen, knn.predict(X_test_unseen)))

    scores = np.column_stack(scores)
    scores_unseen = np.column_stack(scores_unseen)
    
    return scores, scores_unseen

## Feature Construction

In [6]:
# load master dictionary for everything

master_dict = assess.eat_pickle(f'./data/pickle/master.pickle')

In [20]:
results = []

for n_bins in [1,2,4,8,16,32,64]:
    for featured_events in combinations(EVENTS, 4):
        
        row = [n_bins, featured_events]
        scores, scores_unseen = evaluate_feature(master_dict, featured_events, n_bins)
        
        row += [np.mean(scores, axis=1)[0], np.mean(scores_unseen, axis=1)[0]]
        results.append(row)

In [21]:
df = pd.DataFrame(results, columns=['n_bins', 'events', 'accuracy', 'accuracy_unseen'])
df.to_csv('./data/features_new.csv')

In [34]:
df.sort_values(by='accuracy', ascending=False).iloc[6]['events']

('cache-misses',
 'cache-references',
 'L1-dcache-load-misses',
 'fp_arith_inst_retired.256b_packed_single')

## Feature Importance