In [1]:
from fynesse import access, assess, address

from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tabulate import tabulate

In [2]:
from config import MODELS, MODELS_EXT, EVENTS, SEEDS

## Feature Construction

In [3]:
# load master dictionary for everything

master_dict = assess.eat_pickle(f'./data/pickle/master.pickle')

In [4]:
# choose your desired event combo

test_idx = 0

featured_events = ['cache-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.scalar_single']

X = address.make_features(master_dict, featured_events, n_bins=32, models=MODELS, n_samples=100*len(SEEDS))

# remove every 100th element?
# X = np.delete(X, np.arange(0, X.shape[0], 100), axis=0)

## Dataset Splitting

In [6]:
# split based on random seed value

X, X_test_unseen = address.seed_split(X, seed_idx=test_idx, n_seeds=len(SEEDS), n_samples=100)

# feature scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test_unseen = scaler.transform(X_test_unseen)

In [7]:
# add labels

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(MODELS)

y = list()
for model in MODELS:
    y += [model] * (100*(len(SEEDS)-1))
y = le.transform(y)

y_test_unseen = list()
for model in MODELS:
    y_test_unseen += [model] * 100
y_test_unseen = le.transform(y_test_unseen)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

## Utilities

In [10]:
from sklearn.model_selection import GridSearchCV

def print_grid_search_results(cv_results_):
    print(tabulate(np.column_stack([cv_results_['params'], cv_results_['mean_test_score'], cv_results_['rank_test_score']]), headers=['params', 'score', 'rank']))

In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def evaluate(clf, X_test, y_test):
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test), average='macro')
    precision = precision_score(y_test, clf.predict(X_test), average='macro')
    recall = recall_score(y_test, clf.predict(X_test), average='macro')

    print(f'Accuracy: {accuracy}')
    print(f'F1: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

## $k$-NN

### Hyperparameter Tuning

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

params = {'n_neighbors': [1,3,5,10],
          'weights': ['uniform','distance'],
          'p': [1,2,3]}

clf = GridSearchCV(knn, params, cv=5)
clf.fit(X_train, y_train)

In [13]:
print_grid_search_results(clf.cv_results_)

params                                                 score    rank
--------------------------------------------------  --------  ------
{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}    0.998889       3
{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}   0.998889       3
{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}    0.998889       3
{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}   0.998889       3
{'n_neighbors': 1, 'p': 3, 'weights': 'uniform'}    0.997778      17
{'n_neighbors': 1, 'p': 3, 'weights': 'distance'}   0.997778      17
{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}    0.998889       3
{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}   0.998889       3
{'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}    0.998889       3
{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}   0.998889       3
{'n_neighbors': 3, 'p': 3, 'weights': 'uniform'}    0.997778      17
{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}   0.997778      17
{'n_neighbors': 5, 'p': 1, 'weight

### Training

In [14]:
knn = KNeighborsClassifier(n_neighbors=10,
                           p=1,
                           weights='uniform')

knn.fit(X_train, y_train)

### Evaluation

In [15]:
evaluate(knn, X_train, y_train)

Accuracy: 1.0
F1: 1.0
Precision: 1.0
Recall: 1.0


In [16]:
evaluate(knn, X_test, y_test)

Accuracy: 1.0
F1: 1.0
Precision: 1.0
Recall: 1.0


In [17]:
evaluate(knn, X_test_unseen, y_test_unseen)

Accuracy: 1.0
F1: 1.0
Precision: 1.0
Recall: 1.0


## DT

### Hyperparameter Tuning

In [18]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

params = {'criterion': ['gini','entropy','log_loss'],
          'splitter': ['best', 'random'],
          'random_state': [42]}

clf = GridSearchCV(dt, params, cv=5)
clf.fit(X_train, y_train)

In [19]:
print_grid_search_results(clf.cv_results_)

params                                                                  score    rank
-------------------------------------------------------------------  --------  ------
{'criterion': 'gini', 'random_state': 42, 'splitter': 'best'}        1              1
{'criterion': 'gini', 'random_state': 42, 'splitter': 'random'}      0.997778       4
{'criterion': 'entropy', 'random_state': 42, 'splitter': 'best'}     1              1
{'criterion': 'entropy', 'random_state': 42, 'splitter': 'random'}   0.997778       4
{'criterion': 'log_loss', 'random_state': 42, 'splitter': 'best'}    1              1
{'criterion': 'log_loss', 'random_state': 42, 'splitter': 'random'}  0.997778       4


### Training

In [20]:
dt = DecisionTreeClassifier(criterion='gini',
                            random_state=42,
                            splitter='best')

dt.fit(X_train, y_train)

### Evaluation

In [21]:
evaluate(dt, X_test, y_test)

Accuracy: 0.9933333333333333
F1: 0.9933326665999934
Precision: 0.9934640522875817
Recall: 0.9933333333333333


In [22]:
evaluate(dt, X_test_unseen, y_test_unseen)

Accuracy: 0.9933333333333333
F1: 0.9933333333333333
Precision: 0.9933333333333333
Recall: 0.9933333333333333


## RF

### Hyperparameter Tuning

In [23]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

params = {'n_estimators': [1, 5, 10, 25, 50, 100, 250, 500, 1000],
          'criterion': ['gini','entropy','log_loss'],
          'random_state': [42]}

clf = GridSearchCV(rf, params, cv=5)
clf.fit(X_train, y_train)

In [24]:
print_grid_search_results(clf.cv_results_)

params                                                                  score    rank
-------------------------------------------------------------------  --------  ------
{'criterion': 'gini', 'n_estimators': 1, 'random_state': 42}         0.994444      27
{'criterion': 'gini', 'n_estimators': 5, 'random_state': 42}         1              1
{'criterion': 'gini', 'n_estimators': 10, 'random_state': 42}        1              1
{'criterion': 'gini', 'n_estimators': 25, 'random_state': 42}        1              1
{'criterion': 'gini', 'n_estimators': 50, 'random_state': 42}        1              1
{'criterion': 'gini', 'n_estimators': 100, 'random_state': 42}       1              1
{'criterion': 'gini', 'n_estimators': 250, 'random_state': 42}       1              1
{'criterion': 'gini', 'n_estimators': 500, 'random_state': 42}       1              1
{'criterion': 'gini', 'n_estimators': 1000, 'random_state': 42}      1              1
{'criterion': 'entropy', 'n_estimators': 1, 'random_st

### Training

In [25]:
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=5,
                            random_state=42)

rf.fit(X_train, y_train)

### Evaluation

In [26]:
evaluate(rf, X_test, y_test)

Accuracy: 0.9966666666666667
F1: 0.9966665833312499
Precision: 0.9966996699669967
Recall: 0.9966666666666667


In [27]:
evaluate(rf, X_test_unseen, y_test_unseen)

Accuracy: 0.9966666666666667
F1: 0.9966665833312499
Precision: 0.9966996699669967
Recall: 0.9966666666666667


## MLP

### Hyperparameter Tuning

In [28]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

params = {'hidden_layer_sizes': [[i+1] for i in range(10)],
          'activation': ['identity', 'logistic', 'tanh', 'relu'],
          'solver': ['sgd', 'adam'],
          'learning_rate_init': [0.001, 0.005, 0.01],
          'max_iter': [1000, 2500, 5000],
          'momentum': [0.9, 0.99],
          'random_state': [42]}

clf = GridSearchCV(mlp, params, cv=5)
clf.fit(X_train, y_train)



In [29]:
print_grid_search_results(clf.cv_results_)

params                                                                                                                                                            score    rank
-------------------------------------------------------------------------------------------------------------------------------------------------------------  --------  ------
{'activation': 'identity', 'hidden_layer_sizes': [1], 'learning_rate_init': 0.001, 'max_iter': 1000, 'momentum': 0.9, 'random_state': 42, 'solver': 'sgd'}     0.994444    1204
{'activation': 'identity', 'hidden_layer_sizes': [1], 'learning_rate_init': 0.001, 'max_iter': 1000, 'momentum': 0.9, 'random_state': 42, 'solver': 'adam'}    0.991111    1303
{'activation': 'identity', 'hidden_layer_sizes': [1], 'learning_rate_init': 0.001, 'max_iter': 1000, 'momentum': 0.99, 'random_state': 42, 'solver': 'sgd'}    0.996667    1030
{'activation': 'identity', 'hidden_layer_sizes': [1], 'learning_rate_init': 0.001, 'max_iter': 1000, 'momentum': 0.99, '

### Training

In [36]:
mlp = MLPClassifier(activation='identity',
                    hidden_layer_sizes=[3], 
                    learning_rate_init=0.01,
                    max_iter=2500, 
                    momentum=0.99,
                    random_state=42,
                    solver='sgd')

mlp.fit(X_train, y_train)

### Evaluation

In [37]:
evaluate(mlp, X_test, y_test)

Accuracy: 0.9966666666666667
F1: 0.9966665833312499
Precision: 0.9966996699669967
Recall: 0.9966666666666667


In [38]:
evaluate(mlp, X_test_unseen, y_test_unseen)

Accuracy: 0.9933333333333333
F1: 0.993349586214903
Precision: 0.9934640522875817
Recall: 0.9933333333333333


## PCA Visualization

In [None]:
from sklearn.decomposition import PCA

y = list()
for model in MODELS:
    y += [model] * 100*len(SEEDS)
y = le.transform(y)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

pca = PCA(n_components=3)
pca.fit(X)

X_pca = pca.transform(X)

plt.scatter(X_pca[:,0], X_pca[:,1], X_pca[:,2], c=y, cmap='Set1')
plt.show()

## Evaluation

In [39]:
def record_evaluation(memo, clf_name, seed, clf, X_test, y_test, X_test_unseen, y_test_unseen):
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    f1 = f1_score(y_test, clf.predict(X_test), average='macro')
    precision = precision_score(y_test, clf.predict(X_test), average='macro')
    recall = recall_score(y_test, clf.predict(X_test), average='macro')

    accuracy_unseen = accuracy_score(y_test_unseen, clf.predict(X_test_unseen))
    f1_unseen = f1_score(y_test_unseen, clf.predict(X_test_unseen), average='macro')
    precision_unseen = precision_score(y_test_unseen, clf.predict(X_test_unseen), average='macro')
    recall_unseen = recall_score(y_test_unseen, clf.predict(X_test_unseen), average='macro')

    memo.append([clf_name, seed, accuracy, accuracy_unseen, f1, f1_unseen, precision, precision_unseen, recall, recall_unseen])

In [44]:
# feature parameters
featured_events = ['cache-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.scalar_single']
n_bins = 32

memo = []

for test_idx in range(len(SEEDS)):
    X = address.make_features(master_dict, featured_events, n_bins=n_bins, models=MODELS, n_samples=100*len(SEEDS))

    # split based on random seed value
    X, X_test_unseen = address.seed_split(X, seed_idx=test_idx, n_seeds=len(SEEDS), n_samples=100)

    # feature scaling
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    X_test_unseen = scaler.transform(X_test_unseen)

    # add labels
    le = LabelEncoder().fit(MODELS)

    y = list()
    for model in MODELS:
        y += [model] * (100*(len(SEEDS)-1))
    y = le.transform(y)

    y_test_unseen = list()
    for model in MODELS:
        y_test_unseen += [model] * 100
    y_test_unseen = le.transform(y_test_unseen)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

    # KNN
    knn = KNeighborsClassifier(n_neighbors=10,
                            p=1,
                            weights='uniform')

    knn.fit(X_train, y_train)
    record_evaluation(memo, 'KNN', SEEDS[test_idx], knn, X_test, y_test, X_test_unseen, y_test_unseen)

    # DT
    dt = DecisionTreeClassifier(criterion='gini',
                            random_state=42,
                            splitter='best')

    dt.fit(X_train, y_train)
    record_evaluation(memo, 'DT', SEEDS[test_idx], dt, X_test, y_test, X_test_unseen, y_test_unseen)

    # RF
    rf = RandomForestClassifier(criterion='gini',
                            n_estimators=5,
                            random_state=42)

    rf.fit(X_train, y_train)
    record_evaluation(memo, 'RF', SEEDS[test_idx], rf, X_test, y_test, X_test_unseen, y_test_unseen)

    # MLP
    mlp = MLPClassifier(activation='identity',
                    hidden_layer_sizes=[3], 
                    learning_rate_init=0.01,
                    max_iter=2500, 
                    momentum=0.99,
                    random_state=42,
                    solver='sgd')

    mlp.fit(X_train, y_train)
    record_evaluation(memo, 'MLP', SEEDS[test_idx], mlp, X_test, y_test, X_test_unseen, y_test_unseen)

In [45]:
df = pd.DataFrame(memo, columns=['clf', 'seed', 'accuracy', 'accuracy_unseen', 'f1', 'f1_unseen', 'precision', 'precision_unseen', 'recall', 'recall_unseen'])
df.to_csv('./data/core_eval.csv')

### Extension

In [None]:
# load master dictionary for everything

master_dict_ext = assess.eat_pickle(f'./data/pickle/master_ext.pickle')

In [None]:
# feature parameters
featured_events = ['cache-misses', 'L1-dcache-loads', 'fp_arith_inst_retired.256b_packed_single', 'fp_arith_inst_retired.scalar_single']
n_bins = 32

memo = []

for test_idx in range(len(SEEDS)):
    X = address.make_features(master_dict, featured_events, n_bins=n_bins, models=MODELS+MODELS_EXT, n_samples=100*len(SEEDS))

    # split based on random seed value
    X, X_test_unseen = address.seed_split(X, seed_idx=test_idx, n_seeds=len(SEEDS), n_samples=100)

    # feature scaling
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    X_test_unseen = scaler.transform(X_test_unseen)

    # add labels
    le = LabelEncoder().fit(MODELS)

    y = list()
    for model in MODELS:
        y += [model] * (100*(len(SEEDS)-1))
    y = le.transform(y)

    y_test_unseen = list()
    for model in MODELS:
        y_test_unseen += [model] * 100
    y_test_unseen = le.transform(y_test_unseen)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

    # KNN
    knn = KNeighborsClassifier(n_neighbors=10,
                            p=1,
                            weights='uniform')

    knn.fit(X_train, y_train)
    record_evaluation(memo, 'KNN', SEEDS[test_idx], knn, X_test, y_test, X_test_unseen, y_test_unseen)

    # DT
    dt = DecisionTreeClassifier(criterion='gini',
                            random_state=42,
                            splitter='best')

    dt.fit(X_train, y_train)
    record_evaluation(memo, 'DT', SEEDS[test_idx], dt, X_test, y_test, X_test_unseen, y_test_unseen)

    # RF
    rf = RandomForestClassifier(criterion='gini',
                            n_estimators=5,
                            random_state=42)

    rf.fit(X_train, y_train)
    record_evaluation(memo, 'RF', SEEDS[test_idx], rf, X_test, y_test, X_test_unseen, y_test_unseen)

    # MLP
    mlp = MLPClassifier(activation='identity',
                    hidden_layer_sizes=[3], 
                    learning_rate_init=0.01,
                    max_iter=2500, 
                    momentum=0.99,
                    random_state=42,
                    solver='sgd')

    mlp.fit(X_train, y_train)
    record_evaluation(memo, 'MLP', SEEDS[test_idx], mlp, X_test, y_test, X_test_unseen, y_test_unseen)

In [None]:
df = pd.DataFrame(memo, columns=['clf', 'seed', 'accuracy', 'accuracy_unseen', 'f1', 'f1_unseen', 'precision', 'precision_unseen', 'recall', 'recall_unseen'])
df.to_csv('./data/ext_eval.csv')