In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import collections
import itertools

# Tools
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Shared label encoder; model_performance fits it on each fold's labels when
# the classifier being evaluated is an XGBClassifier.
le = LabelEncoder()

# Benchmark

## Combine Original Labels

In [None]:
# The raw label CSVs have no header row: the first field is the filename and
# the remaining eight fields are named 1, 2, 4, ..., 128 (presumably one per
# candidate unroll factor -- confirm against the labelling script).
files = [
    'label0.csv',
    'label1.csv',
    'label2_1.csv',
    'label2_2.csv',
    'label3.csv',
    'label3_split0.csv',
    'label3_split1.csv',
    'label3_split2.csv',
    'label3_split3.csv',
]
# Stack every file into one table keyed by filename, then write it back out
# without a header row so it keeps the same layout as the inputs.
label_rank_df = pd.concat(
    (
        pd.read_csv(path, header=None, names=['filename', 1, 2, 4, 8, 16, 32, 64, 128])
        for path in files
    ),
    ignore_index=True,
).set_index('filename')
label_rank_df.to_csv("combined_labels.csv", header=None)

In [None]:
# Prerequisite: run average.py first to produce averaged_combined_labels.csv.
# The file has no header row, so the column names are supplied here.
averaged_label_df = pd.read_csv(
    "averaged_combined_labels.csv",
    header=None,
    names=['filename', 1, 2, 4, 8, 16, 32, 64, 128],
).set_index("filename")

In [None]:
# Display the averaged label table (indexed by filename) as the cell output.
averaged_label_df

In [None]:
def find_rank(row):
    """Rank the values of one label row in ascending order.

    The smallest value receives rank 1, the next smallest rank 2, and so on.
    Tied values keep their left-to-right column order (stable sort).

    Parameters
    ----------
    row : pandas.Series
        One row of the label table, e.g. as passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``row`` whose values are the
        1-based ranks. ``row`` itself is not modified: pandas does not support
        functions that mutate the object handed to ``apply``.
    """
    values = np.asarray(row)
    # A stable sort makes tie-breaking deterministic across runs and platforms.
    order = np.argsort(values, kind="stable")
    ranked = row.copy()
    # Assign by position so the result does not depend on the column labels
    # being exactly 1, 2, 4, ... (2**position).
    ranked.iloc[order] = np.arange(1, len(order) + 1)
    return ranked

In [None]:
# Turn every row of averaged labels into 1-based ranks via find_rank.
label_rank_df = averaged_label_df.apply(find_rank, axis=1)

In [None]:
# Persist the per-file rank table to label_rank.csv (header and index included).
label_rank_df.to_csv("label_rank.csv")

# Model Training

In [None]:
# Load features and labels, both keyed by their "Filename" column, and keep
# only the files that appear in both tables (inner join).
feature_df = pd.read_csv("full_features.csv").set_index("Filename")  # change to corresponding feature csv
label_df = pd.read_csv("label_final.csv").set_index("Filename")  # change to corresponding label csv
dataset_df = feature_df.join(label_df, on="Filename", how='inner')

In [None]:
# Feature matrix: the eight feature columns, in this fixed order.
feature_cols = ['Depth', 'TripCount', 'Total', 'FP', 'BR', 'Mem', 'Uses', 'Defs']
X = dataset_df[feature_cols].to_numpy()
# Target vector: the "Optimal Unroll Factor" column.
y = dataset_df.loc[:, "Optimal Unroll Factor"].to_numpy()

In [None]:
# Candidate classifiers to compare; each uses the library defaults except
# where an argument is given explicitly.
clfs = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=500),
    SVC(),
    LinearSVC(dual='auto'),
    KNeighborsClassifier(),
    MLPClassifier(),
    XGBClassifier(n_estimators=500),
]

In [None]:
# X_train, X_test = X[:int(X.shape[0]*0.9)], X[int(X.shape[0]*0.9):]
# y_train, y_test = y[:int(X.shape[0]*0.9)], y[int(X.shape[0]*0.9):]

# clf.fit(X_train, y_train)
# y_predicted = clf.predict(X_test)
# metrics.accuracy_score(y_test, y_predicted)

In [None]:
def model_performance(clf, n_splits=5, random_state=None):
    """Cross-validate ``clf`` and summarise accuracy and top-3 rank hit rate.

    Reads the module-level ``X``, ``y``, ``dataset_df``, ``label_rank_df`` and
    ``le``. ``label_rank_df`` is assumed to be indexed by filename with one
    column per class label, holding the rank of that label for the file.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` and ``predict``. It is re-fitted on every
        fold, so it is left fitted on the last fold's training data.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an int for reproducible folds.

    Returns
    -------
    dict
        ``'Model'`` (class name of ``clf``), ``'Accuracy %'`` (mean fold
        accuracy times 100) and ``'Top 3 Prediction %'`` (share of test
        predictions whose rank is 1, 2 or 3, times 100).
    """
    # XGBClassifier is trained on LabelEncoder codes instead of the raw labels.
    uses_encoded_labels = type(clf).__name__ == "XGBClassifier"

    scores = []
    rank_pred = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if uses_encoded_labels:
            # Fit the encoder on the training labels only, then map the
            # predicted codes back to the original labels. Re-fitting on the
            # test fold, or decoding with 2**code, is only correct when both
            # folds contain exactly the classes 1, 2, 4, ... with none missing.
            clf.fit(X_train, le.fit_transform(y_train))
            y_predicted = le.inverse_transform(clf.predict(X_test))
        else:
            clf.fit(X_train, y_train)
            y_predicted = clf.predict(X_test)
        scores.append(metrics.accuracy_score(y_test, y_predicted))

        # Look up, for every test file, the rank of the label that was predicted.
        filenames = dataset_df.index[test_index]
        rank_pred.append(
            [label_rank_df.loc[name, label] for name, label in zip(filenames, y_predicted)]
        )

    # Rank analysis: fraction of all test predictions that landed on each rank.
    rank_counts = collections.Counter(itertools.chain.from_iterable(rank_pred))
    total_sum = sum(rank_counts.values())
    averaged_dict = (
        {rank: count / total_sum for rank, count in rank_counts.items()}
        if total_sum
        else {}
    )
    # A rank that was never predicted contributes 0 instead of raising KeyError.
    top3_share = sum(averaged_dict.get(rank, 0) for rank in (1, 2, 3))

    return {
        'Model': type(clf).__name__,
        "Accuracy %": np.array(scores).mean() * 100,
        "Top 3 Prediction %": top3_share * 100,
    }

In [None]:
# Evaluate every candidate classifier and collect one summary record per model.
# A plain for loop is kept on purpose: `clf` stays bound to the last entry of
# clfs afterwards, and a later cell reuses that name.
model_records = []
for clf in clfs:
    model_records.append(model_performance(clf))

# Build the frame once from the collected records instead of concatenating
# row by row onto an empty frame (quadratic, and concatenation with empty
# frames is deprecated behaviour in recent pandas).
model_result = pd.DataFrame(
    model_records, columns=["Model", "Accuracy %", "Top 3 Prediction %"]
).set_index("Model")

In [None]:
# Display the per-model summary table (indexed by model name) as the cell output.
model_result

In [None]:
# Save the per-model summary table to model_result.csv.
model_result.to_csv("model_result.csv")

In [None]:
# Manual cross-validation run for whichever estimator `clf` currently refers
# to (after the loop over clfs, that is its last entry). The per-fold ranks
# are kept in rank_pred for the rank analysis in the next cell.
scores = []
rank_pred = []
# XGBClassifier is trained on LabelEncoder codes, mirroring model_performance;
# without this the leaked `clf` (an XGBClassifier) would be fitted on the raw
# labels and its predicted codes used directly as rank-table column names.
needs_label_encoding = type(clf).__name__ == "XGBClassifier"
skf = StratifiedKFold(n_splits=5, shuffle=True)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    if needs_label_encoding:
        # Encode with the training labels only and decode the predictions back
        # to the original labels before scoring and rank lookup.
        clf.fit(X_train, le.fit_transform(y_train))
        y_predicted = le.inverse_transform(clf.predict(X_test))
    else:
        clf.fit(X_train, y_train)
        y_predicted = clf.predict(X_test)
    # Put the performance of the model on each fold in the scores array
    scores.append(metrics.accuracy_score(y_test, y_predicted))

    # Rank of the predicted label for every file in this test fold.
    filenames = dataset_df.index[test_index]
    ranks = [label_rank_df.loc[name, label] for name, label in zip(filenames, y_predicted)]
    rank_pred.append(ranks)

# Mean accuracy across the folds (shown as the cell output).
np.array(scores).mean()

In [None]:
# Flatten the per-fold rank lists and tally how often each rank was predicted.
rank_counts = dict(collections.Counter(itertools.chain.from_iterable(rank_pred)))

# Normalise the tallies into fractions of all predictions.
total_sum = sum(rank_counts.values())
averaged_dict = {rank: count / total_sum for rank, count in rank_counts.items()}

averaged_dict