In [None]:
%matplotlib inline

from __future__ import print_function

import pandas as pd
import numpy as np
import common_libs.utilities as ut
import matplotlib.pyplot as plt
import torch
import scipy.optimize as opt
import random
import re
import cvxopt

In [None]:
# Load every kind_id=1 timing row together with its metadata, indexed by time_id.
# NOTE(review): ORDER BY RAND() with an effectively unbounded LIMIT only shuffles
# the rows; presumably a leftover from sampling a subset -- confirm it is needed.
t = pd.read_sql(
    'SELECT * FROM time JOIN time_metadata ON time.time_id=time_metadata.time_id '
    'WHERE kind_id=1 ORDER BY RAND() LIMIT 999999999',
    ut.create_connection()
).set_index('time_id')

In [None]:
# Keep only the measurements whose l1drmisses, l1dwmisses, l1imisses and
# conswitch counters are all <= 0.
good_times = t[
    (t['l1drmisses'] <= 0)
    & (t['l1dwmisses'] <= 0)
    & (t['l1imisses'] <= 0)
    & (t['conswitch'] <= 0)
]

In [None]:
def percentile(k):
    """Return a one-argument reducer computing the k-th percentile.

    The returned callable forwards its argument to ``np.percentile(a, k)``,
    which makes it usable as an aggregation function.
    """
    def kth_percentile(a):
        return np.percentile(a, k)

    return kth_percentile

In [None]:
# Per code_id: number of clean measurements (stored in the 'code_id' column)
# and the 20th percentile of their cycle counts.
summaries = good_times.groupby('code_id').agg({
    'code_id': 'count',
    'cycle_count': percentile(20),
})

In [None]:
# Ground-truth cycle count per block, restricted to blocks with more than one
# clean measurement (the 'code_id' column of summaries holds that count).
true_cycles = summaries.loc[summaries['code_id'] > 1, 'cycle_count']

In [None]:
# Exclude code_id 4 from the ground truth.
# NOTE(review): the reason for excluding this block is not recorded -- document it.
# errors='ignore' keeps the cell idempotent: re-running it (or running it when
# code_id 4 was already filtered out) no longer raises KeyError.
true_cycles = true_cycles.drop(4, axis=0, errors='ignore')

In [None]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load data files that come from a trusted source.
data = torch.load('../saved/data_040519.data')
# The last 20% of the saved dataset is treated as the test split; the first
# element of each entry is used as its code_id.
# A generator expression is used so the loop variable does not leak into the
# notebook namespace (a Python 2 list comprehension would leave `d` bound to a
# data entry, and `d` is later used as the opcode-index dict).
test_code_ids = set(entry[0] for entry in data[int(len(data) * .8):])

In [None]:
t_intel = pd.read_sql('SELECT * FROM time WHERE kind_id=2', ut.create_connection()).groupby('code_id').mean()['cycle_count']
err = ((true_cycles - t_intel).abs() / true_cycles).dropna()
test_err = ((true_cycles - t_intel[t_intel.index.isin(test_code_ids)]).abs() / true_cycles).dropna()
print('Intel: {}/{} ({:.0f}%) blocks\nAverage error {}\n{}/{} ({:.0f}%) test blocks\nAverage error {}'.format(
    len(err), len(true_cycles.dropna()), 100 * (float(len(err)) / len(true_cycles.dropna())),
    err.mean(),
    len(test_err), len(test_code_ids), 100 * (float(len(test_err)) / len(test_code_ids)),
    test_err.mean()
))

In [None]:
t_llvm = pd.read_sql('SELECT * FROM time WHERE kind_id=3', ut.create_connection()).groupby('code_id').mean()['cycle_count']
err = ((true_cycles - t_llvm).abs() / true_cycles).dropna()
test_err = ((true_cycles - t_llvm[t_llvm.index.isin(test_code_ids)]).abs() / true_cycles).dropna()
print('LLVM: {}/{} ({:.0f}%) blocks\nAverage error {}\n{}/{} ({:.0f}%) test blocks\nAverage error {}'.format(
    len(err), len(true_cycles.dropna()), 100 * (float(len(err)) / len(true_cycles.dropna())),
    err.mean(),
    len(test_err), len(test_code_ids), 100 * (float(len(test_err)) / len(test_code_ids)),
    test_err.mean()
))

In [None]:
# Constant baseline: for each candidate constant in 1..200, the mean absolute
# relative error obtained by predicting that constant for every block.
xs = np.linspace(1, 200, num=200)
ys = []
for x in xs:
    relative_error = (true_cycles - x).abs() / true_cycles
    ys.append(relative_error.mean())

In [None]:
# Error of the constant predictor as a function of the predicted constant.
plt.plot(xs, ys)
plt.title('Constant Prediction Baseline')
plt.xlabel('Constant value prediction')
plt.ylabel('Error')
plt.show()

In [None]:
# code_ir of every block that has at least one timing row and a non-empty
# code_raw, indexed by code_id.
code = pd.read_sql(
    'SELECT DISTINCT(code.code_id), code_ir FROM code '
    'INNER JOIN time ON code.code_id=time.code_id '
    'WHERE LENGTH(code_raw) > 0',
    ut.create_connection()
).set_index('code_id')

In [None]:
# Extract the list of <opcode> numbers (as strings) from each block's IR.
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about and will eventually reject.
instrs = code['code_ir'].apply(re.compile(r'<opcode>(\d+)</opcode>').findall)
# Attach the measured cycle count; the inner join keeps only blocks that have one.
instrs = pd.DataFrame(instrs).join(true_cycles, how='inner')

In [None]:
# d maps each opcode string to a dense index, assigned in order of first
# appearance; instr_tuples holds (list of opcode indices, cycle_count) per block.
d = {}
instr_tuples = []
for _, row in instrs.iterrows():
    # setdefault only inserts when the opcode is new; len(d) is evaluated
    # before the insertion, so new opcodes get the next free index.
    opcode_idxs = [d.setdefault(opcode, len(d)) for opcode in row['code_ir']]
    instr_tuples.append((opcode_idxs, row['cycle_count']))

In [None]:
# Inverse of d: dense index -> opcode string.
d_rev = dict((index, opcode) for (opcode, index) in d.items())

In [None]:
# Parse encoding.h lines of the form "/* <number> */ OP_<name>, ..." into
# idx_to_mapping[<number>] = <name>; lines that do not match are skipped.
# NOTE(review): the header path is an absolute, machine-specific location.
idx_to_mapping = {}
_re = re.compile(r'^/\*\s*(\d+)\s*\*/\s*OP_(.*?),.*$')
with open('/home/ithemal/ithemal/common/inputs/encoding.h') as f:
    for line in f:
        match = _re.match(line)
        if match is not None:
            number, name = match.groups()
            idx_to_mapping[int(number)] = name
def get_opc_of_w_idx(idx):
    """Return the opcode name for dense opcode index `idx`.

    The index is translated back to its opcode number through d_rev, shifted
    by the fixed offset (- 162 + 4), and looked up in idx_to_mapping.
    NOTE(review): the meaning of the 162 and 4 constants is not documented
    here -- confirm against the encoding they come from.
    """
    opcode_number = int(d_rev[idx])
    return idx_to_mapping[opcode_number - 162 + 4]

In [None]:
# arr[i, j] counts how many times opcode index j occurs in block i.
# dtype=int replaces np.int, which was only an alias for the builtin int and
# has been removed from recent NumPy releases (it raises AttributeError there).
arr = np.zeros((len(instr_tuples), len(d)), dtype=int)
for i, (opcs, _) in enumerate(instr_tuples):
    for opc in opcs:
        arr[i, opc] += 1
# Measured cycle count of each block, in the same row order as arr.
cost_ys = np.array([cycles for (_, cycles) in instr_tuples])
# 1.0 where the opcode occurs at least once in the block, 0.0 otherwise.
arr_mask = (arr > 0).astype(np.float32)

In [None]:
import cvxpy as cp
# Randomly ordered selection of block rows; the sample size equals the number
# of blocks, so every block is included exactly once.
n_blocks = len(cost_ys)
subset = random.sample(range(n_blocks), n_blocks)
# One non-negative cost variable per opcode index.
w = cp.Variable(len(d))
preds = arr[subset] * w
# Objective: mean absolute relative error of the predictions.
errors = cp.abs(preds - cost_ys[subset]) / cost_ys[subset]
expr = cp.sum(errors) / len(subset)
prob = cp.Problem(cp.Minimize(expr), [w >= 0])

In [None]:
# Starting point for the solver: every opcode cost set to 35.
w.value = np.full(len(d), 35.0)

In [None]:
# Solve the cvxpy problem with the SCS solver (verbose output, warm start
# requested) and report the wall time of the call.
%time prob.solve(solver='SCS', verbose=True, warm_start=True)

In [None]:
# Fitted cost per opcode name, truncated to an integer.
{get_opc_of_w_idx(i): int(w.value[i]) for i in range(len(d))}

In [None]:
# NOTE(review): this calls an underscore-prefixed cvxpy method; it looks like a
# debugging aid and may not exist in other cvxpy versions -- confirm.
prob._find_candidate_solvers()

In [None]:
import cvxopt.modeling
# Rebuild the same fit with cvxopt.modeling. This rebinds w, preds, errors and
# expr, which were previously bound to the cvxpy objects.
w = cvxopt.modeling.variable(len(d))
cys = cvxopt.matrix(cost_ys[subset])
preds = cvxopt.matrix(arr[subset], tc='d') * w
# One absolute-relative-error term per selected block.
# NOTE(review): abs(preds - cys) is rebuilt on every iteration of the
# comprehension; confirm whether it can be hoisted out.
errors = [abs(preds - cys)[i] / cys[i] for i in range(len(subset))]
# Mean of the per-block error terms.
expr = sum(errors) / len(errors)

In [None]:
# NOTE(review): `pr` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh run. It looks like an unfinished expression
# (perhaps `prob` or `preds`) -- complete it or remove the cell.
pr

In [None]:
def F(x=None, z=None):
    """Unfinished objective callback.

    Called with no arguments it returns the tuple
    ``(len(d), column vector of len(d) values all equal to 33)``.
    NOTE(review): the (x, z) signature resembles the callback convention of
    cvxopt's nonlinear solvers -- confirm before completing it.

    The branches for ``x`` given (with or without ``z``) were never written;
    the original cell ended in an ``if`` with no body, which is a syntax
    error. They now raise NotImplementedError so the cell parses and the gap
    is explicit.

    Raises:
        NotImplementedError: if ``x`` or ``z`` is supplied.
    """
    if x is None and z is None:
        return (len(d), np.zeros((len(d), 1)) + 33)
    if z is None:
        raise NotImplementedError('F(x): objective/gradient evaluation is not implemented')
    raise NotImplementedError('F(x, z): Hessian evaluation is not implemented')

In [None]:
# Minimise expr subject to w >= 0 using cvxopt.modeling (rebinds prob).
prob = cvxopt.modeling.op(expr, [w >= 0])

In [None]:
prob.solve()

In [None]:
# First element of the value of expr (the mean of the error terms).
expr.value()[0]

In [None]:
# First element of the value of the error term at position 5 of errors.
errors[5].value()[0]

In [None]:
# Fitted cost per opcode name.
{get_opc_of_w_idx(i): w.value[i] for i in range(len(d))}

In [None]:
# NOTE(review): displays the whole frame; consider instrs.head() instead.
instrs

In [None]:
# All opcode numbers seen in the data, in ascending numeric order.
sorted(list(map(int, d.keys())))

In [None]:
class LinearModel(torch.nn.Module):
    """Per-opcode cost model whose forward pass returns its own training loss.

    The module stores the three arrays it is given and one learnable vector
    ``instr_costs``; ``forward`` returns the mean absolute relative error of
    the predicted block costs for the requested rows.
    """

    def __init__(self, arr_xs, arr_mask, cost_ys):
        """Store the data arrays and create the learnable cost vector.

        Args:
            arr_xs: array indexed by row in ``forward``; its selected rows are
                then used as an index into ``instr_costs``.
            arr_mask: array indexed by row and multiplied into the looked-up
                costs.
            cost_ys: array of target values indexed by row.
        """
        super(LinearModel, self).__init__()
        # len(d) + 1 learnable values, all initialised to 33. `d` is the
        # notebook-global opcode dict, not a constructor argument.
        # NOTE(review): the extra slot presumably reserves index 0 -- confirm.
        self.instr_costs = torch.nn.Parameter(torch.zeros(len(d) + 1, requires_grad=True) + 33)
        self.arr_xs = arr_xs
        self.arr_mask = arr_mask
        self.cost_ys = cost_ys

    def forward(self, idxs):
        """Return the mean absolute relative error over the rows in ``idxs``."""
        xs = self.arr_xs[idxs]
        mask = self.arr_mask[idxs]
        ys = self.cost_ys[idxs]
        # NOTE(review): xs is used as an *index* into instr_costs. The notebook
        # cell that builds `arr` fills it with per-opcode counts, so this looks
        # up instr_costs[count] rather than weighting counts by cost -- confirm
        # which layout of arr_xs this class expects.
        # NOTE(review): the list-wrapped index [[xs]] and mixing a tensor with
        # what may be numpy arrays (mask, ys) are version-sensitive -- verify.
        instr_costs = self.instr_costs[[xs]] * mask
        # Sum the per-position costs along dim 1 and compare with the targets.
        return torch.mean(torch.abs(instr_costs.sum(dim=1) - ys) / ys)

In [None]:
# Row indices of every block. Defined here because the only other definition
# of `idxs` lives in a cell *below* this one, so on a fresh kernel the call
# lm(idxs) would otherwise raise NameError.
idxs = list(range(len(cost_ys)))
lm = LinearModel(arr, arr_mask, cost_ys)
# Loss of the untrained model over all blocks.
lm(idxs)

In [None]:
# Row indices of every block; run_epoch shuffles this list in place.
idxs = list(range(len(cost_ys)))

def run_epoch(optimizer, bsize=1000):
    """Run one pass of mini-batch training of the global model `lm`.

    The global index list `idxs` is shuffled in place and then consumed in
    consecutive slices of `bsize` indices. Every index is visited: the final
    slice may be shorter than `bsize`. (Previously the trailing partial batch
    was skipped, so data sets smaller than one batch were never trained on.)

    Args:
        optimizer: object providing zero_grad() and step(), e.g. a
            torch.optim optimizer over lm's parameters.
        bsize: number of indices per batch; defaults to the former hard-coded
            value of 1000.

    Raises:
        ValueError: if bsize is not positive.
    """
    if bsize <= 0:
        raise ValueError('bsize must be positive, got {}'.format(bsize))
    random.shuffle(idxs)
    for start in range(0, len(idxs), bsize):
        optimizer.zero_grad()
        # lm returns the loss of the batch directly.
        loss = lm(idxs[start:start + bsize])
        loss.backward()
        optimizer.step()

In [None]:
# Adam optimizer over the model's parameters with learning rate 3e-2.
lm_opt = torch.optim.Adam(lm.parameters(), lr=3e-2)

In [None]:
# Train for 100 epochs; the loss over all blocks is printed after epochs
# 1, 6, 11, ... (every fifth epoch, starting with the first).
for i in range(100):
    epoch_number = i + 1
    print('Epoch {}'.format(epoch_number))
    run_epoch(lm_opt)
    if i % 5 != 0:
        continue
    print('Loss: {}'.format(lm(idxs).item()))

In [None]:
# Loss of the model over all blocks.
lm(idxs)

In [None]:
# Learned value at each opcode's dense index, keyed by opcode string.
# NOTE(review): instr_costs has len(d) + 1 entries while d's indices start at
# 0 -- confirm that index v (not v + 1) is the intended slot.
{k: lm.instr_costs[v] for (k, v) in d.items()}

In [None]:
def cost(xs):
    """Mean absolute relative error of a linear per-opcode cost model.

    Args:
        xs: sequence of len(d) + 1 values. xs[0] is a placeholder slot (the
            optimisation pins it to 0 through its (0, 0) bound); xs[1 + j] is
            the cost assigned to opcode column j of the global count matrix
            `arr`.

    Returns:
        mean(|arr . xs[1:] - cost_ys| / cost_ys) over all blocks.

    Raises:
        ValueError: if xs does not have arr.shape[1] + 1 entries.

    The prediction weights each opcode *count* by that opcode's cost.
    Previously the weight vector was indexed with `arr` itself (xs[arr]),
    which looked up xs[count] and therefore mixed up counts and opcode
    indices, since `arr` holds per-opcode occurrence counts.
    """
    weights = np.asarray(xs, dtype=float)
    if weights.shape != (arr.shape[1] + 1,):
        raise ValueError('expected {} weights, got shape {}'.format(
            arr.shape[1] + 1, weights.shape))
    preds = arr.dot(weights[1:])
    return np.mean(np.abs(preds - cost_ys) / cost_ys)

In [None]:
# Linear baseline with one shared cost for every opcode: sweep that cost over
# 30..42 and evaluate `cost` with a leading 0 in the placeholder slot.
xs = np.linspace(30, 42, num=13)
ys = []
for x in xs:
    uniform_costs = np.array([0] + [x] * len(d))
    ys.append(cost(uniform_costs))

plt.plot(xs, ys)
plt.title('Constant Linear Baseline')
plt.xlabel('Cycles per instruction')
plt.ylabel('Error')
plt.show()

In [None]:
# Starting point and box constraints for the optimiser: the leading slot is
# fixed at 0, every other cost starts at 37 and may range over [0, 10000].
init = [0] + list(np.full(len(d), 37.0))
bounds = [(0, 0)] + [(0, 10000)] * len(d)

In [None]:
# Minimise `cost` from `init` under `bounds` with scipy.optimize.minimize
# (no method is given, so scipy chooses one).
res = opt.minimize(cost, init, bounds=bounds)

In [None]:
# Error at the solution found.
cost(res.x)

In [None]:
# Full optimiser result object.
res

In [None]:
# Toy one-dimensional data set: 32 points (x=1, y=8), 16 points (x=2, y=8) and
# 2 points (x=8, y=2).
fake_xs = [1] * 32 + [2] * 16 + [8] * 2
fake_ys = [8] * 32 + [8] * 16 + [2] * 2
def foo(w):
    """Sum of absolute relative errors of the model y = w * x on the toy data."""
    total = 0
    for x, y in zip(fake_xs, fake_ys):
        total += abs(w * x - y) / float(y)
    return total
# Element-wise version of foo so it can be evaluated on an array of weights.
foo_vec = np.vectorize(foo)

# Plot the toy loss for 100 weights between -5 and 10.
z = np.linspace(-5, 10, 100)
plt.plot(z, foo_vec(z))

In [None]:
import cvxopt

In [None]:
# NOTE(review): displays the whole count matrix; consider arr[:5] or arr.shape.
arr