In [1]:
import numpy as np
import pandas as pd
import json
import dill

In [2]:
# Dataset identifier; it is substituted into the CSV file names that are read in the next cell.
model_name = "GEMM_STREAM_VMID=17926"

In [3]:
# Raw (unprocessed) splits: every column is kept as a Python object so that pandas
# does not interpret the values while loading.
# The builtin `object` is used instead of `np.object`, a deprecated alias that was
# removed in NumPy 1.24 (accessing it raises AttributeError there).
train_set_original = pd.read_csv("data/{}_train_set_original.csv".format(model_name), dtype=object)
val_set_original = pd.read_csv("data/{}_val_set_original.csv".format(model_name), dtype=object)
test_set_original = pd.read_csv("data/{}_test_set_original.csv".format(model_name), dtype=object)

# Processed splits: loaded with pandas' default dtype inference.
train_set = pd.read_csv("data/{}_train_set.csv".format(model_name))
val_set = pd.read_csv("data/{}_val_set.csv".format(model_name))
test_set = pd.read_csv("data/{}_test_set.csv".format(model_name))

In [4]:
# Load the two serialized preprocessing pipelines with dill: one is used for the
# 'gpa' column and one for the 'rip' column in the demo cells further down.
# NOTE(review): like pickle, dill.load can execute arbitrary code stored in the file;
# only load artifacts that come from a trusted source.
with open("static/pipeline_gpa.pkl", "rb") as f:
    pipeline_gpa = dill.load(f)

with open("static/pipeline_rip.pkl", "rb") as f:
    pipeline_rip = dill.load(f)

## Load Vocabulary and Instantiate as dictionary : GPA

In [5]:
# Read the GPA vocabulary file as signed 64-bit integers.
# The delimiter is the newline character, so each line is parsed as a single value.
vocabulary_gpa = np.genfromtxt("static/vocabulary_gpa.csv", delimiter="\n", dtype=np.int64)
# Display the loaded array.
vocabulary_gpa

array([   -4096,       -1,        0, ...,  6201344, -6291456,  3141632],
      dtype=int64)

In [6]:
# Build a lookup table from each vocabulary value to its position in the loaded sequence.
vocabulary_gpa = dict(zip(vocabulary_gpa, range(len(vocabulary_gpa))))

In [7]:
# Preview the first ten (value -> index) entries of the GPA lookup table.
dict(list(vocabulary_gpa.items())[:10])

{-4096: 0,
 -1: 1,
 0: 2,
 -1638400: 3,
 -1896448: 4,
 -8192: 5,
 -4060: 6,
 6750208: 7,
 -864256: 8,
 -430892: 9}

## Demo : GPA

In [8]:
# Take the first 30 raw 'gpa' values and convert them to int64.
# The raw column was loaded with an object dtype, so the values are first parsed as
# float64 and then cast to integers.
# NOTE(review): float64 represents integers exactly only up to 2**53. The values shown
# below are far smaller, but confirm this holds for the whole column.
train_original_gpa_sample = train_set_original['gpa'].values[:30].astype(np.float64).astype(np.int64)
train_original_gpa_sample

array([17569096718, 22541336944, 17569096718, 22541336948, 17569096718,
       22541336952, 17569096718, 17569096385, 21576007680, 17569096385,
       17569096718, 22541336944, 17569096718, 22541336948, 17569096718,
       22541336952, 17569096718, 17569096385, 21576007680, 17569096385,
       17569096718, 22541336944, 17569096718, 22541336948, 17569096718,
       22541336952, 17569096718, 17569096385, 21576007680, 17569096385],
      dtype=int64)

In [9]:
# Encode the raw sample with the GPA pipeline.
# The displayed result has one element fewer than the input (29 vs. 30), which is
# consistent with the pipeline encoding deltas between consecutive values.
processed_train_gpa_sample = pipeline_gpa.transform(train_original_gpa_sample)
processed_train_gpa_sample

array([ 19,  18,  16,  17,  15,  20,  21,   1,   1, 177,  19,  18,  16,
        17,  15,  20,  21,   1,   1, 177,  19,  18,  16,  17,  15,  20,
        21,   1,   1])

In [10]:
# Delta between consecutive raw values, computed as "previous minus next":
# element i is sample[i] - sample[i + 1], so 30 values yield 29 deltas.
train_gpa_sample = train_original_gpa_sample[:-1] - train_original_gpa_sample[1:]
train_gpa_sample

array([-4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333, -4006911295,  4006911295,        -333,
       -4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333, -4006911295,  4006911295,        -333,
       -4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333, -4006911295,  4006911295], dtype=int64)

In [11]:
# Map the encoded indices back to delta values with the GPA pipeline.
# Note that irreversible predictions are returned as -1.
train_gpa_sample_inverse_transformed = pipeline_gpa.inverse_transform(processed_train_gpa_sample)
train_gpa_sample_inverse_transformed

array([-4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333,          -1,          -1,        -333,
       -4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333,          -1,          -1,        -333,
       -4972240226,  4972240226, -4972240230,  4972240230, -4972240234,
        4972240234,         333,          -1,          -1], dtype=int64)

In [12]:
# Compare position 3 across the three views:
# Processed (encoded index) / Original Delta / Inverse_transformed (= restored from prediction)
processed_train_gpa_sample[3], train_gpa_sample[3], train_gpa_sample_inverse_transformed[3]

(17, 4972240230, 4972240230)

In [13]:
# Look up the integer index assigned to the value -94208 in the GPA lookup table.
vocabulary_gpa[-94208]

122

In [14]:
# Compare position 21 across the three views:
# Processed (encoded index) / Original Delta / Inverse_transformed (= restored from prediction)
processed_train_gpa_sample[21], train_gpa_sample[21], train_gpa_sample_inverse_transformed[21]

(18, 4972240226, 4972240226)

In [15]:
# Look up the integer index assigned to -1, the value returned for irreversible predictions.
vocabulary_gpa[-1]

1

## Interpretation
This indicates that 4712300544 is pruned and replaced with -1.  
Also, -1 is mapped to integer index 1, which indicates that -1 is the second most frequent vocabulary entry (indices start at 0).

## Load Vocabulary and Instantiate as dictionary : RIP

In [16]:
# Read the RIP vocabulary file as unsigned 64-bit integers.
# The delimiter is the newline character, so each line is parsed as a single value.
# NOTE(review): entries close to 2**64 in the output are presumably negative deltas
# stored in unsigned form; confirm against the code that wrote the file.
vocabulary_rip = np.genfromtxt("static/vocabulary_rip.csv", delimiter="\n", dtype=np.uint64)
# Display the loaded array.
vocabulary_rip

array([                   0, 18446744073709551604, 18446744073709551605,
       ...,      140237724969129,               292896,
                     524368], dtype=uint64)

In [17]:
# Build a lookup table from each vocabulary value to its position in the loaded sequence.
vocabulary_rip = dict((value, index) for index, value in enumerate(vocabulary_rip))

In [18]:
# Preview the first ten (value -> index) entries of the RIP lookup table.
dict(list(vocabulary_rip.items())[:10])

{0: 0,
 18446744073709551604: 1,
 18446744073709551605: 2,
 35: 3,
 12: 4,
 18446744073709551581: 5,
 11: 6,
 23: 7,
 18446744073709551593: 8,
 18446744073709551592: 9}

## Demo : RIP

In [19]:
# Take the first 30 raw 'rip' values and convert them to uint64 via float64.
# NOTE(review): float64 has a 53-bit significand, so integers of the magnitude shown
# below (~1.8e19) can only be represented in steps of 2048. The low bits of the original
# values may be lost in this round trip. Confirm whether that is acceptable, or parse the
# raw strings as integers directly if the CSV stores them in integer form.
train_original_rip_sample = train_set_original['rip'].values[:30].astype(np.float64).astype(np.uint64)
train_original_rip_sample

array([18446744072500750336, 18446744072500750336, 18446744072500750336,
       18446744072500750336, 18446744072500750336, 18446744072500750336,
       18446744072500750336, 18446744072500748288, 18446744072500748288,
       18446744072500748288, 18446744072500750336, 18446744072500750336,
       18446744072500750336, 18446744072500750336, 18446744072500750336,
       18446744072500750336, 18446744072500750336, 18446744072500748288,
       18446744072500748288, 18446744072500748288, 18446744072500750336,
       18446744072500750336, 18446744072500750336, 18446744072500750336,
       18446744072500750336, 18446744072500750336, 18446744072500750336,
       18446744072500748288, 18446744072500748288, 18446744072500748288],
      dtype=uint64)

In [20]:
# Encode the raw sample with the RIP pipeline.
# The displayed result has one element fewer than the input (29 vs. 30), which is
# consistent with the pipeline encoding deltas between consecutive values.
processed_train_rip_sample = pipeline_rip.transform(train_original_rip_sample)
processed_train_rip_sample

array([   0,    0,    0,    0,    0,    0,   11,    0,    0, 1325,    0,
          0,    0,    0,    0,    0,   11,    0,    0, 1325,    0,    0,
          0,    0,    0,    0,   11,    0,    0])

In [21]:
# Delta between consecutive raw values, computed as "previous minus next":
# element i is sample[i] - sample[i + 1], so 30 values yield 29 deltas.
# The operands are uint64, so a negative difference wraps around: -d is shown as
# 2**64 - d (e.g. 18446744073709549568 in the output corresponds to -2048).
train_rip_sample = train_original_rip_sample[:-1] - train_original_rip_sample[1:]
train_rip_sample

array([                   0,                    0,                    0,
                          0,                    0,                    0,
                       2048,                    0,                    0,
       18446744073709549568,                    0,                    0,
                          0,                    0,                    0,
                          0,                 2048,                    0,
                          0, 18446744073709549568,                    0,
                          0,                    0,                    0,
                          0,                    0,                 2048,
                          0,                    0], dtype=uint64)

In [22]:
# Map the encoded indices back to delta values with the RIP pipeline.
# Note that irreversible predictions are returned as -1.
train_rip_sample_inverse_transformed = pipeline_rip.inverse_transform(processed_train_rip_sample)
# Each returned element is decoded with .decode() (i.e. handled as a byte string) and the
# resulting strings are cast to uint64.
# NOTE(review): 18446744073709551615 (= 2**64 - 1) in the output is presumably the -1
# marker after the unsigned cast; confirm.
train_rip_sample_inverse_transformed = np.array([t.decode() for t in train_rip_sample_inverse_transformed]).astype(np.uint64)
train_rip_sample_inverse_transformed

array([                   0,                    0,                    0,
                          0,                    0,                    0,
       18446744073709551615,                    0,                    0,
       18446744073709549568,                    0,                    0,
                          0,                    0,                    0,
                          0, 18446744073709551615,                    0,
                          0, 18446744073709549568,                    0,
                          0,                    0,                    0,
                          0,                    0, 18446744073709551615,
                          0,                    0], dtype=uint64)

In [23]:
# Compare position 3 across the three views:
# Processed (encoded index) / Original Delta / Inverse_transformed (= restored from prediction)
processed_train_rip_sample[3], train_rip_sample[3], train_rip_sample_inverse_transformed[3]

(0, 0, 0)

In [24]:
# Look up the integer index assigned to the delta value 0 in the RIP lookup table.
vocabulary_rip[0]

0

In [25]:
# Compare position 21 across the three views:
# Processed (encoded index) / Original Delta / Inverse_transformed (= restored from prediction)
processed_train_rip_sample[21], train_rip_sample[21], train_rip_sample_inverse_transformed[21]

(0, 0, 0)

In [26]:
# Number of non-null 'gpa' entries in the processed training set.
# Series.count() gives the same number as summing value_counts() (both ignore NaN)
# without building the intermediate frequency table.
train_set['gpa'].count()

4955473

In [27]:
# Number of distinct non-null 'gpa' values in the processed training set.
# Series.nunique() gives the same number as value_counts().value_counts().sum()
# (one row per distinct value, NaN ignored) in a single, direct call.
train_set['gpa'].nunique()

3586