In [1]:
import numpy as np
import pandas as pd
import json
import dill

In [2]:
# Dataset identifier; it is interpolated into the data/ CSV file names read in the next cell.
model_name = "GEMM_STREAM"

In [3]:
# "*_original" splits: every column is read with dtype=object so pandas does not infer
# numeric dtypes at load time; the demo cells below cast the 'gpa' / 'rip' columns
# explicitly.
# The builtin `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and
# removed in 1.24 (accessing it now raises AttributeError). Both spell the same dtype.
train_set_original = pd.read_csv("data/{}_train_set_original.csv".format(model_name), dtype=object)
val_set_original = pd.read_csv("data/{}_val_set_original.csv".format(model_name), dtype=object)
test_set_original = pd.read_csv("data/{}_test_set_original.csv".format(model_name), dtype=object)

# Companion splits without the "_original" suffix, read with pandas' default type inference.
train_set = pd.read_csv("data/{}_train_set.csv".format(model_name))
val_set = pd.read_csv("data/{}_val_set.csv".format(model_name))
test_set = pd.read_csv("data/{}_test_set.csv".format(model_name))

In [4]:
def _load_pickled(path):
    """Return the object deserialized by dill from the binary file at ``path``."""
    # NOTE(review): dill/pickle deserialization can execute arbitrary code contained in
    # the file, so only load pipeline artifacts that come from a trusted source.
    with open(path, "rb") as handle:
        return dill.load(handle)


# Fitted preprocessing pipelines, one per feature (GPA and RIP).
pipeline_gpa = _load_pickled("static/pipeline_gpa.pkl")
pipeline_rip = _load_pickled("static/pipeline_rip.pkl")

## Load Vocabulary and Instantiate as dictionary : GPA

In [5]:
# Read the GPA vocabulary file: one signed 64-bit integer per line.
vocabulary_gpa = np.genfromtxt(
    fname="static/vocabulary_gpa.csv",
    delimiter="\n",
    dtype=np.int64,
)
vocabulary_gpa

array([     -4096,          0,         -1, ...,  157474816, 1917661184,
       -726827008], dtype=int64)

In [6]:
# Turn the vocabulary array into a lookup table: value -> position in the vocabulary file.
vocabulary_gpa = dict(zip(vocabulary_gpa, range(len(vocabulary_gpa))))

In [7]:
# Preview the first ten (value -> index) entries, in insertion order.
{value: index for value, index in list(vocabulary_gpa.items())[:10]}

{-4096: 0,
 0: 1,
 -1: 2,
 -1638400: 3,
 -1896448: 4,
 -8192: 5,
 -4060: 6,
 6750208: 7,
 -12288: 8,
 333: 9}

## Demo : GPA

In [8]:
# First 30 raw GPA values of the training split, converted object -> float64 -> int64.
# NOTE(review): the float64 hop cannot represent every 64-bit integer exactly; the values
# shown here are small enough not to be affected, but confirm this holds for the full column.
train_original_gpa_sample = (
    train_set_original['gpa']
    .iloc[:30]
    .values
    .astype(np.float64)
    .astype(np.int64)
)
train_original_gpa_sample

array([ 988827648,  705122304,  705122304, 1227321344, 1227321344,
       1884864512, 1884864512,  576954368,  576954368,  714526720,
        714526720,  110542848,  110542848, 1124007936, 1124007936,
        337944576,  337944576,  718028800,  718028800, 1236467712,
       1236467712, 1564139520, 1656479744, 1181954048, 1181954048,
       1826820096, 1826820096, 1560858624, 1560858624, 1440399360],
      dtype=int64)

In [9]:
# Encode the raw GPA sample with the fitted GPA pipeline.
# The displayed result has one element fewer than the 30-value input, and its entries line
# up with the deltas computed two cells below, so the pipeline presumably differences
# consecutive values before mapping them to vocabulary indices — TODO confirm against the
# pipeline definition.
processed_train_gpa_sample = pipeline_gpa.transform(train_original_gpa_sample)
processed_train_gpa_sample

array([     2,      1,      2,      1,      2,      1, 121120,      1,
       130903,      1,      2,      1,      2,      1,      2,      1,
            2,      1,      2,      1,      2,  68102,  18498,      1,
            2,      1,      2,      1,  69035])

In [10]:
# Reference delta computed by hand: each value minus the value that follows it
# (previous - next), giving one element fewer than the input.
train_gpa_sample = np.subtract(
    train_original_gpa_sample[:-1],
    train_original_gpa_sample[1:],
)
train_gpa_sample

array([  283705344,           0,  -522199040,           0,  -657543168,
                 0,  1307910144,           0,  -137572352,           0,
         603983872,           0, -1013465088,           0,   786063360,
                 0,  -380084224,           0,  -518438912,           0,
        -327671808,   -92340224,   474525696,           0,  -644866048,
                 0,   265961472,           0,   120459264], dtype=int64)

In [11]:
# Decode the vocabulary indices back into delta values with the fitted pipeline.
# Note that irreversible predictions are returned as -1 (see the -1 entries in the output
# below, at the positions whose encoded index is 2).
train_gpa_sample_inverse_transformed = pipeline_gpa.inverse_transform(processed_train_gpa_sample)
train_gpa_sample_inverse_transformed

array([        -1,          0,         -1,          0,         -1,
                0, 1307910144,          0, -137572352,          0,
               -1,          0,         -1,          0,         -1,
                0,         -1,          0,         -1,          0,
               -1,  -92340224,  474525696,          0,         -1,
                0,         -1,          0,  120459264], dtype=int64)

In [12]:
# Position 3 in three views: encoded index / hand-computed delta / delta restored by
# inverse_transform.
tuple(
    view[3]
    for view in (
        processed_train_gpa_sample,
        train_gpa_sample,
        train_gpa_sample_inverse_transformed,
    )
)

(1, 0, 0)

In [13]:
# Look up the vocabulary index assigned to the delta -94208.
# NOTE(review): -94208 does not occur among the sample deltas displayed above (the
# preceding cell is about delta 0, whose index is 1) — confirm which key was intended.
vocabulary_gpa[-94208]

128

In [14]:
# Position 21 in three views: encoded index / hand-computed delta / delta restored by
# inverse_transform.
tuple(
    view[21]
    for view in (
        processed_train_gpa_sample,
        train_gpa_sample,
        train_gpa_sample_inverse_transformed,
    )
)

(68102, -92340224, -92340224)

In [15]:
# Vocabulary index of -1, the value inverse_transform returns for irreversible entries.
vocabulary_gpa[-1]

2

## Interpretation
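`vocabulary_gpa[-1]` returns 2, so -1 is encoded as index 2. A delta that was pruned from the vocabulary (for example 283705344, the first delta of the sample above, which is encoded as 2) is substituted with -1 and is therefore restored as -1 by `inverse_transform`.  
Indices begin at 0, so index 2 means that -1 is the third entry of the vocabulary, i.e. the third most frequent one.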

## Load Vocabulary and Instantiate as dictionary : RIP

In [16]:
# Read the RIP vocabulary file: one unsigned 64-bit integer per line.
vocabulary_rip = np.genfromtxt(
    fname="static/vocabulary_rip.csv",
    delimiter="\n",
    dtype=np.uint64,
)
vocabulary_rip

array([                   0, 18446744073709551604, 18446744073709551605,
       ..., 18446649486043795977,              6396599,
            140024691984546], dtype=uint64)

In [17]:
# Turn the vocabulary array into a lookup table: value -> position in the vocabulary file.
vocabulary_rip = dict(zip(vocabulary_rip, range(len(vocabulary_rip))))

In [18]:
# Preview the first ten (value -> index) entries, in insertion order.
{value: index for value, index in list(vocabulary_rip.items())[:10]}

{0: 0,
 18446744073709551604: 1,
 18446744073709551605: 2,
 35: 3,
 12: 4,
 18446744073709551615: 5,
 18446744073709551581: 6,
 11: 7,
 23: 8,
 18446744073709551593: 9}

## Demo : RIP

In [19]:
# First 30 raw RIP values of the training split, converted object -> float64 -> uint64.
# NOTE(review): float64 cannot represent every 64-bit integer, so values of this magnitude
# may be rounded by the intermediate cast. Confirm that the fitted pipeline expects the
# same conversion before changing it.
train_original_rip_sample = (
    train_set_original['rip']
    .iloc[:30]
    .values
    .astype(np.float64)
    .astype(np.uint64)
)
train_original_rip_sample

array([18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800],
      dtype=uint64)

In [20]:
# Encode the raw RIP sample with the fitted RIP pipeline.
# As with the GPA demo, the displayed result has one element fewer than the 30-value input.
# Every entry is 0, which is the vocabulary index of delta 0 (all sampled RIP values are equal).
processed_train_rip_sample = pipeline_rip.transform(train_original_rip_sample)
processed_train_rip_sample

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [21]:
# Reference delta computed by hand: each value minus the value that follows it
# (previous - next). The operands are uint64, so the result stays unsigned.
train_rip_sample = np.subtract(
    train_original_rip_sample[:-1],
    train_original_rip_sample[1:],
)
train_rip_sample

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=uint64)

In [22]:
# Decode the vocabulary indices back into delta values. Irreversible predictions are
# returned as -1 by the pipeline.
train_rip_sample_inverse_transformed = pipeline_rip.inverse_transform(processed_train_rip_sample)
# The decoded entries expose .decode() (presumably bytes — TODO confirm); convert them to
# text first and then to unsigned 64-bit integers.
# NOTE(review): a "-1" entry cannot be represented as -1 in uint64 — confirm how the
# irreversible case should be handled for RIP.
train_rip_sample_inverse_transformed = np.array(
    [raw_token.decode() for raw_token in train_rip_sample_inverse_transformed]
).astype(np.uint64)
train_rip_sample_inverse_transformed

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=uint64)

In [23]:
# Position 3 in three views: encoded index / hand-computed delta / delta restored by
# inverse_transform.
tuple(
    view[3]
    for view in (
        processed_train_rip_sample,
        train_rip_sample,
        train_rip_sample_inverse_transformed,
    )
)

(0, 0, 0)

In [24]:
# Vocabulary index of delta 0, the only delta present in the RIP sample above.
vocabulary_rip[0]

0

In [25]:
# Position 21 in three views: encoded index / hand-computed delta / delta restored by
# inverse_transform.
tuple(
    view[21]
    for view in (
        processed_train_rip_sample,
        train_rip_sample,
        train_rip_sample_inverse_transformed,
    )
)

(0, 0, 0)

In [26]:
# Vocabulary index of 18446744073709551581, i.e. 2**64 - 35 (the unsigned 64-bit
# wrap-around of -35). This key is one of the first ten vocabulary entries previewed above.
vocabulary_rip[18446744073709551581]

6

## Minimum Category Occurrence (=Threshold)

In [33]:
# Display the "noise_tokenizer" step's minimum category occurrence for both pipelines
# (GPA first, then RIP). The attribute name is spelled `minimum_category_occurence` in
# this code, so that spelling must be kept when accessing it.
# NOTE(review): this expression evaluates to a 2-tuple, but the recorded output shows a
# single value and the execution count jumps from 26 to 33 — the saved output looks
# stale; re-run the notebook on a fresh kernel.
pipeline_gpa["noise_tokenizer"].minimum_category_occurence, pipeline_rip["noise_tokenizer"].minimum_category_occurence

2