In [1]:
import numpy as np
import pandas as pd
import json
import dill

In [2]:
# Name of the model/workload; it is interpolated into the data/ CSV file names
# that the loading cell reads.
model_name = "GEMM_EX"

In [3]:
# "Original" splits: every column is read with dtype=object so values keep the
# exact textual form stored in the CSV (no numeric inference by pandas).
# The builtin `object` replaces `np.object`, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (it was only ever an alias of the builtin).
train_set_original = pd.read_csv("data/{}_train_set_original.csv".format(model_name), dtype=object)
val_set_original = pd.read_csv("data/{}_val_set_original.csv".format(model_name), dtype=object)
test_set_original = pd.read_csv("data/{}_test_set_original.csv".format(model_name), dtype=object)

# Companion splits without the `_original` suffix, read with pandas' default
# dtype inference (presumably the preprocessed counterparts — confirm).
train_set = pd.read_csv("data/{}_train_set.csv".format(model_name))
val_set = pd.read_csv("data/{}_val_set.csv".format(model_name))
test_set = pd.read_csv("data/{}_test_set.csv".format(model_name))

In [4]:
# Restore the serialized GPA and RIP pipeline objects with dill.
# NOTE: deserializing a pickle/dill file can execute arbitrary code, so only
# load artifacts that come from a trusted source.
with open("static/pipeline_gpa.pkl", mode="rb") as gpa_pickle:
    pipeline_gpa = dill.load(gpa_pickle)

with open("static/pipeline_rip.pkl", mode="rb") as rip_pickle:
    pipeline_rip = dill.load(rip_pickle)

## Load Vocabulary and Instantiate as dictionary : GPA

In [5]:
# Read the GPA vocabulary file: newline-separated values parsed as signed
# 64-bit integers. The position of a value in the file is its vocabulary index.
vocabulary_gpa = np.genfromtxt(
    "static/vocabulary_gpa.csv",
    dtype=np.int64,
    delimiter="\n",
)
vocabulary_gpa

array([      -4096,          -1,           0, ..., -4816109568,
         -28835840,   135725056], dtype=int64)

In [6]:
# Turn the vocabulary into a lookup table: delta value -> integer index
# (the index is the value's position in the vocabulary, starting at 0).
vocabulary_gpa = {
    delta: index
    for index, delta in enumerate(vocabulary_gpa)
}

In [7]:
# Preview the first ten (delta -> index) entries of the GPA vocabulary.
{key: vocabulary_gpa[key] for key in list(vocabulary_gpa)[:10]}

{-4096: 0,
 -1: 1,
 0: 2,
 -1638400: 3,
 -1896448: 4,
 -864256: 5,
 -8192: 6,
 -4060: 7,
 -13920476522: 8,
 13920476518: 9}

## Demo : GPA

In [8]:
# Take the first 30 raw GPA values and convert them to int64, going through
# float64 on the way (the column was loaded as objects, not numbers).
train_original_gpa_sample = (
    train_set_original['gpa']
    .values[:30]
    .astype(np.float64)
    .astype(np.int64)
)
train_original_gpa_sample

array([14190604288, 14352732392, 14121463808, 14120734720, 14120828928,
       19471331328, 14177644544, 14179282944, 17110061056, 17111699456,
       14228705280, 14239969280, 14241607680, 14243504128, 19440017408,
       18927992832, 18927996928, 18928001024, 18928005120, 18928009216,
       18928029696, 18928037888, 14215737344, 18962931712, 14169337856,
       14170099712, 14171131904, 14170525696, 14172348416, 14171140096],
      dtype=int64)

In [9]:
# Encode the raw GPA sample with the loaded pipeline. The displayed result has
# 29 entries for 30 inputs, i.e. one vocabulary index per consecutive delta.
processed_train_gpa_sample = pipeline_gpa.transform(train_original_gpa_sample)
processed_train_gpa_sample

array([   1,    1,  547,   88,    1,    1,    3, 7256,    3, 5044, 3018,
          3,    4, 4836,    1,    0,    0,    0,    0,   97,    6,    1,
          1,    1,  580,  457,  365, 2576,  724])

In [10]:
# Manual delta for comparison with the pipeline: each element minus its
# successor (x[i] - x[i+1]), giving 29 deltas for the 30 sampled values.
train_gpa_sample = np.subtract(
    train_original_gpa_sample[:-1],
    train_original_gpa_sample[1:],
)
train_gpa_sample

array([ -162128104,   231268584,      729088,      -94208, -5350502400,
        5293686784,    -1638400, -2930778112,    -1638400,  2882994176,
         -11264000,    -1638400,    -1896448, -5196513280,   512024576,
             -4096,       -4096,       -4096,       -4096,      -20480,
             -8192,  4712300544, -4747194368,  4793593856,     -761856,
          -1032192,      606208,    -1822720,     1208320], dtype=int64)

In [11]:
# Map the encoded indices back to delta values. Entries that cannot be
# restored (deltas that are not kept in the vocabulary) come back as the
# sentinel value -1.
train_gpa_sample_inverse_transformed = pipeline_gpa.inverse_transform(processed_train_gpa_sample)
train_gpa_sample_inverse_transformed

array([         -1,          -1,      729088,      -94208,          -1,
                -1,    -1638400, -2930778112,    -1638400,  2882994176,
         -11264000,    -1638400,    -1896448, -5196513280,          -1,
             -4096,       -4096,       -4096,       -4096,      -20480,
             -8192,          -1,          -1,          -1,     -761856,
          -1032192,      606208,    -1822720,     1208320], dtype=int64)

In [12]:
# Position 3, shown as a triple:
#   encoded index / manually computed delta / delta restored by inverse_transform
(
    processed_train_gpa_sample[3],
    train_gpa_sample[3],
    train_gpa_sample_inverse_transformed[3],
)

(88, -94208, -94208)

In [13]:
# Vocabulary index of the delta -94208; it should equal the encoded value the
# pipeline produced for position 3.
vocabulary_gpa[-94208]

88

In [14]:
# Position 21, shown as a triple:
#   encoded index / manually computed delta / delta restored by inverse_transform
(
    processed_train_gpa_sample[21],
    train_gpa_sample[21],
    train_gpa_sample_inverse_transformed[21],
)

(1, 4712300544, -1)

In [15]:
# Vocabulary index assigned to the sentinel value -1 (the value returned for
# deltas that cannot be restored).
vocabulary_gpa[-1]

1

## Interpretation
This shows that the delta 4712300544 was pruned from the vocabulary and replaced with the sentinel value -1.  
In addition, -1 is mapped to integer index 1 (indices start at 0), which indicates that -1 is the second most frequent entry in the vocabulary.

## Load Vocabulary and Instantiate as dictionary : RIP

In [18]:
# Read the RIP vocabulary file: newline-separated values parsed as unsigned
# 64-bit integers. The position of a value in the file is its vocabulary index.
vocabulary_rip = np.genfromtxt(
    "static/vocabulary_rip.csv",
    dtype=np.uint64,
    delimiter="\n",
)
vocabulary_rip

array([                   0, 18446744073709551604, 18446744073709551605,
       ..., 18446744071867908861,                 3936,
       18446744071878781462], dtype=uint64)

In [19]:
# Turn the vocabulary into a lookup table: delta value -> integer index
# (the index is the value's position in the vocabulary, starting at 0).
vocabulary_rip = {
    delta: index
    for index, delta in enumerate(vocabulary_rip)
}

In [20]:
# Preview the first ten (delta -> index) entries of the RIP vocabulary.
{key: vocabulary_rip[key] for key in list(vocabulary_rip)[:10]}

{0: 0,
 18446744073709551604: 1,
 18446744073709551605: 2,
 35: 3,
 12: 4,
 11: 5,
 18446744073709551592: 6,
 23: 7,
 24: 8,
 18446744073709551593: 9}

## Demo : RIP

In [23]:
# Take the first 30 raw RIP values and convert them to uint64, going through
# float64 on the way (the column was loaded as objects, not numbers).
# NOTE(review): float64 represents integers exactly only up to 2**53, so any
# address above that would be rounded by this round-trip — confirm the data
# stays below that bound.
train_original_rip_sample = (
    train_set_original['rip']
    .values[:30]
    .astype(np.float64)
    .astype(np.uint64)
)
train_original_rip_sample

array([       15410752, 140316942991300,        15410776,        15410764,
              15410764,        15410776,        15410752,        15410764,
              15410776,        15410741,        15410741,        15410752,
              15410764,        15410776,        15410776,        15410752,
              15410764,        15410776,        15410741,        15410752,
              15410776,        15410741,        15410776,        15410776,
              15410764,        15410741,        15410752,        15410741,
              15410741,        15410752], dtype=uint64)

In [24]:
# Encode the raw RIP sample with the loaded pipeline. The displayed result has
# 29 entries for 30 inputs, i.e. one vocabulary index per consecutive delta.
processed_train_rip_sample = pipeline_rip.transform(train_original_rip_sample)
processed_train_rip_sample

array([13, 13,  4,  0,  1,  8,  1,  1,  3,  0,  2,  1,  1,  0,  8,  1,  1,
        3,  2,  6,  3, 10,  0,  4,  7,  2,  5,  0,  2])

In [25]:
# Manual delta for comparison with the pipeline: each element minus its
# successor (x[i] - x[i+1]). The operands are uint64, so the subtraction wraps
# modulo 2**64 and negative differences appear as very large values
# (e.g. -12 is shown as 18446744073709551604).
train_rip_sample = np.subtract(
    train_original_rip_sample[:-1],
    train_original_rip_sample[1:],
)
train_rip_sample

array([18446603756781971068,      140316927580524,                   12,
                          0, 18446744073709551604,                   24,
       18446744073709551604, 18446744073709551604,                   35,
                          0, 18446744073709551605, 18446744073709551604,
       18446744073709551604,                    0,                   24,
       18446744073709551604, 18446744073709551604,                   35,
       18446744073709551605, 18446744073709551592,                   35,
       18446744073709551581,                    0,                   12,
                         23, 18446744073709551605,                   11,
                          0, 18446744073709551605], dtype=uint64)

In [35]:
# Map the encoded indices back to delta values. Entries that cannot be
# restored come back as the sentinel -1, which appears as
# 18446744073709551615 (2**64 - 1) once the result is cast to uint64.
# The pipeline returns items exposing .decode(), so each one is decoded to
# text before the numeric cast.
rip_restored_encoded = pipeline_rip.inverse_transform(processed_train_rip_sample)
train_rip_sample_inverse_transformed = np.array(
    [item.decode() for item in rip_restored_encoded]
).astype(np.uint64)
train_rip_sample_inverse_transformed

array([18446744073709551615, 18446744073709551615,                   12,
                          0, 18446744073709551604,                   24,
       18446744073709551604, 18446744073709551604,                   35,
                          0, 18446744073709551605, 18446744073709551604,
       18446744073709551604,                    0,                   24,
       18446744073709551604, 18446744073709551604,                   35,
       18446744073709551605, 18446744073709551592,                   35,
       18446744073709551581,                    0,                   12,
                         23, 18446744073709551605,                   11,
                          0, 18446744073709551605], dtype=uint64)

In [36]:
# Position 3, shown as a triple:
#   encoded index / manually computed delta / delta restored by inverse_transform
(
    processed_train_rip_sample[3],
    train_rip_sample[3],
    train_rip_sample_inverse_transformed[3],
)

(0, 0, 0)

In [37]:
# Vocabulary index of the delta 0; it should equal the encoded value the
# pipeline produced for position 3.
vocabulary_rip[0]

0

In [38]:
# Position 21, shown as a triple:
#   encoded index / manually computed delta / delta restored by inverse_transform
(
    processed_train_rip_sample[21],
    train_rip_sample[21],
    train_rip_sample_inverse_transformed[21],
)

(10, 18446744073709551581, 18446744073709551581)

In [39]:
# Vocabulary index of the wrapped delta 18446744073709551581; it should equal
# the encoded value the pipeline produced for position 21.
vocabulary_rip[18446744073709551581]

10