In [1]:
import numpy as np
import pandas as pd
import json
import dill

In [2]:
# Dataset/model identifier; it is substituted into the data/*.csv filenames read below.
model_name = "STREAM"

In [3]:
# Raw ("original") splits: every column is loaded with dtype=object so pandas does not
# parse the values numerically at load time; later cells cast the columns explicitly.
# The builtin `object` replaces the `np.object` alias, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (where it raises AttributeError).
train_set_original = pd.read_csv("data/{}_train_set_original.csv".format(model_name), dtype=object)
val_set_original = pd.read_csv("data/{}_val_set_original.csv".format(model_name), dtype=object)
test_set_original = pd.read_csv("data/{}_test_set_original.csv".format(model_name), dtype=object)

# Processed splits: loaded with pandas' default dtype inference.
train_set = pd.read_csv("data/{}_train_set.csv".format(model_name))
val_set = pd.read_csv("data/{}_val_set.csv".format(model_name))
test_set = pd.read_csv("data/{}_test_set.csv".format(model_name))

In [4]:
def _load_pipeline(path):
    """Deserialize one dill-pickled pipeline object from `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, "rb") as handle:
        return dill.load(handle)


pipeline_gpa = _load_pipeline("static/pipeline_gpa.pkl")
pipeline_rip = _load_pipeline("static/pipeline_rip.pkl")

## Load Vocabulary and Instantiate as dictionary : GPA

In [5]:
# Read the GPA vocabulary file into an int64 array, using the newline as the field delimiter.
vocabulary_gpa = np.genfromtxt("static/vocabulary_gpa.csv", delimiter="\n", dtype=np.int64)
vocabulary_gpa

array([        -1,      -4096,          0, ...,  133824512, -189952000,
       -114376704], dtype=int64)

In [6]:
# Rebind the name to a lookup table: vocabulary value -> its position in the loaded array.
vocabulary_gpa = {value: position for position, value in enumerate(vocabulary_gpa)}

In [7]:
# Preview the first ten value -> index pairs of the GPA vocabulary.
gpa_preview = list(vocabulary_gpa.items())[:10]
dict(gpa_preview)

{-1: 0,
 -4096: 1,
 0: 2,
 -12288: 3,
 -8192: 4,
 -643931498: 5,
 643931490: 6,
 -643931494: 7,
 643931494: 8,
 -643931490: 9}

## Demo : GPA

In [8]:
# First 30 raw 'gpa' values, cast to int64 by way of float64.
# NOTE(review): the float64 step is presumably there to tolerate float-formatted
# strings in the CSV -- TODO confirm. float64 represents integers exactly only up
# to 2**53; the values shown below are far smaller than that.
train_original_gpa_sample = train_set_original['gpa'].values[:30].astype(np.float64).astype(np.int64)
train_original_gpa_sample

array([ 251797631, 1007005696,  906555392, 1914261504,  259330048,
        971534336, 1067118592,  914223104, 1067278336, 1055367168,
       1055371264, 1066016768, 1066020864,  970022912,  970027008,
        900882432,  900886528,  248718350,  892649840,  248718350,
        892649844,  248718350,  892649848,  248718350,  248718017,
       1928314880,  248718017,  954650624, 1067204608, 1067130880],
      dtype=int64)

In [9]:
# Encode the raw GPA sample with the loaded pipeline.
# Note: the output shown has 29 entries for the 30 input values.
processed_train_gpa_sample = pipeline_gpa.transform(train_original_gpa_sample)
processed_train_gpa_sample

array([    0, 55123,     0,     0,     0, 31648, 51125,     0,  5858,
           1,  6541,     1,     0,     1, 27651,     1,     0,     9,
           6,     7,     8,     5,    10,    11,     0,     0,     0,
       30284,    58])

In [10]:
# Delta between consecutive samples, computed as x[i] - x[i+1] (current minus next),
# which yields one value fewer than the input.
train_gpa_sample = train_original_gpa_sample[:-1] - train_original_gpa_sample[1:]
train_gpa_sample

array([ -755208065,   100450304, -1007706112,  1654931456,  -712204288,
         -95584256,   152895488,  -153055232,    11911168,       -4096,
         -10645504,       -4096,    95997952,       -4096,    69144576,
             -4096,   652168178,  -643931490,   643931490,  -643931494,
         643931494,  -643931498,   643931498,         333, -1679596863,
        1679596863,  -705932607,  -112553984,       73728], dtype=int64)

In [11]:
# Map the encoded indices back to delta values.
# Note: predictions that cannot be inverted are returned as -1.
train_gpa_sample_inverse_transformed = pipeline_gpa.inverse_transform(processed_train_gpa_sample)
train_gpa_sample_inverse_transformed

array([        -1,  100450304,         -1,         -1,         -1,
        -95584256,  152895488,         -1,   11911168,      -4096,
        -10645504,      -4096,         -1,      -4096,   69144576,
            -4096,         -1, -643931490,  643931490, -643931494,
        643931494, -643931498,  643931498,        333,         -1,
               -1,         -1, -112553984,      73728], dtype=int64)

In [12]:
# Position 3, shown as a tuple of:
# (encoded index, original delta, value restored by inverse_transform)
processed_train_gpa_sample[3], train_gpa_sample[3], train_gpa_sample_inverse_transformed[3]

(0, 1654931456, -1)

In [13]:
# Look up the vocabulary index assigned to the delta value -94208.
vocabulary_gpa[-94208]

55

In [14]:
# Position 21, shown as a tuple of:
# (encoded index, original delta, value restored by inverse_transform)
processed_train_gpa_sample[21], train_gpa_sample[21], train_gpa_sample_inverse_transformed[21]

(5, -643931498, -643931498)

In [15]:
# Vocabulary index of -1, the value inverse_transform returns for irreversible predictions.
vocabulary_gpa[-1]

0

## Interpretation
This indicates that 1654931456 is pruned and substituted with -1.  
Also, -1 is integer-indexed by 0, which indicates that -1 is the most frequent vocabulary entry (indices begin at 0).

## Load Vocabulary and Instantiate as dictionary : RIP

In [16]:
# Read the RIP vocabulary file into a uint64 array, using the newline as the field delimiter.
vocabulary_rip = np.genfromtxt("static/vocabulary_rip.csv", delimiter="\n", dtype=np.uint64)
vocabulary_rip

array([                   0, 18446744073709551615, 18446744073709551579,
       ...,      140268567846048, 18446744073703864254,
             94783653750601], dtype=uint64)

In [17]:
# Rebind the name to a lookup table: vocabulary value -> its position in the loaded array.
vocabulary_rip = {value: position for position, value in enumerate(vocabulary_rip)}

In [18]:
# Preview the first ten value -> index pairs of the RIP vocabulary.
rip_preview = list(vocabulary_rip.items())[:10]
dict(rip_preview)

{0: 0,
 18446744073709551615: 1,
 18446744073709551579: 2,
 37: 3,
 18446744073709551612: 4,
 4: 5,
 18446744073709551587: 6,
 29: 7,
 18446650193870300916: 8,
 93879839250700: 9}

## Demo : RIP

In [19]:
# First 30 raw 'rip' values, cast to uint64 by way of float64.
# NOTE(review): float64 has a 53-bit mantissa, so values this close to 2**64 are
# rounded -- every large value in the output below is a multiple of 2048 -- and the
# low bits of the original values are lost before the pipeline sees them. The RIP
# vocabulary contains small exact deltas (e.g. 37, 4, 29), so this cast may not match
# how the pipeline was fitted. TODO confirm, and parse the strings as integers
# directly if exact values are required.
train_original_rip_sample = train_set_original['rip'].values[:30].astype(np.float64).astype(np.uint64)
train_original_rip_sample

array([18446744072449302528, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072452044800,
       18446744072452044800, 18446744072452044800, 18446744072446224384,
       18446744072446224384, 18446744072446224384, 18446744072446224384,
       18446744072446224384, 18446744072446224384, 18446744072446224384,
       18446744072446222336, 18446744072446222336, 18446744072446222336,
       18446744072452044800, 18446744072452044800,       94142993216960],
      dtype=uint64)

In [20]:
# Encode the raw RIP sample with the loaded pipeline.
# Note: the output shown has 29 entries for the 30 input values.
processed_train_rip_sample = pipeline_rip.transform(train_original_rip_sample)
processed_train_rip_sample

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1])

In [21]:
# Delta between consecutive samples, computed as x[i] - x[i+1] (current minus next).
# The operands are uint64, so a negative difference wraps around modulo 2**64
# (hence the very large values in the output).
train_rip_sample = train_original_rip_sample[:-1] - train_original_rip_sample[1:]
train_rip_sample

array([18446744073706809344,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0,              5820416,                    0,
                          0,                    0,                    0,
                          0,                    0,                 2048,
                          0,                    0, 18446744073703729152,
                          0, 18446649929458827840], dtype=uint64)

In [22]:
# Map the encoded indices back to delta values.
# Note that Irreversible predictions are returned as -1.
restored_rip_raw = pipeline_rip.inverse_transform(processed_train_rip_sample)
# The restored elements are decoded to text first, then the text array is cast to uint64.
restored_rip_text = np.array([item.decode() for item in restored_rip_raw])
train_rip_sample_inverse_transformed = restored_rip_text.astype(np.uint64)
train_rip_sample_inverse_transformed

array([18446744073709551615,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0,                    0,                    0,
                          0, 18446744073709551615,                    0,
                          0,                    0,                    0,
                          0,                    0, 18446744073709551615,
                          0,                    0, 18446744073709551615,
                          0, 18446744073709551615], dtype=uint64)

In [23]:
# Position 3, shown as a tuple of:
# (encoded index, original delta, value restored by inverse_transform)
processed_train_rip_sample[3], train_rip_sample[3], train_rip_sample_inverse_transformed[3]

(0, 0, 0)

In [24]:
# Vocabulary index assigned to a delta of 0.
vocabulary_rip[0]

0

In [25]:
# Position 21, shown as a tuple of:
# (encoded index, original delta, value restored by inverse_transform)
processed_train_rip_sample[21], train_rip_sample[21], train_rip_sample_inverse_transformed[21]

(0, 0, 0)

In [26]:
# This delta is not a key of the RIP vocabulary, so a plain lookup raises KeyError.
# The error is caught and displayed instead of left unhandled, so the notebook
# still runs to completion under Restart Kernel -> Run All.
try:
    print(vocabulary_rip[18446744073709551581])
except KeyError as missing_key:
    print("KeyError: {}".format(missing_key))

KeyError: 18446744073709551581