In [1]:
import numpy as np
import pandas as pd
import json
import dill         # 0.3.2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [2]:
model_name = "STREAM"

## Load Dataset/Static Param List

In [3]:
original_dataset = pd.concat([pd.read_csv("../로그 데이터/STREAM/STREAM/stream_4034_generic2_{}.csv".format(i), dtype=np.object) for i in reversed(range(1, 11))], axis=0)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[336633.698810],04:01:00:854206,PF,251797631,18446744072449302655,4034
1,[336633.739463],04:01:00:894857,PF,1007005696,18446744072452043863,4034
2,[336633.867033],04:01:01:022428,PF,906555392,18446744072452043863,4034
3,[336633.901503],04:01:01:056898,PF,1914261504,18446744072452043863,4034
4,[336633.904886],04:01:01:060282,PF,259330048,18446744072452043863,4034
...,...,...,...,...,...,...
180309,[ 1806.225725],07:00:33:381360,PF,602140104,139790387006578,4034
180310,[ 1806.225827],07:00:33:381465,PF,597469140,18446744072443263295,4034
180311,[ 1806.240176],07:00:33:395810,PF,596315344,18446744072442219334,4034
180312,[ 1806.240198],07:00:33:395836,PF,597544452,18446744072441579461,4034


In [4]:
original_dataset = original_dataset[["gpa", "rip"]].dropna()       # rip for PCs
original_dataset

Unnamed: 0,gpa,rip
0,251797631,18446744072449302655
1,1007005696,18446744072452043863
2,906555392,18446744072452043863
3,1914261504,18446744072452043863
4,259330048,18446744072452043863
...,...,...
180309,602140104,139790387006578
180310,597469140,18446744072443263295
180311,596315344,18446744072442219334
180312,597544452,18446744072441579461


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X_transformed = X[:-1] - X[1:]
        # In case of unsigned types, change its type to string type
        if X_transformed.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            X_transformed = X_transformed.astype(np.string_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        return X

In [6]:
class NoiseTokenizer(TransformerMixin):
    def __init__(self, minimum_category_occurence=2, oov_token=-1):        
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token
    
    def fit(self, X, y=None):
        if X.dtype == np.string_:
            self.oov_token = str(self.oov_token)

        mask = (pd.Series(X).value_counts() <= self.minimum_category_occurence)
        noise_index = np.where(np.isin(X, mask.index[mask == True]))[0]
    
        X[noise_index] = self.oov_token
        return self
    
    def transform(self, X, y=None):
        return X

    def inverse_transform(self, X, y=None):
        return X

In [7]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index
        
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:  # X.dtype == |S{0~24}
            # As np.string_ type is byte type, not str(), need to be decoded.
            self.vocabulary = {X_counts.index[i].decode():i for i in range(self.vocab_size)}
        else:
            self.vocabulary = {X_counts.index[i]:i for i in range(self.vocab_size)}
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:
            self.oov_token = str(self.oov_token)
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i].decode()])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])
        else:
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i]])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        return np.array([self.word_index[X[i]] for i in range(len(X))])

## Process Train/Validation Dataset

In [8]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [9]:
train_val_gpa = train_val_set["gpa"].values.astype(np.int64)
train_val_rip = train_val_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

test_gpa = test_set["gpa"].values.astype(np.int64)
test_rip = test_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

In [10]:
gpa_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

rip_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [11]:
processed_train_val_gpa = gpa_train_pipeline.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([     1,  55708, 156914, ...,  15814,   7211,      1])

In [12]:
processed_train_val_rip = rip_train_pipeline.fit_transform(train_val_rip)
processed_train_val_rip

array([3361,    0,    0, ...,    0,    0,    0])

In [13]:
pd.Series(processed_train_val_gpa).value_counts(), pd.Series(processed_train_val_rip).value_counts()

(0         713760
 1         643963
 2         405516
 3          36270
 4          21875
            ...  
 123691         3
 121644         3
 119597         3
 117550         3
 142076         3
 Length: 159264, dtype: int64,
 0       2591789
 1         46918
 2         34063
 3         32504
 4         28812
          ...   
 5785          3
 6777          3
 6778          3
 6779          3
 7051          3
 Length: 7240, dtype: int64)

In [31]:
# Index = Occurence | Column = Number of included categories
pd.Series(processed_train_val_gpa).value_counts().value_counts(), pd.Series(processed_train_val_rip).value_counts().value_counts()

(3         58791
 4         27044
 5         13694
 6          7978
 7          5203
           ...  
 203           1
 229           1
 995           1
 158           1
 643963        1
 Length: 263, dtype: int64,
 3        1890
 4        1120
 5         703
 6         550
 7         439
          ... 
 2331        1
 290         1
 4392        1
 314         1
 10033       1
 Length: 374, dtype: int64)

## gpa : N / P ratio

In [79]:
# n : p ratio at threshold = 2
(processed_train_gpa.shape[0] + processed_val_gpa.shape[0]) / len(pd.Series(processed_train_val_gpa).value_counts())

22.18604329917621

In [72]:
gpa_threshold = 30

In [74]:
# p
len(pd.Series(processed_train_val_gpa).value_counts()) - sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values)

11225

In [75]:
# Upper Limit for trian/val accuracy
sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values * pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].index) / (processed_train_gpa.shape[0] + processed_val_gpa.shape[0])

0.28239295552943056

In [76]:
# n : p ratio at threshold = {threshold}
(processed_train_gpa.shape[0] + processed_val_gpa.shape[0]) / (len(pd.Series(processed_train_val_gpa).value_counts()) - sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values))

314.78289532293985

## rip : N / P ratio

In [80]:
# n : p ratio at threshold = 2
(processed_train_rip.shape[0] + processed_val_rip.shape[0]) / len(pd.Series(processed_train_val_rip).value_counts())

488.0439226519337

In [81]:
rip_threshold = 30

In [None]:
# p
len(pd.Series(processed_train_val_gpa).value_counts()) - sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values)

In [None]:
# Upper Limit for trian/val accuracy
sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values * pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].index) / (processed_train_gpa.shape[0] + processed_val_gpa.shape[0])

In [None]:
# n : p ratio at threshold = {threshold}
(processed_train_gpa.shape[0] + processed_val_gpa.shape[0]) / (len(pd.Series(processed_train_val_gpa).value_counts()) - sum(pd.Series(processed_train_val_gpa).value_counts().value_counts()[:gpa_threshold].values))

## Split Train / Val / Test set

In [15]:
processed_train_gpa, processed_val_gpa = train_test_split(processed_train_val_gpa, test_size=0.2, shuffle=False)
processed_train_rip, processed_val_rip = train_test_split(processed_train_val_rip, test_size=0.2, shuffle=False)

In [16]:
processed_train_gpa.shape, processed_val_gpa.shape, processed_train_rip.shape, processed_val_rip.shape  # check 

((2826750,), (706688,), (2826750,), (706688,))

In [17]:
train_gpa = train_val_gpa[:processed_train_gpa.shape[0]+1]
val_gpa = train_val_gpa[processed_train_gpa.shape[0]:]

train_rip = train_val_rip[:processed_train_rip.shape[0]+1]
val_rip = train_val_rip[processed_train_rip.shape[0]:]

In [18]:
train_gpa.shape, val_gpa.shape, train_rip.shape, val_rip.shape

((2826751,), (706689,), (2826751,), (706689,))

## Process Test Dataset

In [19]:
processed_test_gpa = gpa_train_pipeline.transform(test_gpa)
processed_test_gpa

array([7822, 2983,  771, ...,    1,    1,    1])

In [20]:
processed_test_rip = rip_train_pipeline.transform(test_rip)
processed_test_rip

array([0, 0, 0, ..., 1, 1, 1])

In [21]:
dict(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:10])

{'0': 0,
 '-1': 1,
 '18446744073709551579': 2,
 '37': 3,
 '18446744073709551612': 4,
 '4': 5,
 '18446744073709551587': 6,
 '29': 7,
 '18446650193870300916': 8,
 '93879839250700': 9}

## Concat GPA and RIP

In [22]:
train_set = np.c_[train_gpa, train_rip]
val_set = np.c_[val_gpa, val_rip]
test_set = np.c_[test_gpa, test_rip]

processed_train_set = np.c_[processed_train_gpa, processed_train_rip]
processed_val_set = np.c_[processed_val_gpa, processed_val_rip]
processed_test_set = np.c_[processed_test_gpa, processed_test_rip]
train_set, val_set, test_set

(array([[2.51797631e+08, 1.84467441e+19],
        [1.00700570e+09, 1.84467441e+19],
        [9.06555392e+08, 1.84467441e+19],
        ...,
        [1.47925811e+09, 1.84467441e+19],
        [1.47925811e+09, 1.84467441e+19],
        [1.25628826e+09, 9.41682691e+13]]),
 array([[1.25628826e+09, 9.41682691e+13],
        [1.48969882e+09, 9.41682691e+13],
        [1.48970291e+09, 9.41682691e+13],
        ...,
        [5.23194368e+08, 1.84467441e+19],
        [5.31795968e+08, 1.84467441e+19],
        [2.47767040e+07, 1.84467441e+19]]),
 array([[4.89844736e+08, 1.84467441e+19],
        [5.22874880e+08, 1.84467441e+19],
        [5.19757824e+08, 1.84467441e+19],
        ...,
        [5.96315344e+08, 1.84467441e+19],
        [5.97544452e+08, 1.84467441e+19],
        [6.03197568e+08, 1.39675253e+14]]))

In [23]:
train_set = pd.DataFrame(train_set, columns=["gpa", "rip"], index=None)
val_set = pd.DataFrame(val_set, columns=["gpa", "rip"], index=None)
test_set = pd.DataFrame(test_set, columns=["gpa", "rip"], index=None)

processed_train_set = pd.DataFrame(processed_train_set, columns=["gpa", "rip"], index=None)
processed_val_set = pd.DataFrame(processed_val_set, columns=["gpa", "rip"], index=None)
processed_test_set = pd.DataFrame(processed_test_set, columns=["gpa", "rip"], index=None)
train_set

Unnamed: 0,gpa,rip
0,2.517976e+08,1.844674e+19
1,1.007006e+09,1.844674e+19
2,9.065554e+08,1.844674e+19
3,1.914262e+09,1.844674e+19
4,2.593300e+08,1.844674e+19
...,...,...
2826746,1.495933e+09,9.416827e+13
2826747,1.282355e+09,9.416827e+13
2826748,1.479258e+09,1.844674e+19
2826749,1.479258e+09,1.844674e+19


## Save Processed Datasets

In [24]:
# Original 
train_set.to_csv("data/{}_train_set_original.csv".format(model_name), index=None)
val_set.to_csv("data/{}_val_set_original.csv".format(model_name), index=None)
test_set.to_csv("data/{}_test_set_original.csv".format(model_name), index=None)

# Processed 
processed_train_set.to_csv("data/{}_train_set.csv".format(model_name), index=None)
processed_val_set.to_csv("data/{}_val_set.csv".format(model_name), index=None)
processed_test_set.to_csv("data/{}_test_set.csv".format(model_name), index=None)

## Save Pipeline/Statics

In [25]:
with open("static/pipeline_gpa.pkl", 'wb') as f:
    dill.dump(gpa_train_pipeline, f)

with open("static/pipeline_rip.pkl", 'wb') as f:
    dill.dump(rip_train_pipeline, f)

In [26]:
np.savetxt("static/vocabulary_gpa.csv", np.array(list(gpa_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%d", delimiter="\n")

np.savetxt("static/vocabulary_rip.csv", np.array(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%s", delimiter="\n")

In [27]:
dict(list(gpa_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{-4096: 0,
 -1: 1,
 0: 2,
 -12288: 3,
 -8192: 4,
 -643931498: 5,
 643931490: 6,
 643931494: 7,
 -643931494: 8,
 -643931490: 9,
 643931498: 10,
 333: 11,
 -24576: 12,
 -20480: 13,
 -16384: 14,
 -64: 15,
 -49: 16,
 -36864: 17,
 -28672: 18,
 -3872: 19}

In [28]:
dict(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{'0': 0,
 '-1': 1,
 '18446744073709551579': 2,
 '37': 3,
 '18446744073709551612': 4,
 '4': 5,
 '18446744073709551587': 6,
 '29': 7,
 '18446650193870300916': 8,
 '93879839250700': 9,
 '18446649955753248500': 10,
 '94117956303116': 11,
 '18446649904182912756': 12,
 '18446649249859774196': 13,
 '94169526638860': 14,
 '94823849777420': 15,
 '18446649929458829044': 16,
 '94144250722572': 17,
 '18446649957954295540': 18,
 '94115755256076': 19}

In [29]:
gpa_train_pipeline["sparse_category_encoder"].vocab_size, rip_train_pipeline["sparse_category_encoder"].vocab_size

(159264, 7240)