In [1]:
import numpy as np
import pandas as pd
import json
import dill         # 0.3.2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [2]:
model_name = "GEMM_STREAM"

## Set Thresholds

In [3]:
gpa_threshold = 48
rip_threshold = 3

## Load Dataset/Static Param List

In [4]:
original_dataset = pd.concat([pd.read_csv("../로그 데이터/GEMM_STREAM/GEMM_STREAM/gemm_stream_generic_generic2_{}.csv".format(i),
dtype=np.object) for i in range(1, 10)], axis=0)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[861367.505406],05:46:34:647695,PF,988827648,18446744072452043863,4034
1,[861367.505455],05:46:34:647744,PF,705122304,18446744072452043863,4034
2,[861367.505459],05:46:34:647748,PF,705122304,18446744072452043863,4034
3,[861367.505464],05:46:34:647754,PF,1227321344,18446744072452043863,4034
4,[861367.505468],05:46:34:647757,PF,1227321344,18446744072452043863,4034
...,...,...,...,...,...,...
407242,[957446.819051],08:27:53:959834,PF,20453990400,18446744072506569815,17926
407243,[957446.865342],08:27:54:006124,PF,20453994496,18446744072506569815,17926
407244,[957446.865354],08:27:54:006138,PF,20453998592,18446744072506569815,17926
407245,[957446.865359],08:27:54:006143,PF,20454002688,18446744072506569815,17926


In [5]:
original_dataset = original_dataset[["gpa", "rip"]].dropna()       # rip for PCs
original_dataset

Unnamed: 0,gpa,rip
0,988827648,18446744072452043863
1,705122304,18446744072452043863
2,705122304,18446744072452043863
3,1227321344,18446744072452043863
4,1227321344,18446744072452043863
...,...,...
407242,20453990400,18446744072506569815
407243,20453994496,18446744072506569815
407244,20453998592,18446744072506569815
407245,20454002688,18446744072506569815


## Dataset Processing Functions

In [6]:
class CalculateDelta(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X_transformed = X[:-1] - X[1:]
        # In case of unsigned types, change its type to string type
        if X_transformed.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            X_transformed = X_transformed.astype(np.string_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        return X

In [7]:
class NoiseTokenizer(TransformerMixin):
    def __init__(self, minimum_category_occurence=2, oov_token=-1):        
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token
    
    def fit(self, X, y=None):
        if X.dtype == np.string_:
            self.oov_token = str(self.oov_token)

        mask = (pd.Series(X).value_counts() <= self.minimum_category_occurence)
        noise_index = np.where(np.isin(X, mask.index[mask == True]))[0]
    
        X[noise_index] = self.oov_token
        return self
    
    def transform(self, X, y=None):
        return X

    def inverse_transform(self, X, y=None):
        return X

In [8]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index
        
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:  # X.dtype == |S{0~24}
            # As np.string_ type is byte type, not str(), need to be decoded.
            self.vocabulary = {X_counts.index[i].decode():i for i in range(self.vocab_size)}
        else:
            self.vocabulary = {X_counts.index[i]:i for i in range(self.vocab_size)}
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:
            self.oov_token = str(self.oov_token)
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i].decode()])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])
        else:
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i]])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        return np.array([self.word_index[X[i]] for i in range(len(X))])

## Process Train/Validation Dataset

In [9]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [10]:
train_val_gpa = train_val_set["gpa"].values.astype(np.int64)
train_val_rip = train_val_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

test_gpa = test_set["gpa"].values.astype(np.int64)
test_rip = test_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

In [11]:
gpa_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer(minimum_category_occurence=gpa_threshold)),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

rip_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer(minimum_category_occurence=rip_threshold)),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [12]:
processed_train_val_gpa = gpa_train_pipeline.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([1, 2, 1, ..., 1, 2, 1])

In [13]:
processed_train_val_rip = rip_train_pipeline.fit_transform(train_val_rip.copy())
processed_train_val_rip

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
pd.Series(processed_train_val_gpa).value_counts(), pd.Series(processed_train_val_rip).value_counts()

(0       2279684
 1       2099861
 2       1414252
 3        169636
 4         72617
          ...   
 3267         49
 3264         49
 3265         49
 3262         49
 3324         49
 Length: 3325, dtype: int64,
 0       5424746
 1        241034
 2        131801
 3        114156
 4         85491
          ...   
 7239          4
 6416          4
 7680          4
 6417          4
 7723          4
 Length: 7823, dtype: int64)

In [15]:
pd.Series(processed_train_val_gpa).value_counts().value_counts()[:10], pd.Series(processed_train_val_rip).value_counts().value_counts()[:10]

(49    84
 52    70
 51    68
 50    66
 55    65
 57    62
 54    62
 59    54
 61    53
 53    50
 dtype: int64,
 4     1480
 5      983
 6      699
 7      487
 8      374
 9      310
 10     253
 11     194
 12     180
 13     173
 dtype: int64)

In [16]:
processed_train_gpa, processed_val_gpa = train_test_split(processed_train_val_gpa, test_size=0.2, shuffle=False)
processed_train_rip, processed_val_rip = train_test_split(processed_train_val_rip, test_size=0.2, shuffle=False)

In [17]:
processed_train_gpa.shape, processed_val_gpa.shape, processed_train_rip.shape, processed_val_rip.shape  # check 

((6209488,), (1552373,), (6209488,), (1552373,))

In [18]:
train_gpa = train_val_gpa[:processed_train_gpa.shape[0]+1]
val_gpa = train_val_gpa[processed_train_gpa.shape[0]:]

train_rip = train_val_rip[:processed_train_rip.shape[0]+1]
val_rip = train_val_rip[processed_train_rip.shape[0]:]

In [19]:
train_gpa.shape, val_gpa.shape, train_rip.shape, val_rip.shape

((6209489,), (1552374,), (6209489,), (1552374,))

## Process Test Dataset

In [20]:
processed_test_gpa = gpa_train_pipeline.transform(test_gpa)
processed_test_gpa

array([1, 2, 1, ..., 0, 0, 0])

In [21]:
processed_test_rip = rip_train_pipeline.transform(test_rip)
processed_test_rip

array([  0,   0, 123, ...,   0,   0,   0])

In [22]:
dict(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:10])

{'0': 0,
 '18446744073709551604': 1,
 '18446744073709551605': 2,
 '35': 3,
 '12': 4,
 '-1': 5,
 '18446744073709551581': 6,
 '11': 7,
 '23': 8,
 '18446744073709551593': 9}

## Concat GPA and RIP

In [23]:
train_set = np.c_[train_gpa, train_rip]
val_set = np.c_[val_gpa, val_rip]
test_set = np.c_[test_gpa, test_rip]

processed_train_set = np.c_[processed_train_gpa, processed_train_rip]
processed_val_set = np.c_[processed_val_gpa, processed_val_rip]
processed_test_set = np.c_[processed_test_gpa, processed_test_rip]
train_set, val_set, test_set

(array([[9.88827648e+08, 1.84467441e+19],
        [7.05122304e+08, 1.84467441e+19],
        [7.05122304e+08, 1.84467441e+19],
        ...,
        [1.17589443e+10, 1.84467441e+19],
        [1.17589443e+10, 1.84467441e+19],
        [1.17589484e+10, 1.84467441e+19]]),
 array([[1.17589484e+10, 1.84467441e+19],
        [1.17589484e+10, 1.84467441e+19],
        [1.17589524e+10, 1.84467441e+19],
        ...,
        [1.69907814e+09, 1.84467441e+19],
        [1.69907814e+09, 1.84467441e+19],
        [1.84251187e+09, 1.84467441e+19]]),
 array([[1.84251187e+09, 1.84467441e+19],
        [1.74567834e+09, 1.84467441e+19],
        [1.74567834e+09, 1.84467441e+19],
        ...,
        [2.04539986e+10, 1.84467441e+19],
        [2.04540027e+10, 1.84467441e+19],
        [2.04540068e+10, 1.84467441e+19]]))

In [24]:
train_set = pd.DataFrame(train_set, columns=["gpa", "rip"], index=None)
val_set = pd.DataFrame(val_set, columns=["gpa", "rip"], index=None)
test_set = pd.DataFrame(test_set, columns=["gpa", "rip"], index=None)

processed_train_set = pd.DataFrame(processed_train_set, columns=["gpa", "rip"], index=None)
processed_val_set = pd.DataFrame(processed_val_set, columns=["gpa", "rip"], index=None)
processed_test_set = pd.DataFrame(processed_test_set, columns=["gpa", "rip"], index=None)
train_set

Unnamed: 0,gpa,rip
0,9.888276e+08,1.844674e+19
1,7.051223e+08,1.844674e+19
2,7.051223e+08,1.844674e+19
3,1.227321e+09,1.844674e+19
4,1.227321e+09,1.844674e+19
...,...,...
6209484,1.175894e+10,1.844674e+19
6209485,1.175894e+10,1.844674e+19
6209486,1.175894e+10,1.844674e+19
6209487,1.175894e+10,1.844674e+19


## Save Processed Datasets

In [25]:
# Original 
train_set.to_csv("data/{}_train_set_original.csv".format(model_name), index=None)
val_set.to_csv("data/{}_val_set_original.csv".format(model_name), index=None)
test_set.to_csv("data/{}_test_set_original.csv".format(model_name), index=None)

# Processed 
processed_train_set.to_csv("data/{}_train_set.csv".format(model_name), index=None)
processed_val_set.to_csv("data/{}_val_set.csv".format(model_name), index=None)
processed_test_set.to_csv("data/{}_test_set.csv".format(model_name), index=None)

## Save Pipeline/Statics

In [26]:
with open("static/pipeline_gpa.pkl", 'wb') as f:
    dill.dump(gpa_train_pipeline, f)

with open("static/pipeline_rip.pkl", 'wb') as f:
    dill.dump(rip_train_pipeline, f)

In [27]:
np.savetxt("static/vocabulary_gpa.csv", np.array(list(gpa_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%d", delimiter="\n")

np.savetxt("static/vocabulary_rip.csv", np.array(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%s", delimiter="\n")

In [28]:
dict(list(gpa_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{-4096: 0,
 -1: 1,
 0: 2,
 -1638400: 3,
 -1896448: 4,
 -8192: 5,
 -4060: 6,
 6750208: 7,
 -12288: 8,
 333: 9,
 -864256: 10,
 -16384: 11,
 -430892: 12,
 -172844: 13,
 6492160: 14,
 -1380352: 15,
 -4972240234: 16,
 -4972240230: 17,
 4972240230: 18,
 4972240226: 19}

In [29]:
dict(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{'0': 0,
 '18446744073709551604': 1,
 '18446744073709551605': 2,
 '35': 3,
 '12': 4,
 '-1': 5,
 '18446744073709551581': 6,
 '11': 7,
 '23': 8,
 '18446744073709551593': 9,
 '18446744073709551592': 10,
 '24': 11,
 '54525952': 12,
 '18446744073655025664': 13,
 '18446744073709551555': 14,
 '18446744073709551545': 15,
 '18446744073709551551': 16,
 '18446744073709551548': 17,
 '18446744073709551544': 18,
 '18446744073709551558': 19}

In [30]:
gpa_train_pipeline["sparse_category_encoder"].vocab_size, rip_train_pipeline["sparse_category_encoder"].vocab_size

(3325, 7823)