In [1]:
import numpy as np
import pandas as pd
import json
import dill         # 0.3.2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [2]:
model_name = "GEMM_EX"

## Load Dataset/Static Param List

In [3]:
original_dataset = pd.concat([pd.read_csv("../로그 데이터/GEMM/gem_3214_generic_{}.csv".format(i), dtype=np.object) for i in reversed(range(1, 11))], axis=0)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[601757.292075],06:00:51:525554,PF,14190604288,15410752,3214
1,[601757.297013],06:00:51:530492,PF,14352732392,140316942991300,3214
2,[601757.337978],06:00:51:571456,PF,14121463808,15410776,3214
3,[601757.368684],06:00:51:602162,PF,14120734720,15410764,3214
4,[601757.376568],06:00:51:610048,PF,14120828928,15410764,3214
...,...,...,...,...,...,...
400591,[ 1946.373117],07:24:00:468967,GPA,8620549134,18446744071888380942,3214
400592,[ 1946.373123],07:24:00:468972,GPA,22541025656,18446744071888380942,3214
400593,[ 1946.373157],07:24:00:469006,GPA,8620549134,18446744071888380942,3214
400594,[ 1946.373422],07:24:00:469266,GPA,8620548801,18446744071888380609,3214


In [4]:
original_dataset = original_dataset[["gpa", "rip"]].dropna()       # rip for PCs
original_dataset

Unnamed: 0,gpa,rip
0,14190604288,15410752
1,14352732392,140316942991300
2,14121463808,15410776
3,14120734720,15410764
4,14120828928,15410764
...,...,...
400591,8620549134,18446744071888380942
400592,22541025656,18446744071888380942
400593,8620549134,18446744071888380942
400594,8620548801,18446744071888380609


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X_transformed = X[:-1] - X[1:]
        # In case of unsigned types, change its type to string type
        if X_transformed.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            X_transformed = X_transformed.astype(np.string_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        return X

In [6]:
class NoiseTokenizer(TransformerMixin):
    def __init__(self, minimum_category_occurence=2, oov_token=-1):        
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token
    
    def fit(self, X, y=None):
        if X.dtype == np.string_:
            self.oov_token = str(self.oov_token)

        mask = (pd.Series(X).value_counts() <= self.minimum_category_occurence)
        noise_index = np.where(np.isin(X, mask.index[mask == True]))[0]
    
        X[noise_index] = self.oov_token
        return self
    
    def transform(self, X, y=None):
        return X

    def inverse_transform(self, X, y=None):
        return X

In [7]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index
        
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:  # X.dtype == |S{0~24}
            # As np.string_ type is byte type, not str(), need to be decoded.
            self.vocabulary = {X_counts.index[i].decode():i for i in range(self.vocab_size)}
        else:
            self.vocabulary = {X_counts.index[i]:i for i in range(self.vocab_size)}
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        if X.dtype in [np.dtype("S" + str(i)) for i in range(24)]:
            self.oov_token = str(self.oov_token)
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i].decode()])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])
        else:
            for i in range(len(X)):
                if X[i] in self.word_index:
                    X_transformed.append(self.vocabulary[X[i]])
                else:
                    X_transformed.append(self.vocabulary[self.oov_token])

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        return np.array([self.word_index[X[i]] for i in range(len(X))])

## Process Train/Validation Dataset

In [8]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [9]:
train_val_gpa = train_val_set["gpa"].values.astype(np.int64)
train_val_rip = train_val_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

test_gpa = test_set["gpa"].values.astype(np.int64)
test_rip = test_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

In [10]:
GEMM_EX_gpa_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

GEMM_EX_rip_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [11]:
processed_train_val_gpa = GEMM_EX_gpa_train_pipeline.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([  1,   1, 547, ...,   0,   0,   0])

In [12]:
processed_train_val_rip = GEMM_EX_rip_train_pipeline.fit_transform(train_val_rip)
processed_train_val_rip

array([13, 13,  4, ...,  0,  0,  0])

In [13]:
pd.Series(processed_train_val_gpa).value_counts(), pd.Series(processed_train_val_rip).value_counts()

(0        538742
 1        239488
 2        177665
 3         84171
 4         37248
           ...  
 25840         3
 30315         3
 24174         3
 29288         3
 29173         3
 Length: 33476, dtype: int64,
 0       1189566
 1         99667
 2         54664
 3         48114
 4         34376
          ...   
 1559          3
 1763          3
 1671          3
 1762          3
 1615          3
 Length: 1883, dtype: int64)

In [14]:
processed_train_gpa, processed_val_gpa = train_test_split(processed_train_val_gpa, test_size=0.2, shuffle=False)
processed_train_rip, processed_val_rip = train_test_split(processed_train_val_rip, test_size=0.2, shuffle=False)

In [15]:
processed_train_gpa.shape, processed_val_gpa.shape, processed_train_rip.shape, processed_val_rip.shape  # check 

((1586349,), (396588,), (1586349,), (396588,))

In [16]:
train_gpa = train_val_gpa[:processed_train_gpa.shape[0]+1]
val_gpa = train_val_gpa[processed_train_gpa.shape[0]:]

train_rip = train_val_rip[:processed_train_rip.shape[0]+1]
val_rip = train_val_rip[processed_train_rip.shape[0]:]

In [17]:
train_gpa.shape, val_gpa.shape, train_rip.shape, val_rip.shape

((1586350,), (396589,), (1586350,), (396589,))

## Process Test Dataset

In [18]:
processed_test_gpa = GEMM_EX_gpa_train_pipeline.transform(test_gpa)
processed_test_gpa

array([ 0,  1,  1, ..., 13, 14,  1])

In [19]:
processed_test_rip = GEMM_EX_rip_train_pipeline.transform(test_rip)
processed_test_rip

array([ 0,  0,  0, ...,  0, 18,  0])

In [20]:
dict(list(GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:10])

{'0': 0,
 '18446744073709551604': 1,
 '18446744073709551605': 2,
 '35': 3,
 '12': 4,
 '11': 5,
 '18446744073709551592': 6,
 '23': 7,
 '24': 8,
 '18446744073709551593': 9}

## Concat GPA and RIP

In [21]:
train_set = np.c_[train_gpa, train_rip]
val_set = np.c_[val_gpa, val_rip]
test_set = np.c_[test_gpa, test_rip]

processed_train_set = np.c_[processed_train_gpa, processed_train_rip]
processed_val_set = np.c_[processed_val_gpa, processed_val_rip]
processed_test_set = np.c_[processed_test_gpa, processed_test_rip]
train_set, val_set, test_set

(array([[1.41906043e+10, 1.54107520e+07],
        [1.43527324e+10, 1.40316943e+14],
        [1.41214638e+10, 1.54107760e+07],
        ...,
        [1.29835213e+10, 2.04717590e+07],
        [1.43266939e+10, 2.04717860e+07],
        [1.43266939e+10, 2.04718200e+07]]),
 array([[1.43266939e+10, 2.04718200e+07],
        [1.43266939e+10, 2.04718200e+07],
        [1.36975770e+10, 2.04717480e+07],
        ...,
        [2.06899282e+10, 1.84467441e+19],
        [2.06899323e+10, 1.84467441e+19],
        [2.06899364e+10, 1.84467441e+19]]),
 array([[2.06899405e+10, 1.84467441e+19],
        [2.06899446e+10, 1.84467441e+19],
        [2.08139059e+10, 1.84467441e+19],
        ...,
        [8.62054913e+09, 1.84467441e+19],
        [8.62054880e+09, 1.84467441e+19],
        [2.26381455e+10, 1.84467441e+19]]))

In [22]:
train_set = pd.DataFrame(train_set, columns=["gpa", "rip"], index=None)
val_set = pd.DataFrame(val_set, columns=["gpa", "rip"], index=None)
test_set = pd.DataFrame(test_set, columns=["gpa", "rip"], index=None)

processed_train_set = pd.DataFrame(processed_train_set, columns=["gpa", "rip"], index=None)
processed_val_set = pd.DataFrame(processed_val_set, columns=["gpa", "rip"], index=None)
processed_test_set = pd.DataFrame(processed_test_set, columns=["gpa", "rip"], index=None)
train_set

Unnamed: 0,gpa,rip
0,1.419060e+10,1.541075e+07
1,1.435273e+10,1.403169e+14
2,1.412146e+10,1.541078e+07
3,1.412073e+10,1.541076e+07
4,1.412083e+10,1.541076e+07
...,...,...
1586345,1.298352e+10,2.047176e+07
1586346,1.298350e+10,2.047176e+07
1586347,1.298352e+10,2.047176e+07
1586348,1.432669e+10,2.047179e+07


## Save Processed Datasets

In [23]:
# Original 
train_set.to_csv("data/{}_train_set_original.csv".format(model_name), index=None)
val_set.to_csv("data/{}_val_set_original.csv".format(model_name), index=None)
test_set.to_csv("data/{}_test_set_original.csv".format(model_name), index=None)

# Processed 
processed_train_set.to_csv("data/{}_train_set.csv".format(model_name), index=None)
processed_val_set.to_csv("data/{}_val_set.csv".format(model_name), index=None)
processed_test_set.to_csv("data/{}_test_set.csv".format(model_name), index=None)

## Save Pipeline/Statics

In [24]:
with open("static/pipeline_gpa.pkl", 'wb') as f:
    dill.dump(GEMM_EX_gpa_train_pipeline, f)

with open("static/pipeline_rip.pkl", 'wb') as f:
    dill.dump(GEMM_EX_rip_train_pipeline, f)

In [25]:
np.savetxt("static/vocabulary_gpa.csv", np.array(list(GEMM_EX_gpa_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%d", delimiter="\n")

np.savetxt("static/vocabulary_rip.csv", np.array(list(GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%s", delimiter="\n")

In [26]:
dict(list(GEMM_EX_gpa_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{-4096: 0,
 -1: 1,
 0: 2,
 -1638400: 3,
 -1896448: 4,
 -864256: 5,
 -8192: 6,
 -4060: 7,
 -13920476522: 8,
 13920476518: 9,
 -13920476518: 10,
 -13920476514: 11,
 13920476514: 12,
 13920476522: 13,
 333: 14,
 -430892: 15,
 -159744: 16,
 172032: 17,
 -131072: 18,
 -180224: 19}

In [27]:
dict(list(GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{'0': 0,
 '18446744073709551604': 1,
 '18446744073709551605': 2,
 '35': 3,
 '12': 4,
 '11': 5,
 '18446744073709551592': 6,
 '23': 7,
 '24': 8,
 '18446744073709551593': 9,
 '18446744073709551581': 10,
 '18446744073709551574': 11,
 '18446744073709551578': 12,
 '-1': 13,
 '18446744073709551573': 14,
 '18446744073709551392': 15,
 '18446744073709551506': 16,
 '18446744073709551582': 17,
 '333': 18,
 '18446744073709551585': 19}

In [28]:
GEMM_EX_gpa_train_pipeline["sparse_category_encoder"].vocab_size, GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocab_size

(33476, 1883)