In [1]:
import numpy as np
import pandas as pd
import json
import dill         # 0.3.2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [2]:
model_name = "GEMM_EX"

## Load Dataset/Static Param List

In [3]:
original_dataset = pd.concat([pd.read_csv("../로그 데이터/GEMM/gem_3214_generic_{}.csv".format(i), dtype=np.object) for i in reversed(range(1, 11))], axis=0)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[601757.292075],06:00:51:525554,PF,14190604288,15410752,3214
1,[601757.297013],06:00:51:530492,PF,14352732392,140316942991300,3214
2,[601757.337978],06:00:51:571456,PF,14121463808,15410776,3214
3,[601757.368684],06:00:51:602162,PF,14120734720,15410764,3214
4,[601757.376568],06:00:51:610048,PF,14120828928,15410764,3214
...,...,...,...,...,...,...
400591,[ 1946.373117],07:24:00:468967,GPA,8620549134,18446744071888380942,3214
400592,[ 1946.373123],07:24:00:468972,GPA,22541025656,18446744071888380942,3214
400593,[ 1946.373157],07:24:00:469006,GPA,8620549134,18446744071888380942,3214
400594,[ 1946.373422],07:24:00:469266,GPA,8620548801,18446744071888380609,3214


In [4]:
original_dataset = original_dataset[["gpa", "rip"]].dropna()       # rip for PCs
original_dataset

Unnamed: 0,gpa,rip
0,14190604288,15410752
1,14352732392,140316942991300
2,14121463808,15410776
3,14120734720,15410764
4,14120828928,15410764
...,...,...
400591,8620549134,18446744071888380942
400592,22541025656,18446744071888380942
400593,8620549134,18446744071888380942
400594,8620548801,18446744071888380609


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return X[:-1] - X[1:]

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        return X

In [6]:
class NoiseTokenizer(TransformerMixin):
    def __init__(self, minimum_category_occurence=2, oov_token=-1):        
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token
    
    def fit(self, X, y=None):
        ## NOTE : If X is unsigned type, change the value of {oov_token} to maximum value of its type.
        self.dtype = X.dtype
        if self.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            self.oov_token = str(self.oov_token)
            X.astype(np.string_)

        mask = (pd.Series(X).value_counts() <= self.minimum_category_occurence)
        noise_index = np.where(np.isin(X, mask.index[mask == True]))[0]
    
        X[noise_index] = self.oov_token    ## NOTE : Due to Unknown Bug, move this part to transform()

        return self
    
    def transform(self, X, y=None):
        if X.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            X.astype(np.string_)
        return X

    def inverse_transform(self, X, y=None):
        return X

In [7]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index
        if self.word_index.dtype == np.string_:     # As np.string_ type is byte type, not str(), need to be decoded.
            self.vocabulary = {X_counts.index[i].decode():i for i in range(self.vocab_size)}
            self.oov_token = str(self.oov_token)
        else:
            self.vocabulary = {X_counts.index[i]:i for i in range(self.vocab_size)}
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for i in range(len(X)):
            if X[i] in self.word_index:
                X_transformed.append(self.vocabulary[X[i]])
            else:
                X_transformed.append(self.vocabulary[self.oov_token])

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        return np.array([self.word_index[X[i]] for i in range(len(X))])

## Process Train/Validation Dataset

In [8]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [9]:
train_val_gpa = train_val_set["gpa"].values.astype(np.float64)
train_val_rip = train_val_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

test_gpa = test_set["gpa"].values.astype(np.float64)
test_rip = test_set["rip"].values.astype(np.uint64)      # As uint64 not exists in pandas

In [10]:
GEMM_EX_gpa_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

GEMM_EX_rip_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [11]:
processed_train_val_gpa = GEMM_EX_gpa_train_pipeline.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([  1,   1, 545, ...,   0,   0,   0])

In [12]:
processed_train_val_rip = GEMM_EX_rip_train_pipeline.fit_transform(train_val_rip)
processed_train_val_rip

array([13, 13,  4, ...,  0,  0,  0])

In [13]:
pd.Series(processed_train_val_rip).value_counts()

0       1189566
1         99667
2         54664
3         48114
4         34376
         ...   
1559          3
1763          3
1671          3
1762          3
1615          3
Length: 1883, dtype: int64

In [14]:
processed_train_gpa, processed_val_gpa = train_test_split(processed_train_val_gpa, test_size=0.2, shuffle=False)
processed_train_rip, processed_val_rip = train_test_split(processed_train_val_rip, test_size=0.2, shuffle=False)

In [15]:
processed_train_gpa.shape, processed_val_gpa.shape, processed_train_rip.shape, processed_val_rip.shape  # check 

((1586349,), (396588,), (1586349,), (396588,))

In [16]:
train_gpa = train_val_gpa[:processed_train_gpa.shape[0]+1]
val_gpa = train_val_gpa[processed_train_gpa.shape[0]:]

train_rip = train_val_rip[:processed_train_rip.shape[0]+1]
val_rip = train_val_rip[processed_train_rip.shape[0]:]

In [17]:
train_gpa.shape, val_gpa.shape, train_rip.shape, val_rip.shape

((1586350,), (396589,), (1586350,), (396589,))

## Process Test Dataset

In [18]:
processed_test_gpa = GEMM_EX_gpa_train_pipeline.transform(test_gpa)
processed_test_gpa

array([ 0,  1,  1, ..., 13, 14,  1])

In [19]:
processed_test_rip = GEMM_EX_rip_train_pipeline.transform(test_rip)
processed_test_rip

KeyError: -1

In [25]:
dict(list(GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocabulary.items())[:10])

{0: 0,
 18446744073709551604: 1,
 18446744073709551605: 2,
 35: 3,
 12: 4,
 11: 5,
 18446744073709551592: 6,
 23: 7,
 24: 8,
 18446744073709551593: 9}

In [37]:
a = np.array([str(-1)])
a.astype(np.string_)
type(a[0])

numpy.str_

In [38]:
GEMM_EX_rip_train_pipeline["sparse_category_encoder"].vocabulary[a[0]]

KeyError: '-1'

In [None]:
pd.Series(processed_test_rip).value_counts()

## Check Dataset/Processed Dataset Shape  
Note that original train/val set have size of {processed train/val set + 1}, due to Delta calculation.

In [13]:
train_set.shape, val_set.shape, processed_train_set.shape, processed_val_set.shape

((161292,), (40324,), (161291,), (40323,))

In [14]:
test_set.shape, processed_test_set.shape

((35580,), (35579,))

## Save Processed Datasets

In [15]:
# Original 
np.savetxt("data/{}_train_set_original.csv".format(model_name), train_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_val_set_original.csv".format(model_name), val_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_test_set_original.csv".format(model_name), test_set, fmt="%d", delimiter="\n")

# Processed 
np.savetxt("data/{}_train_set.csv".format(model_name), processed_train_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_val_set.csv".format(model_name), processed_val_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_test_set.csv".format(model_name), processed_test_set, fmt="%d", delimiter="\n")

## Save Pipeline/Statics

In [20]:
with open("static/pipeline.pkl", 'wb') as f:
    dill.dump(SEG_train_pipeline, f)

In [17]:
np.savetxt("static/vocabulary.csv", np.array(list(SEG_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%d", delimiter="\n")

In [18]:
dict(list(SEG_train_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{-1: 0,
 0: 1,
 -4096: 2,
 -909517620: 3,
 909517620: 4,
 -8192: 5,
 8: 6,
 4096: 7,
 -8: 8,
 -12288: 9,
 -2416: 10,
 -16384: 11,
 -24: 12,
 -3520: 13,
 12: 14,
 2744: 15,
 -6: 16,
 -64: 17,
 -32: 18,
 -20480: 19}

In [19]:
SEG_train_pipeline["sparse_category_encoder"].vocab_size

16293