In [74]:
import numpy as np
import pandas as pd
import json
import dill
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [75]:
model_name = "NU_AR"

In [76]:
os.mkdir("data/")
os.mkdir("static/")

FileExistsError: [WinError 183] 파일이 이미 있으므로 만들 수 없습니다: 'data/'

## Load Dataset/Static Param List

In [77]:
original_dataset = np.genfromtxt("../로그 데이터/NU-MineBench.csv", delimiter="\n", dtype=np.int64)
original_dataset

array([          -1, 105950216192, 105943924736, ..., 103563653120,
       103565225984, 103560867840], dtype=int64)

## Segregate Initial/Main/Terminal Stages

In [78]:
initial_stage_index = 793
terminal_stage_index = 10502

In [79]:
dataset = original_dataset[initial_stage_index:terminal_stage_index]
dataset

array([104287174656, 104289271808, 104282980352, ..., 103984320512,
       103984324608, 103984328704], dtype=int64)

## Dataset Processing Functions

In [80]:
class CalculateDelta(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return np.array([X[i+1] - X[i] for i in range(int(len(X))-1)])

In [81]:
class NoiseTokenizer(TransformerMixin):
    def __init__(self, minimum_category_occurence=2, oov_token=-1):
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        mask = (pd.Series(X).value_counts() <= self.minimum_category_occurence)
        noise_index = np.where(np.isin(X, mask.index[mask == True]))[0]
        X[noise_index] = self.oov_token
        return X

In [82]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index
        self.vocabulary = {X_counts.index[i]:i for i in range(self.vocab_size)}

        if -1 not in self.word_index:
            self.vocabulary[-1] = self.vocab_size
            self.vocab_size += 1
        
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        for i in range(len(X)):
            if X[i] in self.word_index:
                X_transformed.append(self.vocabulary[X[i]])
            else:
                X_transformed.append(self.vocabulary[self.oov_token])

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        return np.array([self.word_index[X[i]] for i in range(len(X))])

## Process Train/Validation Dataset

In [83]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(dataset, test_size=0.15, shuffle=False)
train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [84]:
train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [85]:
processed_train_val_set = train_pipeline.fit_transform(train_val_set.copy())
processed_train_val_set

array([2, 1, 0, ..., 0, 0, 0])

In [86]:
processed_train_set, processed_val_set = train_test_split(processed_train_val_set, test_size=0.2, shuffle=False)

## Save Processed Datasets/Statics

In [87]:
np.savetxt("data/{}_train_set_original.csv".format(model_name), train_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_test_set_original.csv".format(model_name), test_set, fmt="%d", delimiter="\n")

np.savetxt("data/{}_train_set.csv".format(model_name), processed_train_set, fmt="%d", delimiter="\n")
np.savetxt("data/{}_val_set.csv".format(model_name), processed_val_set, fmt="%d", delimiter="\n")

In [88]:
validation_set = np.r_[train_set[-1], val_set]  # As one data point is lost during CalculateDelta process
np.savetxt("data/{}_val_set_original.csv".format(model_name), validation_set, fmt="%d", delimiter="\n")

In [89]:
test_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('sparse_category_encoder', train_pipeline["sparse_category_encoder"])
])

In [90]:
with open("static/pipeline.pkl", 'wb') as f:
    dill.dump(test_pipeline, f)

In [91]:
dict(list(test_pipeline["sparse_category_encoder"].vocabulary.items()))

{4096: 0, -6291456: 1, 2097152: 2, 0: 3, -8384512: 4, -1: 5}

In [92]:
test_pipeline["sparse_category_encoder"].vocab_size

6