In [2]:
import json
import os

import dill
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [3]:
# Prefix inserted into the names of the dataset files this notebook writes under data/.
model_name = "SEG"

In [4]:
# Create the output directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError once data/ already exists.
os.makedirs("data/", exist_ok=True)

## Load Dataset/Static Param List

In [5]:
# Read the raw log as int64 values, using newline as the field delimiter.
# NOTE(review): the path is relative to the notebook's working directory — confirm it exists before running.
original_dataset = np.genfromtxt("../로그 데이터/SEG_SGEMM_result.txt", delimiter="\n", dtype=np.int64)
original_dataset

array([ 3196231680, 93292771632, 93293300344, ..., 92658792872,
       92658792864, 92654987192], dtype=int64)

## Dataset Processing Functions

In [6]:
class CalculateDelta(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a sequence into its consecutive differences.

    ``transform`` returns ``X[i + 1] - X[i]`` for every adjacent pair, so the
    output is one element shorter than the input along the first axis.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return the first-order differences of ``X`` along axis 0.

        Parameters
        ----------
        X : array-like
            Sequence of numeric values.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            ``len(X) - 1`` differences (empty when ``X`` has fewer than two items).
        """
        # np.diff is the vectorized equivalent of looping over adjacent pairs
        # in Python, and avoids building an intermediate list.
        return np.diff(np.asarray(X), axis=0)

In [7]:
class NoiseTokenizer(BaseEstimator, TransformerMixin):
    """Replace rarely occurring values with a single out-of-vocabulary token.

    Parameters
    ----------
    minimum_category_occurence : int, default=2
        Values that occur this many times *or fewer* in the array passed to
        ``transform`` are treated as noise (the comparison is ``<=``).
    oov_token : int, default=-1
        Value written in place of every noisy entry.

    Notes
    -----
    ``fit`` learns nothing: occurrence counts are computed from whatever array
    is handed to ``transform``.
    """

    def __init__(self, minimum_category_occurence=2, oov_token=-1):
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with rare values replaced by ``oov_token``.

        Parameters
        ----------
        X : array-like
            One-dimensional sequence of category values.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            New array of the same length as ``X``; the input is left unmodified.
        """
        # Work on a copy so the caller's array is never mutated (this also
        # makes plain lists acceptable input).
        values = np.array(X)
        counts = pd.Series(values).value_counts()
        is_rare = (counts <= self.minimum_category_occurence).to_numpy()
        rare_values = counts.index[is_rare]
        values[np.isin(values, rare_values)] = self.oov_token
        return values

In [8]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    """Encode category values as integer codes ordered by descending frequency.

    Parameters
    ----------
    oov_token : int, default=-1
        Category whose code is used for any value not seen during ``fit``.

    Attributes
    ----------
    word_index : pandas.Index
        Categories in code order, i.e. ``word_index[code]`` is the category.
    vocabulary : dict
        Mapping from category to its integer code.
    vocab_size : int
        Number of entries in ``vocabulary``.
    """

    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Build the vocabulary from the values in ``X``.

        Codes follow the order returned by ``pandas.Series.value_counts``
        (most frequent first). If ``oov_token`` does not occur in ``X`` it is
        appended as the last code, so ``transform`` can always fall back to it.
        """
        word_index = pd.Series(X).value_counts().index
        if self.oov_token not in word_index:
            # Without this entry, transform() would raise KeyError on the
            # first unseen value.
            word_index = word_index.append(pd.Index([self.oov_token]))

        self.word_index = word_index
        self.vocab_size = len(word_index)
        self.vocabulary = {category: code for code, category in enumerate(word_index)}

        return self

    def transform(self, X, y=None):
        """Map every value in ``X`` to its code; unseen values get the OOV code.

        Returns
        -------
        numpy.ndarray
            Integer codes, one per element of ``X``.
        """
        oov_code = self.vocabulary[self.oov_token]
        lookup = self.vocabulary.get
        # One dict lookup per element instead of a membership test plus a lookup.
        return np.array([lookup(value, oov_code) for value in X], dtype=int)

    def inverse_transform(self, X, y=None):
        """Map integer codes back to the original category values.

        Returns
        -------
        numpy.ndarray
            Category values, one per code in ``X``.
        """
        codes = np.asarray(X, dtype=np.intp)
        return self.word_index.to_numpy()[codes]

## Process Train/Validation Dataset

In [9]:
# Chronological (unshuffled) split.
# Effective Train / Val / Test ratio is about 68% / 17% / 15%: the last 15% is held out
# as test, then the last 20% of the remaining 85% (= 17% of the total) becomes validation.
# An exact 70% / 15% / 15% split would need test_size=0.15/0.85 in the second call.
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [10]:
# Training-time preprocessing: consecutive differences -> rare-value masking -> integer encoding.
SEG_train_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('noise_tokenizer', NoiseTokenizer()),
    ('sparse_category_encoder', SparseCategoryEncoder())
])

In [11]:
# Fit the pipeline on the combined train+validation span and encode it in one pass.
# .copy() hands the pipeline its own array, leaving train_val_set itself untouched.
processed_train_val_set = SEG_train_pipeline.fit_transform(train_val_set.copy())
processed_train_val_set

array([   0,    0,    0, ...,  841,  244, 1121])

In [12]:
# Split the encoded sequence chronologically with the same 20% validation fraction as the raw split.
# NOTE(review): this array is one element shorter than train_val_set (differencing drops one point),
# so the split boundary may not line up exactly with train_set / val_set — confirm.
processed_train_set, processed_val_set = train_test_split(processed_train_val_set, test_size=0.2, shuffle=False)

In [13]:
# NOTE(review): this cell cannot run as shown — generate_timeseries, static_params and tf
# are not defined or imported anywhere in the visible notebook (see the NameError below).
# NOTE(review): the raw train_set is windowed here, not processed_train_set — confirm which is intended.
x_train, y_train = generate_timeseries(train_set, 0, None, static_params["PAST_HISTORY"], static_params["FUTURE_TARGET"])
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.cache().batch(static_params["BATCH_SIZE"]).shuffle(static_params["BUFFER_SIZE"]).repeat()
x_train.shape, y_train.shape

NameError: name 'generate_timeseries' is not defined

In [14]:
# NOTE(review): this cell cannot run as shown — generate_timeseries, static_params and tf
# are not defined or imported anywhere in the visible notebook (see the NameError below).
# NOTE(review): the raw val_set is windowed here, not processed_val_set — confirm which is intended.
x_val, y_val = generate_timeseries(val_set, 0, None, static_params["PAST_HISTORY"], static_params["FUTURE_TARGET"])
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_data = val_data.cache().batch(static_params["BATCH_SIZE"]).shuffle(static_params["BUFFER_SIZE"]).repeat()
x_val.shape, y_val.shape

NameError: name 'generate_timeseries' is not defined

## Save Processed Datasets/Statics

In [15]:
# Persist the raw train/test splits first, then the encoded train/validation splits.
# Every file is written under data/ with the "%d" integer format.
exports = [
    ("data/{}_train_set_original.csv", train_set),
    ("data/{}_test_set_original.csv", test_set),
    ("data/{}_train_set.csv", processed_train_set),
    ("data/{}_val_set.csv", processed_val_set),
]
for path_template, values in exports:
    np.savetxt(path_template.format(model_name), values, fmt="%d", delimiter="\n")

In [16]:
# Prepend the last training point to the raw validation values: CalculateDelta drops one
# point, so differencing this sequence yields one delta per element of val_set.
validation_set = np.r_[train_set[-1], val_set]
np.savetxt("data/{}_val_set_original.csv".format(model_name), validation_set, fmt="%d", delimiter="\n")

In [17]:
# Inference-time pipeline: the same differencing step followed by the encoder instance that was
# already fitted inside SEG_train_pipeline. There is no noise-tokenizer step here; values missing
# from the fitted vocabulary are mapped to the OOV code by SparseCategoryEncoder.transform.
SEG_test_pipeline = Pipeline([
    ('calculate_delta', CalculateDelta()),
    ('sparse_category_encoder', SEG_train_pipeline["sparse_category_encoder"])
])

In [18]:
# Make sure the target directory exists (open() does not create it), then
# serialize the fitted test pipeline with dill.
os.makedirs("static", exist_ok=True)
with open("static/test_pipeline.pkl", 'wb') as f:
    dill.dump(SEG_test_pipeline, f)

In [19]:
# Peek at the first 20 vocabulary entries (category value -> integer code) of the fitted encoder.
dict(list(SEG_test_pipeline["sparse_category_encoder"].vocabulary.items())[:20])

{-1: 0,
 0: 1,
 4096: 2,
 909517620: 3,
 -909517620: 4,
 8192: 5,
 -8: 6,
 -4096: 7,
 8: 8,
 12288: 9,
 2416: 10,
 16384: 11,
 24: 12,
 3520: 13,
 -12: 14,
 -2744: 15,
 6: 16,
 64: 17,
 32: 18,
 20480: 19}