In [5]:
import numpy as np
import pandas as pd
from tensorflow import keras
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime
from sklearn.base import BaseEstimator, TransformerMixin

# Default to wide, short figures for every plot in this notebook.
plt.rcParams.update({"figure.figsize": (20, 5)})

In [6]:
# Read the raw log as a flat int64 array, splitting the file on newlines.
original = np.genfromtxt(
    fname="../로그 데이터/SEG_SGEMM_result.txt",
    dtype=np.int64,
    delimiter="\n",
)
original

array([ 3196231680, 93292771632, 93293300344, ..., 92658792872,
       92658792864, 92654987192], dtype=int64)

In [7]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the original ordering: the leading 75% of `original`
# becomes `data` and the trailing 25% becomes `test_set`.
data, test_set = train_test_split(
    original,
    shuffle=False,
    test_size=0.25,
)

In [39]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that replaces a sequence by its first differences.

    For an input of length ``n`` the output has length ``n - 1`` and element
    ``i`` equals ``X[i + 1] - X[i]``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the differences between consecutive elements of ``X``.

        Parameters
        ----------
        X : array-like
            Sequence of values; lists are accepted and converted to an array.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array with one element fewer than ``X`` along the first axis
            (empty when ``X`` has fewer than two elements).
        """
        # Vectorised replacement for the per-element Python loop; axis=0 keeps
        # the "next row minus current row" meaning if X has more than one
        # dimension.
        return np.diff(np.asarray(X), axis=0)

In [41]:
class NoiseTokenizer(TransformerMixin):
    """Replace rarely occurring values with a single out-of-vocabulary token.

    Parameters
    ----------
    minimum_category_occurence : int, default=2
        Values whose occurrence count within the transformed array is less
        than or equal to this threshold are treated as noise.
    oov_token : scalar, default=-1
        Value written in place of every noisy element.
    """

    def __init__(self, minimum_category_occurence=2, oov_token=-1):
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Nothing is learned here; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with rare values replaced by ``oov_token``.

        Occurrence counts are computed from the ``X`` passed to this call.
        The input itself is left untouched, so calling ``transform`` twice on
        the same array gives the same answer both times.

        Parameters
        ----------
        X : array-like
            Values to tokenize; lists are accepted and converted to an array.
        y : ignored

        Returns
        -------
        numpy.ndarray
            New array of the same shape as ``X``.
        """
        # np.array copies by default, so the caller's data is never overwritten.
        values = np.array(X)

        counts = pd.Series(values).value_counts()
        rare_categories = counts[counts <= self.minimum_category_occurence].index

        # A boolean mask is enough; no need to materialise positions first.
        values[np.isin(values, rare_categories)] = self.oov_token
        return values

In [42]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    """Encode categories as integer codes taken from a ``value_counts`` ranking.

    ``fit`` ranks the distinct values of its input with
    ``pandas.Series.value_counts`` (most frequent first by default) and gives
    each category the integer position it holds in that ranking.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the category-to-code mapping from ``X`` and return ``self``.

        Sets three attributes:
        ``vocab_size`` - number of distinct categories,
        ``word_index`` - the categories in ranking order,
        ``vocabulary`` - dict mapping each category to its position in
        ``word_index``.
        """
        frequency_table = pd.Series(X).value_counts()
        categories = frequency_table.index

        self.vocab_size = len(frequency_table)
        self.word_index = categories
        self.vocabulary = {
            categories[code]: code for code in range(self.vocab_size)
        }

        return self

    def transform(self, X, y=None):
        """Map each element of ``X`` to its integer code.

        Raises ``KeyError`` when an element was not present during ``fit``.
        """
        codes = []
        for position in range(len(X)):
            codes.append(self.vocabulary[X[position]])
        return np.array(codes)

    def inverse_transform(self, X, y=None):
        """Map each integer code in ``X`` back to the category it stands for."""
        categories = []
        for position in range(len(X)):
            categories.append(self.word_index[X[position]])
        return np.array(categories)

In [43]:
from sklearn.pipeline import Pipeline

# Raw values -> consecutive deltas -> rare deltas collapsed to the OOV token
# -> integer category codes.
SEG_pipeline = Pipeline(
    steps=[
        ("calculate_delta", CalculateDelta()),
        ("noise_tokenizer", NoiseTokenizer()),
        ("sparse_category_encoder", SparseCategoryEncoder()),
    ]
)

# Fit every step on the training split and encode it in one pass.
train_set = SEG_pipeline.fit_transform(data)
train_set

array([   0,    0,    0, ..., 1918, 1235, 1227])

In [51]:
# Spot-check a single encoded value (position 28 of the encoded training data).
train_set[28]

48

In [57]:
# Preview the first 20 vocabulary entries (category -> integer code) in the
# order the encoder stored them.
dict(
    list(
        SEG_pipeline.named_steps["sparse_category_encoder"].vocabulary.items()
    )[:20]
)

{-1: 0,
 0: 1,
 4096: 2,
 909517620: 3,
 -909517620: 4,
 8192: 5,
 -8: 6,
 -4096: 7,
 8: 8,
 12288: 9,
 2416: 10,
 16384: 11,
 24: 12,
 3520: 13,
 -12: 14,
 64: 15,
 6: 16,
 -2744: 17,
 32: 18,
 20480: 19}