In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

plt.rcParams["figure.figsize"] = (20, 5)

In [2]:
data = np.genfromtxt("../로그 데이터/SEG_SGEMM_result.txt", delimiter="\n", dtype=np.int64)
data

array([ 3196231680, 93292771632, 93293300344, ..., 92658792872,
       92658792864, 92654987192], dtype=int64)

In [3]:
delta = np.array([data[i+1] - data[i] for i in range(int(len(data))-1)])
delta, len(delta)

(array([90096539952,      528712,       73032, ...,    36097352,
                 -8,    -3805672], dtype=int64),
 237194)

In [4]:
data_counts = pd.Series(data=delta.T).value_counts()
data_counts

 0             27460
 4096           7851
 909517620      1849
-909517620      1848
 8192           1481
               ...  
-3833072           1
 349797616         1
-5403768464        1
-568288            1
 7770728296        1
Length: 60247, dtype: int64

In [5]:
def category_by_threshold(data_counts, threshold, sign="over"):
    if sign == "under":
        category = (data_counts < threshold)
    else:
        category = (data_counts > threshold)
    return category.index[category == True]

In [6]:
def tokenizer(data, category, oov=-1):
    data = data.copy()
    mask = np.isin(data, category)
    data[mask == False] = oov
    return data

In [7]:
threshold = 50
category_over_threshold = category_by_threshold(data_counts, threshold)
category_over_threshold

Int64Index([         0,       4096,  909517620, -909517620,       8192,
                    -8,      -4096,          8,      12288,       2416,
                 16384,         24,       3520,      -2744,        -12,
                -12288,      20480,         32,         64,          6,
                  4104,        -16,      24576,         28,       5280,
                 -2884,      28672,         56,      -8192,        240,
                  3904,       4092,      32768,       4128, -515913384,
                 40960,  515913384,       4080,       -240,     172032,
                 -8304,      90112,       3072,         16,      53248,
                    -4,         44,          2,        432,      81920,
                    48,     118784,       3936,     126976,       4112,
                     4,       3856,     -16384,      11776,      61440,
                  -432,        256,        -24,        -48,     -32768,
                131072,      69632,     135168,      94208,     

In [12]:
category_under_threshold = category_by_threshold(data_counts, threshold, sign='under')
category_under_threshold

Int64Index([     200704,          40,         -56,   -22654976,       -8216,
                 180224,         320,      253952,        3664,         128,
            ...
            23798772186,    -5226320,    53839952,    87047968,    64855276,
               -3833072,   349797616, -5403768464,     -568288,  7770728296],
           dtype='int64', length=60158)

In [11]:
np.sort(data_counts.index.to_numpy())

array([-93818644456, -93721537088, -93721111249, ...,  93757187984,
        93794971648,  93848074068], dtype=int64)

In [33]:
closest_category = []
for category in category_under_threshold.to_numpy():
    closest_category.append(category_over_threshold[np.argmin(np.abs(category_over_threshold - category))])

In [87]:
category_over_threshold_index_range = np.where(delta == closest_category[0])
category_over_threshold_index_range[0]

array([ 37849,  38072,  38234,  44832,  45450,  55025,  55207,  55354,
        60981,  75330,  75517,  75664,  80809,  81335,  93987,  94175,
        94321,  99561, 100174, 110783, 110970, 111116, 116343, 116923,
       132392, 132609, 132769, 139183, 139755, 148301, 148481, 148626,
       149816, 150340, 168045, 168231, 168376, 174194, 185031, 185218,
       185365, 206024, 206211, 206356, 211615, 212219, 224501, 224686,
       224832, 230070, 230658], dtype=int64)

In [90]:
category_over_threshold_range = []
range_value = 8
for idx in category_over_threshold_index_range[0]:
    category_over_threshold_range.append(np.r_[delta[idx-range_value:idx], delta[idx:idx+range_value]])
category_over_threshold_range = np.array(category_over_threshold_range)

(51, 16)

In [54]:
category_under_threshold_index_range = np.where(delta == category_under_threshold[0])
category_under_threshold_index_range[0]

array([ 37937,  39562,  39640,  39724,  45503,  55095,  56496,  56542,
        56621,  61039,  75405,  76797,  76857,  76937,  81397,  94063,
        95454,  95513,  95593, 110859, 112250, 112309, 112389, 116987,
       132478, 134086, 134163, 134247, 139814, 148370, 150397, 168120,
       169509, 169569, 169649, 185106, 186498, 186558, 186638, 206099,
       207541, 207600, 207680, 212284, 224575, 225965, 226025, 226105,
       230719], dtype=int64)

In [91]:
category_under_threshold_range = []
range_value = 8
for idx in category_under_threshold_index_range[0]:
    category_under_threshold_range.append(np.r_[delta[idx-range_value:idx], delta[idx:idx+range_value]])
category_under_threshold_range = np.array(category_under_threshold_range)

In [93]:
category_over_threshold_range.shape, category_under_threshold_range.shape

((51, 16), (49, 16))

In [96]:
from sklearn.metrics import mean_squared_error

result = 0
for co in category_over_threshold_range:
    for cu in category_under_threshold_range:
        result =+ mean_squared_error(co, cu)

In [98]:
result // (category_over_threshold_range.shape[0] * category_under_threshold_range.shape[0])

308675193173.0