In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

plt.rcParams["figure.figsize"] = (20, 5)

In [2]:
data = np.genfromtxt("../로그 데이터/SEG_SGEMM_result.txt", delimiter="\n", dtype=np.int64)
data

array([ 3196231680, 93292771632, 93293300344, ..., 92658792872,
       92658792864, 92654987192], dtype=int64)

In [3]:
delta = np.array([data[i+1] - data[i] for i in range(int(len(data))-1)])
delta, len(delta)

(array([90096539952,      528712,       73032, ...,    36097352,
                 -8,    -3805672], dtype=int64),
 237194)

In [4]:
data_counts = pd.Series(data=delta.T).value_counts()
data_counts

 0             27460
 4096           7851
 909517620      1849
-909517620      1848
 8192           1481
               ...  
-3833072           1
 349797616         1
-5403768464        1
-568288            1
 7770728296        1
Length: 60247, dtype: int64

In [5]:
def category_by_threshold(data_counts, threshold, sign="over"):
    if sign == "under":
        category = (data_counts < threshold)
    else:
        category = (data_counts > threshold)
    return category.index[category == True]

In [6]:
def tokenizer(data, category, oov=-1):
    data = data.copy()
    mask = np.isin(data, category)
    data[mask == False] = oov
    return data

In [7]:
threshold = 50
category_over_threshold = category_by_threshold(data_counts, threshold)
category_over_threshold

Int64Index([         0,       4096,  909517620, -909517620,       8192,
                    -8,      -4096,          8,      12288,       2416,
                 16384,         24,       3520,      -2744,        -12,
                -12288,      20480,         32,         64,          6,
                  4104,        -16,      24576,         28,       5280,
                 -2884,      28672,         56,      -8192,        240,
                  3904,       4092,      32768,       4128, -515913384,
                 40960,  515913384,       4080,       -240,     172032,
                 -8304,      90112,       3072,         16,      53248,
                    -4,         44,          2,        432,      81920,
                    48,     118784,       3936,     126976,       4112,
                     4,       3856,     -16384,      11776,      61440,
                  -432,        256,        -24,        -48,     -32768,
                131072,      69632,     135168,      94208,     

In [8]:
category_under_threshold = category_by_threshold(data_counts, threshold, sign='under')
category_under_threshold

Int64Index([     200704,          40,         -56,   -22654976,       -8216,
                 180224,         320,      253952,        3664,         128,
            ...
            23798772186,    -5226320,    53839952,    87047968,    64855276,
               -3833072,   349797616, -5403768464,     -568288,  7770728296],
           dtype='int64', length=60158)

In [9]:
np.sort(data_counts.index.to_numpy())

array([-93818644456, -93721537088, -93721111249, ...,  93757187984,
        93794971648,  93848074068], dtype=int64)

In [10]:
closest_category = []
limit = 500     # Too many category under threshold
for category in category_under_threshold[:limit].to_numpy():
    closest_category.append(category_over_threshold[np.argmin(np.abs(category_over_threshold - category))])

In [11]:
category_over_threshold_index_range = [np.where(delta == clst_ctg) for clst_ctg in closest_category]
category_over_threshold_index_range[0]

(array([ 37849,  38072,  38234,  44832,  45450,  55025,  55207,  55354,
         60981,  75330,  75517,  75664,  80809,  81335,  93987,  94175,
         94321,  99561, 100174, 110783, 110970, 111116, 116343, 116923,
        132392, 132609, 132769, 139183, 139755, 148301, 148481, 148626,
        149816, 150340, 168045, 168231, 168376, 174194, 185031, 185218,
        185365, 206024, 206211, 206356, 211615, 212219, 224501, 224686,
        224832, 230070, 230658], dtype=int64),)

In [12]:
category_over_threshold_range = []
range_value = 8
for category_over_threshold_index in category_over_threshold_index_range:
    tmp = []
    for idx in category_over_threshold_index[0]:
        tmp.append(np.r_[delta[idx-range_value:idx], delta[idx:idx+range_value]])
    category_over_threshold_range.append(np.array(tmp))

In [13]:
category_under_threshold_index_range = [np.where(delta == ctg_udr) for ctg_udr in category_under_threshold[:limit]]
category_under_threshold_index_range[0]

(array([ 37937,  39562,  39640,  39724,  45503,  55095,  56496,  56542,
         56621,  61039,  75405,  76797,  76857,  76937,  81397,  94063,
         95454,  95513,  95593, 110859, 112250, 112309, 112389, 116987,
        132478, 134086, 134163, 134247, 139814, 148370, 150397, 168120,
        169509, 169569, 169649, 185106, 186498, 186558, 186638, 206099,
        207541, 207600, 207680, 212284, 224575, 225965, 226025, 226105,
        230719], dtype=int64),)

In [14]:
category_under_threshold_range = []
for category_under_threshold_index in category_under_threshold_index_range:
    tmp = []
    for idx in category_under_threshold_index[0]:
        tmp.append(np.r_[delta[idx-range_value:idx], delta[idx:idx+range_value]])
    category_under_threshold_range.append(np.array(tmp))

In [15]:
category_over_threshold_range[0].shape, category_under_threshold_range[0].shape

((51, 16), (49, 16))

In [16]:
len(category_over_threshold_range), len(category_under_threshold_range)

(500, 500)

In [17]:
from sklearn.metrics import mean_squared_error

result = []
for i in range(len(category_over_threshold_range)):
    tmp = 0
    for co in category_over_threshold_range[i]:
        for cu in category_under_threshold_range[i]:
            tmp =+ mean_squared_error(co, cu)
    result.append(tmp/(category_over_threshold_range[i].shape[0] * category_under_threshold_range[i].shape[0]))

In [21]:
result[:10]

[308675193173.7431,
 612357242143363.1,
 -11036130921574.953,
 23095138175830.156,
 -155679915491370.25,
 324963002223.6062,
 -89016380415882.12,
 458418993673.54034,
 744343300062.7511,
 455279863322720.0]

In [22]:
np.savetxt("analysis_result.csv", np.array(result))

In [24]:
np.max(closest_category), np.min(closest_category)

(515913384, -909517620)

In [25]:
np.max(category_under_threshold[:limit]), np.min(category_under_threshold[:limit])

(469762048, -22851567616)

In [28]:
[(r - np.min(category_under_threshold[:limit])) / (np.max(closest_category) - np.min(category_under_threshold[:limit])) for r in result][:10]

[14.187526708152372,
 26206.508682342745,
 -471.30794089268556,
 989.3231428516474,
 -6661.268449250231,
 14.884555585585208,
 -3808.4348436301766,
 20.595739921198195,
 32.83173174202008,
 19484.45854691552]