In [1]:
import json
from pathlib import Path

import dill         # 0.3.2
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [2]:
# Identifier of the workload / VM being analysed. It is used as the file-name
# prefix of the CSV files written by the "Save Analysis" cell at the end.
model_name = "GEMM_STREAM_VMID=17926"

## Load Dataset/Static Param List

In [3]:
# Read the nine log files (suffixes 1..9) and stack them row-wise.
# dtype=object keeps every column as raw strings, so pandas does not infer a
# numeric dtype for the large gpa / rip values; they are converted explicitly
# further down. The builtin `object` replaces the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
original_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/GEMM_STREAM/GEMM_STREAM/gemm_stream_generic_generic2_{}.csv".format(i), dtype=object)
        for i in range(1, 10)
    ],
    axis=0,
)
# Keep only the rows of the VM under analysis (vmid is a string because of dtype=object).
original_dataset = original_dataset[original_dataset["vmid"] == '17926']
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
2149,[861368.920026],05:46:36:062305,GPA,17569096718,18446744072500749326,17926
2150,[861368.920040],05:46:36:062325,GPA,22541336944,18446744072500749326,17926
2151,[861368.920062],05:46:36:062348,GPA,17569096718,18446744072500749326,17926
2152,[861368.920068],05:46:36:062354,GPA,22541336948,18446744072500749326,17926
2153,[861368.920086],05:46:36:062372,GPA,17569096718,18446744072500749326,17926
...,...,...,...,...,...,...
407242,[957446.819051],08:27:53:959834,PF,20453990400,18446744072506569815,17926
407243,[957446.865342],08:27:54:006124,PF,20453994496,18446744072506569815,17926
407244,[957446.865354],08:27:54:006138,PF,20453998592,18446744072506569815,17926
407245,[957446.865359],08:27:54:006143,PF,20454002688,18446744072506569815,17926


In [4]:
# Keep only the two columns analysed below (rip is used for PCs) and drop
# every row in which either of them is missing.
original_dataset = original_dataset.loc[:, ["gpa", "rip"]].dropna()
original_dataset

Unnamed: 0,gpa,rip
2149,17569096718,18446744072500749326
2150,22541336944,18446744072500749326
2151,17569096718,18446744072500749326
2152,22541336948,18446744072500749326
2153,17569096718,18446744072500749326
...,...,...
407242,20453990400,18446744072506569815
407243,20453994496,18446744072506569815
407244,20453998592,18446744072506569815
407245,20454002688,18446744072506569815


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that replaces a sequence by its adjacent differences.

    ``transform`` returns ``X[i] - X[i + 1]`` for every adjacent pair, so the
    result is one element shorter than the input. ``X`` is expected to be a
    NumPy array (that is what this notebook passes in); the code relies on
    slicing, element-wise subtraction, ``.dtype`` and ``.astype``.
    """

    def __init__(self):
        # Nothing to configure: the transformer has no parameters or state.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present only for the fit/transform protocol."""
        return self

    def transform(self, X, y=None):
        """Return the differences between each element and its successor.

        Parameters
        ----------
        X : array
            Input sequence.
        y : ignored

        Returns
        -------
        array
            ``X[:-1] - X[1:]``. When the result has an unsigned integer dtype it
            is converted to byte strings (``np.bytes_``).
        """
        X_transformed = X[:-1] - X[1:]
        # Unsigned subtraction wraps around instead of going negative, so the
        # numbers are not meaningful as magnitudes; they are turned into byte
        # strings (presumably to be used as opaque category labels).
        # np.bytes_ is used instead of np.string_, which NumPy 2.0 removed.
        if X_transformed.dtype.kind == "u":
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Return ``X`` unchanged; the delta is not actually inverted."""
        return X

## Process Train/Validation Dataset

In [6]:
# Hold out 15% of the rows as the test set. shuffle=False keeps the original
# row order, so the split is not randomised.
# NOTE(review): the originally stated target was Train / Val / Test = 70% / 15% / 15%,
# but the train/val split below is commented out, so this cell only produces an
# 85% (train+val) / 15% (test) split.
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [7]:
# Convert the string columns to integer arrays. gpa is cast to int64; rip is
# cast to uint64 because its values (e.g. 18446744072500749326) are larger
# than the int64 maximum.
train_val_gpa = train_val_set["gpa"].to_numpy().astype(np.int64)
train_val_rip = train_val_set["rip"].to_numpy().astype(np.uint64)

test_gpa = test_set["gpa"].to_numpy().astype(np.int64)
test_rip = test_set["rip"].to_numpy().astype(np.uint64)

In [8]:
# One shared transformer instance; CalculateDelta keeps no state, so it is
# reused for both the gpa and the rip arrays below.
calculate_delta = CalculateDelta()

In [9]:
# Adjacent differences of the gpa values. fit_transform is inherited from
# TransformerMixin; a copy of the array is passed in.
processed_train_val_gpa = calculate_delta.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([-4972240226,  4972240226, -4972240230, ...,           0,
                 0,           0], dtype=int64)

In [10]:
# Adjacent differences of the rip values. The input is uint64, so CalculateDelta
# returns byte strings; a "negative" difference shows up wrapped around,
# e.g. b'18446744073709551604'.
processed_train_val_rip = calculate_delta.transform(train_val_rip)
processed_train_val_rip

array([b'0', b'0', b'0', ..., b'0', b'0', b'0'], dtype='|S20')

In [11]:
# How often each distinct delta value occurs, for gpa and then for rip.
tuple(
    pd.Series(deltas).value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

(-4096          2166072
  0              770606
 -1638400        175768
 -1896448         75392
 -8192            38496
                 ...   
 -591212544           1
 -6531026944          1
  4322430976          1
  3642871336          1
 -2543742976          1
 Length: 762652, dtype: int64,
 b'0'                       4209818
 b'18446744073709551604'     247275
 b'18446744073709551605'     134216
 b'35'                       117557
 b'12'                        90448
                             ...   
 b'1227167289'                    1
 b'18446603835989313538'          1
 b'18446744073702595384'          1
 b'18446604049009101086'          1
 b'18446603644298361661'          1
 Length: 32600, dtype: int64)

In [12]:
# Index = number of occurrences | Value = number of distinct delta values
# (categories) that occur exactly that many times. Shown for gpa, then rip.
tuple(
    pd.Series(deltas).value_counts().value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

(1       601911
 2        69302
 3        27680
 4        15491
 5         9773
          ...  
 849          1
 2896         1
 695          1
 4435         1
 646          1
 Length: 863, dtype: int64,
 1       23330
 2        3748
 3        1428
 4         846
 5         548
         ...  
 313         1
 425         1
 457         1
 6698        1
 423         1
 Length: 478, dtype: int64)

## gpa : N / P ratio

In [13]:
# Build one table row per occurrence threshold (0..49) for the gpa deltas.
#
# The two count tables below depend only on the data, not on the threshold, so
# they are computed once instead of several times per loop iteration.
gpa_value_counts = pd.Series(processed_train_val_gpa).value_counts()    # delta value -> occurrences
gpa_occurrence_hist = gpa_value_counts.value_counts()                   # occurrences -> number of delta values
gpa_n_samples = processed_train_val_gpa.shape[0]
gpa_n_categories = len(gpa_value_counts)

gpa_data = []
previous_gpa_p = gpa_n_categories
previous_ratio = 0
previous_accuracy = 0
for gpa_threshold in range(50):
    # Categories that occur at most `gpa_threshold` times. A boolean mask is
    # used instead of .loc[list(range(...))] so that an occurrence count which
    # does not exist in the data contributes 0 rather than raising KeyError.
    rare = gpa_occurrence_hist[gpa_occurrence_hist.index <= gpa_threshold]
    rare_categories = int(rare.to_numpy().sum())
    rare_samples = int((rare.to_numpy() * rare.index.to_numpy()).sum())

    if gpa_threshold == 0:
        gpa_p = gpa_n_categories
    else:
        # NOTE(review): the "+ 1" presumably stands for a single catch-all
        # category that replaces all rare ones - confirm.
        gpa_p = gpa_n_categories - rare_categories + 1
    ratio = gpa_n_samples / gpa_p
    # Share of all samples that belong to the rare categories.
    # NOTE(review): confirm this matches the intended meaning of the
    # "upper limit for train/val accuracy" column.
    accuracy = rare_samples / gpa_n_samples

    # The primed columns are the change relative to the previous threshold.
    row = [
        gpa_threshold,
        gpa_p,
        previous_gpa_p - gpa_p,
        ratio,
        ratio - previous_ratio,
        accuracy,
        accuracy - previous_accuracy,
    ]
    gpa_data.append(row)

    previous_gpa_p = gpa_p
    previous_ratio = ratio
    previous_accuracy = accuracy

df_gpa_threshold = pd.DataFrame(data=gpa_data, columns=["threshold", "p", "p'", "n/p ratio", "ratio'", "upper limit for train/val accuracy", "accuracy'"])

In [14]:
# Display the per-threshold table for the gpa deltas.
df_gpa_threshold

Unnamed: 0,threshold,p,p',n/p ratio,ratio',upper limit for train/val accuracy,accuracy'
0,0,762652,0,8.122108,8.122108,0.0,0.0
1,1,160742,601910,38.535927,30.413819,0.097171,0.097171
2,2,91440,69302,67.742148,29.206221,0.119547,0.022376
3,3,63760,27680,97.15091,29.408762,0.132953,0.013406
4,4,48269,15491,128.329611,31.178701,0.142956,0.010003
5,5,38496,9773,160.908718,32.579107,0.150845,0.007889
6,6,31609,6887,195.967667,35.05895,0.157516,0.006671
7,7,26752,4857,231.546875,35.579208,0.163004,0.005489
8,8,23047,3705,268.769992,37.223117,0.167789,0.004785
9,9,20213,2834,306.453372,37.68338,0.171907,0.004118


## rip : N / P ratio

In [15]:
# Build one table row per occurrence threshold (0..49) for the rip deltas.
#
# The two count tables below depend only on the data, not on the threshold, so
# they are computed once instead of several times per loop iteration.
rip_value_counts = pd.Series(processed_train_val_rip).value_counts()    # delta value -> occurrences
rip_occurrence_hist = rip_value_counts.value_counts()                   # occurrences -> number of delta values
rip_n_samples = processed_train_val_rip.shape[0]
rip_n_categories = len(rip_value_counts)

rip_data = []
previous_rip_p = rip_n_categories
previous_ratio = 0
previous_accuracy = 0
for rip_threshold in range(50):
    # Categories that occur at most `rip_threshold` times. A boolean mask is
    # used instead of .loc[list(range(...))] so that an occurrence count which
    # does not exist in the data contributes 0 rather than raising KeyError.
    rare = rip_occurrence_hist[rip_occurrence_hist.index <= rip_threshold]
    rare_categories = int(rare.to_numpy().sum())
    rare_samples = int((rare.to_numpy() * rare.index.to_numpy()).sum())

    if rip_threshold == 0:
        rip_p = rip_n_categories
    else:
        # NOTE(review): the "+ 1" presumably stands for a single catch-all
        # category that replaces all rare ones - confirm.
        rip_p = rip_n_categories - rare_categories + 1
    ratio = rip_n_samples / rip_p
    # Share of all samples that belong to the rare categories.
    # NOTE(review): confirm this matches the intended meaning of the
    # "upper limit for train/val accuracy" column.
    accuracy = rare_samples / rip_n_samples

    # The primed columns are the change relative to the previous threshold.
    row = [
        rip_threshold,
        rip_p,
        previous_rip_p - rip_p,
        ratio,
        ratio - previous_ratio,
        accuracy,
        accuracy - previous_accuracy,
    ]
    rip_data.append(row)

    previous_rip_p = rip_p
    previous_ratio = ratio
    previous_accuracy = accuracy

df_rip_threshold = pd.DataFrame(data=rip_data, columns=["threshold", "p", "p'", "n/p ratio", "ratio'", "upper limit for train/val accuracy", "accuracy'"])

In [16]:
# Display the per-threshold table for the rip deltas.
df_rip_threshold

Unnamed: 0,threshold,p,p',n/p ratio,ratio',upper limit for train/val accuracy,accuracy'
0,0,32600,0,190.010491,190.010491,0.0,0.0
1,1,9271,23329,668.141732,478.131241,0.003766,0.003766
2,2,5523,3748,1121.553866,453.412133,0.004976,0.00121
3,3,4095,1428,1512.659829,391.105963,0.005668,0.000692
4,4,3249,846,1906.538012,393.878183,0.006214,0.000546
5,5,2701,548,2293.351351,386.81334,0.006657,0.000442
6,6,2317,384,2673.432024,380.080673,0.007029,0.000372
7,7,2065,252,2999.681356,326.249332,0.007313,0.000285
8,8,1840,225,3366.490217,366.808861,0.007604,0.000291
9,9,1684,156,3678.350356,311.860139,0.007831,0.000227


## Save Analysis 

In [17]:
# Write both threshold tables as CSV files named after `model_name`.
# to_csv does not create missing directories, so make sure the output
# directory exists first (otherwise a fresh checkout fails here).
analysis_dir = Path("analysis")
analysis_dir.mkdir(parents=True, exist_ok=True)

df_gpa_threshold.to_csv(analysis_dir / "{}_gpa_analysis.csv".format(model_name), index=False)
df_rip_threshold.to_csv(analysis_dir / "{}_rip_analysis.csv".format(model_name), index=False)