In [1]:
import json
from pathlib import Path

import dill         # 0.3.2
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [2]:
# Identifier that is embedded in the file names of the analysis CSVs written by the last cell.
model_name = "GEMM_STREAM_VMID=4034"

## Load Dataset/Static Param List

In [3]:
# Read the nine log shards (file suffixes 1..9) and stack them row-wise.
# dtype=object keeps every column as it appears in the CSV instead of letting pandas infer
# numeric types. The builtin `object` is used because the `np.object` alias was removed
# in NumPy 1.24 (it was only ever an alias of the builtin).
original_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/GEMM_STREAM/GEMM_STREAM/gemm_stream_generic_generic2_{}.csv".format(i), dtype=object)
        for i in range(1, 10)
    ],
    axis=0,
)
# vmid was read as a string (see dtype above), hence the comparison against '4034' and not 4034.
original_dataset = original_dataset[original_dataset["vmid"] == '4034']
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[861367.505406],05:46:34:647695,PF,988827648,18446744072452043863,4034
1,[861367.505455],05:46:34:647744,PF,705122304,18446744072452043863,4034
2,[861367.505459],05:46:34:647748,PF,705122304,18446744072452043863,4034
3,[861367.505464],05:46:34:647754,PF,1227321344,18446744072452043863,4034
4,[861367.505468],05:46:34:647757,PF,1227321344,18446744072452043863,4034
...,...,...,...,...,...,...
407190,[957446.648224],08:27:53:789007,PF,253842912,18446744072451347936,4034
407191,[957446.648252],08:27:53:789035,GPA,878240226,18446744072637686242,4034
407192,[957446.650168],08:27:53:790949,GPA,878245255,18446744072637683079,4034
407193,[957446.650178],08:27:53:790962,GPA,878245319,18446744072637683143,4034


In [4]:
# Keep only the two columns analysed below (rip serves as the PC) and
# discard every row in which either of them is missing.
original_dataset = original_dataset.loc[:, ["gpa", "rip"]].dropna(axis=0, how="any")
original_dataset

Unnamed: 0,gpa,rip
0,988827648,18446744072452043863
1,705122304,18446744072452043863
2,705122304,18446744072452043863
3,1227321344,18446744072452043863
4,1227321344,18446744072452043863
...,...,...
407190,253842912,18446744072451347936
407191,878240226,18446744072637686242
407192,878245255,18446744072637683079
407193,878245319,18446744072637683143


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that replaces a sequence by its consecutive differences.

    For an input ``X`` of length ``n`` the output has length ``n - 1`` and element ``i``
    equals ``X[i] - X[i + 1]``. Results with an unsigned integer dtype are converted to
    byte strings.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X[:-1] - X[1:]``.

        Parameters
        ----------
        X : sliceable array supporting element-wise subtraction (NumPy arrays in this notebook)
        y : ignored

        Returns
        -------
        The differences, one element shorter than ``X``. If the difference array has an
        unsigned integer dtype it is returned as a byte-string array instead.
        """
        X_transformed = X[:-1] - X[1:]
        # Unsigned differences cannot be negative (NumPy wraps them around), so they are
        # handed on as byte strings rather than numbers.
        # NOTE(review): presumably so they are treated as opaque category labels - confirm.
        # np.bytes_ replaces np.string_, which was removed in NumPy 2.0.
        if X_transformed.dtype.kind == "u":
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Identity: returns ``X`` unchanged (the delta is not actually inverted)."""
        return X

## Process Train/Validation Dataset

In [6]:
# Split off 15% of the rows as the test set. With shuffle=False the rows are not permuted,
# so the split preserves the original row order.
# NOTE(review): this comment used to advertise a 70% / 15% / 15% train/val/test ratio, but only
# this 85% / 15% split is executed; train and validation data stay combined in train_val_set.
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
# Disabled: would carve 20% of train_val_set out as a separate validation set.
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [7]:
# gpa is converted to signed 64-bit integers.
train_val_gpa, test_gpa = (
    split["gpa"].values.astype(np.int64) for split in (train_val_set, test_set)
)

# rip values in this log (e.g. 18446744072452043863) are larger than the int64 maximum,
# so they are converted to unsigned 64-bit integers.
train_val_rip, test_rip = (
    split["rip"].values.astype(np.uint64) for split in (train_val_set, test_set)
)

In [8]:
# One transformer instance, applied to both the gpa and the rip sequence in the next cells.
calculate_delta = CalculateDelta()

In [9]:
# Consecutive differences of the gpa sequence (element i minus element i+1); the result is
# one element shorter than the input.
# NOTE(review): the .copy() looks unnecessary because CalculateDelta.transform builds a new
# array from slices and does not write to its input - confirm before removing.
processed_train_val_gpa = calculate_delta.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([ 283705344,          0, -522199040, ...,          0,  172953600,
                0], dtype=int64)

In [10]:
# Consecutive differences of the rip sequence. The input is uint64, so the subtraction wraps
# around instead of going negative (e.g. the value 18446744073709551579 seen below is
# 2**64 - 37), and CalculateDelta returns the unsigned result as byte strings.
processed_train_val_rip = calculate_delta.transform(train_val_rip)
processed_train_val_rip

array([b'0', b'0', b'0', ..., b'0', b'0', b'0'], dtype='|S20')

In [11]:
# How often each distinct delta occurs: gpa first, rip second.
tuple(
    pd.Series(deltas).value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

( 0             650130
 -4096          154000
 -643931498      10662
  643931494      10661
 -643931494      10661
                 ...  
  1683050496         1
 -1804783616         1
  1266888704         1
  162660352          1
  675282944          1
 Length: 360547, dtype: int64,
 b'0'                       1303291
 b'94268222548486'            11004
 b'18446649805487003130'      10625
 b'333'                       10558
 b'18446744073709551579'       8329
                             ...   
 b'479374'                        1
 b'18446698303869782508'          1
 b'18446744073706208576'          1
 b'18446604282210924666'          1
 b'720'                           1
 Length: 26252, dtype: int64)

In [12]:
# Histogram of the occurrence counts, gpa first and rip second.
# Index = number of occurrences | Value = number of distinct deltas occurring that often
tuple(
    pd.Series(deltas).value_counts().value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

(1       246302
 2        57660
 3        22645
 4        12579
 5         7615
          ...  
 622          1
 110          1
 106          1
 104          1
 2046         1
 Length: 153, dtype: int64,
 1        19901
 2         2545
 3          982
 4          580
 5          332
          ...  
 87           1
 71           1
 283          1
 10558        1
 1263         1
 Length: 237, dtype: int64)

## gpa : N / P ratio

In [13]:
# Threshold analysis for the gpa deltas.
# For every threshold t in 0..49 a row is produced with:
#   threshold  : t
#   p          : number of categories left when every delta occurring <= t times is merged
#                into one extra bucket (hence the "+ 1" for t > 0)
#   p'         : decrease of p compared with the previous threshold
#   n/p ratio  : number of samples divided by p
#   upper limit for train/val accuracy : share of samples whose delta occurs <= t times
#   accuracy'  : increase of that share compared with the previous threshold

# Loop-invariant statistics, computed once instead of on every iteration.
gpa_counts = pd.Series(processed_train_val_gpa).value_counts()      # occurrences per distinct delta
gpa_count_hist = gpa_counts.value_counts()                          # index: occurrences, value: #deltas
gpa_n_samples = processed_train_val_gpa.shape[0]
gpa_n_categories = len(gpa_counts)

gpa_data = []
previous_gpa_p = gpa_n_categories
previous_accuracy = 0
for gpa_threshold in range(50):
    # reindex(..., fill_value=0) instead of .loc[list]: an occurrence count that never appears
    # contributes zero instead of raising KeyError.
    rare = gpa_count_hist.reindex(np.arange(1, gpa_threshold + 1), fill_value=0)

    if gpa_threshold == 0:
        gpa_p = gpa_n_categories
    else:
        gpa_p = gpa_n_categories - int(rare.sum()) + 1

    # occurrences * number of deltas with that many occurrences = samples covered by them
    accuracy = int((rare.to_numpy() * rare.index.to_numpy()).sum()) / gpa_n_samples

    gpa_data.append([
        gpa_threshold,
        gpa_p,
        previous_gpa_p - gpa_p,
        gpa_n_samples / gpa_p,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_gpa_p = gpa_p
    previous_accuracy = accuracy

df_gpa_threshold = pd.DataFrame(data=gpa_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [14]:
# Show the gpa threshold table built in the previous cell.
df_gpa_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,360547,0,4.347611,0.0,0.0
1,1,114246,246301,13.72055,0.157129,0.157129
2,2,56586,57660,27.701516,0.230697,0.073569
3,3,33941,22645,46.183613,0.274036,0.043339
4,4,21362,12579,73.378803,0.306136,0.032099
5,5,13747,7615,114.026188,0.330426,0.02429
6,6,8976,4771,174.634358,0.348688,0.018262
7,7,6031,2945,259.910131,0.361839,0.013151
8,8,4135,1896,379.085369,0.371515,0.009676
9,9,3002,1133,522.157895,0.378021,0.006505


## rip : N / P ratio

In [15]:
# Threshold analysis for the rip deltas.
# For every threshold t in 0..49 a row is produced with:
#   threshold  : t
#   p          : number of categories left when every delta occurring <= t times is merged
#                into one extra bucket (hence the "+ 1" for t > 0)
#   p'         : decrease of p compared with the previous threshold
#   n/p ratio  : number of samples divided by p
#   upper limit for train/val accuracy : share of samples whose delta occurs <= t times
#   accuracy'  : increase of that share compared with the previous threshold

# Loop-invariant statistics, computed once instead of on every iteration.
rip_counts = pd.Series(processed_train_val_rip).value_counts()      # occurrences per distinct delta
rip_count_hist = rip_counts.value_counts()                          # index: occurrences, value: #deltas
rip_n_samples = processed_train_val_rip.shape[0]
rip_n_categories = len(rip_counts)

rip_data = []
previous_rip_p = rip_n_categories
previous_accuracy = 0
for rip_threshold in range(50):
    # reindex(..., fill_value=0) instead of .loc[list]: an occurrence count that never appears
    # contributes zero instead of raising KeyError.
    rare = rip_count_hist.reindex(np.arange(1, rip_threshold + 1), fill_value=0)

    if rip_threshold == 0:
        rip_p = rip_n_categories
    else:
        rip_p = rip_n_categories - int(rare.sum()) + 1

    # occurrences * number of deltas with that many occurrences = samples covered by them
    accuracy = int((rare.to_numpy() * rare.index.to_numpy()).sum()) / rip_n_samples

    rip_data.append([
        rip_threshold,
        rip_p,
        previous_rip_p - rip_p,
        rip_n_samples / rip_p,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_rip_p = rip_p
    previous_accuracy = accuracy

df_rip_threshold = pd.DataFrame(data=rip_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [16]:
# Show the rip threshold table built in the previous cell.
df_rip_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,26252,0,59.710422,0.0,0.0
1,1,6352,19900,246.775504,0.012696,0.012696
2,2,3807,2545,411.746257,0.015943,0.003247
3,3,2825,982,554.873628,0.017822,0.001879
4,4,2245,580,698.226281,0.019302,0.00148
5,5,1913,332,819.403032,0.020361,0.001059
6,6,1648,265,951.163835,0.021376,0.001014
7,7,1469,179,1067.06467,0.022175,0.000799
8,8,1314,155,1192.936073,0.022966,0.000791
9,9,1186,128,1321.684654,0.023701,0.000735


## Save Analysis 

In [17]:
# DataFrame.to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails here after all the computation is done.
Path("analysis").mkdir(parents=True, exist_ok=True)

# One CSV per analysed column; the row index is omitted because "threshold" already identifies each row.
df_gpa_threshold.to_csv("analysis/{}_gpa_analysis.csv".format(model_name), index=False)
df_rip_threshold.to_csv("analysis/{}_rip_analysis.csv".format(model_name), index=False)