In [1]:
import json
import os

import dill         # 0.3.2
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [2]:
# Label for this dataset combination; it is used as the prefix of the analysis CSV
# file names written in the last cell of the notebook.
model_name = "STREAM+GEMM"

## Load Dataset/Static Param List

In [3]:
# Load the ten STREAM trace CSVs (file suffixes 10 down to 1, in that order) and stack
# them row-wise. Every column is read with object dtype, so the values stay as text
# instead of being parsed to numbers at load time.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
STREAM_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/STREAM/STREAM/stream_4034_generic2_{}.csv".format(i), dtype=object)
        for i in reversed(range(1, 11))
    ],
    axis=0,
)
STREAM_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[336633.698810],04:01:00:854206,PF,251797631,18446744072449302655,4034
1,[336633.739463],04:01:00:894857,PF,1007005696,18446744072452043863,4034
2,[336633.867033],04:01:01:022428,PF,906555392,18446744072452043863,4034
3,[336633.901503],04:01:01:056898,PF,1914261504,18446744072452043863,4034
4,[336633.904886],04:01:01:060282,PF,259330048,18446744072452043863,4034
...,...,...,...,...,...,...
180309,[ 1806.225725],07:00:33:381360,PF,602140104,139790387006578,4034
180310,[ 1806.225827],07:00:33:381465,PF,597469140,18446744072443263295,4034
180311,[ 1806.240176],07:00:33:395810,PF,596315344,18446744072442219334,4034
180312,[ 1806.240198],07:00:33:395836,PF,597544452,18446744072441579461,4034


In [4]:
# Only the gpa and rip columns are needed (rip serves as the program counter, PC);
# rows with a missing value in either column are discarded.
STREAM_dataset = STREAM_dataset.loc[:, ["gpa", "rip"]].dropna(axis=0, how="any")
STREAM_dataset

Unnamed: 0,gpa,rip
0,251797631,18446744072449302655
1,1007005696,18446744072452043863
2,906555392,18446744072452043863
3,1914261504,18446744072452043863
4,259330048,18446744072452043863
...,...,...
180309,602140104,139790387006578
180310,597469140,18446744072443263295
180311,596315344,18446744072442219334
180312,597544452,18446744072441579461


In [5]:
# Load the ten GEMM trace CSVs (file suffixes 10 down to 1, in that order) and stack
# them row-wise. Every column is read with object dtype, so the values stay as text
# instead of being parsed to numbers at load time.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
GEMM_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/GEMM/gem_3214_generic_{}.csv".format(i), dtype=object)
        for i in reversed(range(1, 11))
    ],
    axis=0,
)
GEMM_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[601757.292075],06:00:51:525554,PF,14190604288,15410752,3214
1,[601757.297013],06:00:51:530492,PF,14352732392,140316942991300,3214
2,[601757.337978],06:00:51:571456,PF,14121463808,15410776,3214
3,[601757.368684],06:00:51:602162,PF,14120734720,15410764,3214
4,[601757.376568],06:00:51:610048,PF,14120828928,15410764,3214
...,...,...,...,...,...,...
400591,[ 1946.373117],07:24:00:468967,GPA,8620549134,18446744071888380942,3214
400592,[ 1946.373123],07:24:00:468972,GPA,22541025656,18446744071888380942,3214
400593,[ 1946.373157],07:24:00:469006,GPA,8620549134,18446744071888380942,3214
400594,[ 1946.373422],07:24:00:469266,GPA,8620548801,18446744071888380609,3214


In [6]:
# Only the gpa and rip columns are needed (rip serves as the program counter, PC);
# rows with a missing value in either column are discarded.
GEMM_dataset = GEMM_dataset.loc[:, ["gpa", "rip"]].dropna(axis=0, how="any")
GEMM_dataset

Unnamed: 0,gpa,rip
0,14190604288,15410752
1,14352732392,140316942991300
2,14121463808,15410776
3,14120734720,15410764
4,14120828928,15410764
...,...,...
400591,8620549134,18446744071888380942
400592,22541025656,18446744071888380942
400593,8620549134,18446744071888380942
400594,8620548801,18446744071888380609


In [7]:
# Append the GEMM rows after the STREAM rows and renumber the index from 0,
# replacing the per-file row numbers carried over from the individual CSVs.
original_dataset = pd.concat(
    (STREAM_dataset, GEMM_dataset),
    axis=0,
    ignore_index=True,
)
original_dataset

Unnamed: 0,gpa,rip
0,251797631,18446744072449302655
1,1007005696,18446744072452043863
2,906555392,18446744072452043863
3,1914261504,18446744072452043863
4,259330048,18446744072452043863
...,...,...
6489852,8620549134,18446744071888380942
6489853,22541025656,18446744071888380942
6489854,8620549134,18446744071888380942
6489855,8620548801,18446744071888380609


## Dataset Processing Functions

In [8]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that replaces a sequence by its consecutive differences.

    ``transform`` returns ``X[i] - X[i + 1]`` for every pair of neighbouring
    elements along the first axis, so the result is one element shorter than
    the input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present only for the transformer API."""
        return self

    def transform(self, X, y=None):
        """Return the differences between each element and its successor.

        Parameters
        ----------
        X : array-like
            Values to difference. The input is converted with ``np.asarray`` so
            the subtraction is always positional; a pandas Series would
            otherwise be aligned on its index labels.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``X[:-1] - X[1:]``. When the result has an unsigned integer dtype
            it is converted to a byte-string array. Unsigned subtraction wraps
            around, so a negative difference shows up as a large positive
            number in that string.
        """
        values = np.asarray(X)
        X_transformed = values[:-1] - values[1:]
        # In case of unsigned types, change its type to string type.
        # np.bytes_ is the same type the removed alias np.string_ pointed to.
        if np.issubdtype(X_transformed.dtype, np.unsignedinteger):
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Return ``X`` unchanged; the delta is not actually inverted."""
        return X

## Process Train/Validation Dataset

In [9]:
# Hold out the last 15% of the rows as the test set. shuffle=False keeps the original
# row order, so the split is positional rather than random.
# No separate validation split is made here: the remaining 85% stays together
# as train_val_set.
train_val_set, test_set = train_test_split(
    original_dataset,
    test_size=0.15,
    shuffle=False,
)
# Disabled: would carve a validation set out of train_val_set
# (20% of it, i.e. 17% of all rows).
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [10]:
# The columns were loaded as text; convert them to fixed-width integer arrays.
# gpa is parsed as signed 64-bit. rip is parsed as unsigned 64-bit because the traces
# contain values above the int64 maximum (e.g. 18446744072449302655).
train_val_gpa = train_val_set["gpa"].to_numpy().astype(np.int64)
train_val_rip = train_val_set["rip"].to_numpy().astype(np.uint64)

test_gpa = test_set["gpa"].to_numpy().astype(np.int64)
test_rip = test_set["rip"].to_numpy().astype(np.uint64)

In [11]:
# One shared instance is enough: CalculateDelta.fit() stores nothing, so the same
# object is reused for both the gpa and the rip arrays below.
calculate_delta = CalculateDelta()

In [12]:
# Consecutive gpa differences (element i minus element i + 1); the result is one
# element shorter than train_val_gpa. The transformer is handed a copy of the array.
processed_train_val_gpa = calculate_delta.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([ -755208065,   100450304, -1007706112, ...,       -4096,
             -4096,       -4096], dtype=int64)

In [13]:
# Same delta for rip. The input is uint64, so CalculateDelta returns the differences
# as byte strings; a negative difference appears wrapped around modulo 2**64
# (e.g. b'18446744073709551604' stands for -12).
processed_train_val_rip = calculate_delta.transform(train_val_rip)
processed_train_val_rip

array([b'18446744073706810408', b'0', b'0', ..., b'0', b'0', b'0'],
      dtype='|S20')

In [14]:
# How often each distinct delta value occurs, most frequent first, for gpa and rip.
gpa_delta_counts = pd.Series(processed_train_val_gpa).value_counts()
rip_delta_counts = pd.Series(processed_train_val_rip).value_counts()
gpa_delta_counts, rip_delta_counts

(-4096          1184303
  0              602811
 -1638400         67622
 -12288           51913
 -1896448         30291
                 ...   
  4841376             1
  113967044           1
 -1568910655          1
  676028400           1
 -7845624             1
 Length: 849143, dtype: int64,
 b'0'                       3829673
 b'18446744073709551604'      67421
 b'18446744073709551579'      42191
 b'18446744073709551605'      38328
 b'37'                        37913
                             ...   
 b'74469492552'                   1
 b'18446649989132988124'          1
 b'34746222'                      1
 b'18446743924312887999'          1
 b'93973016140542'                1
 Length: 67054, dtype: int64)

In [15]:
# Histogram of the frequencies above:
#   index = occurrence count, value = number of distinct delta values (categories)
#   that occur exactly that many times.
gpa_occurrence_hist = pd.Series(processed_train_val_gpa).value_counts().value_counts()
rip_occurrence_hist = pd.Series(processed_train_val_rip).value_counts().value_counts()
gpa_occurrence_hist, rip_occurrence_hist

(1       487890
 2       146278
 3        75690
 4        39983
 5        21499
          ...  
 2349         1
 1326         1
 304          1
 306          1
 2048         1
 Length: 615, dtype: int64,
 1       51034
 2        6241
 3        2358
 4        1389
 5         881
         ...  
 614         1
 742         1
 7973        1
 1830        1
 607         1
 Length: 666, dtype: int64)

## gpa : N / P ratio

In [16]:
# Threshold analysis for the gpa deltas.
# For each threshold t in 0..49, every delta value (category) that occurs at most t
# times is removed from the category count and one extra category is added
# (presumably a shared bucket for the removed ones - confirm). Columns:
#   threshold  - t
#   p          - number of categories left (for t = 0 nothing is removed or added)
#   p'         - decrease of p compared with the previous threshold
#   n/p ratio  - number of samples divided by p
#   upper limit for train/val accuracy - share of all samples that belong to the
#                removed categories
#   accuracy'  - increase of that share compared with the previous threshold
# NOTE(review): the label "upper limit for train/val accuracy" reads like an accuracy
# bound, but the value is the share of samples in the removed categories - confirm.
gpa_thresholds = range(50)
gpa_sample_count = processed_train_val_gpa.shape[0]
gpa_category_counts = pd.Series(processed_train_val_gpa).value_counts()
gpa_total_categories = len(gpa_category_counts)

# index = occurrence count k, value = number of categories occurring exactly k times.
# reindex(..., fill_value=0) supplies a 0 for occurrence counts that never appear;
# selecting absent labels with .loc[list] raises KeyError on pandas >= 1.0.
gpa_occurrence_hist = gpa_category_counts.value_counts().reindex(gpa_thresholds[1:], fill_value=0)
# Running totals over k = 1..t, computed once instead of recounting in every iteration.
gpa_removed_categories = gpa_occurrence_hist.cumsum()
gpa_removed_samples = (gpa_occurrence_hist * gpa_occurrence_hist.index.to_numpy()).cumsum()

gpa_data = []
previous_gpa_p = gpa_total_categories
previous_accuracy = 0
for gpa_threshold in gpa_thresholds:
    if gpa_threshold == 0:
        gpa_p = gpa_total_categories
        accuracy = 0.0
    else:
        gpa_p = gpa_total_categories - int(gpa_removed_categories.loc[gpa_threshold]) + 1
        accuracy = int(gpa_removed_samples.loc[gpa_threshold]) / gpa_sample_count

    gpa_data.append([
        gpa_threshold,
        gpa_p,
        previous_gpa_p - gpa_p,
        gpa_sample_count / gpa_p,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_gpa_p = gpa_p
    previous_accuracy = accuracy

df_gpa_threshold = pd.DataFrame(data=gpa_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [17]:
# Show the per-threshold summary table for the gpa deltas.
df_gpa_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,849143,0,6.496405,0.0,0.0
1,1,361254,487889,15.270079,0.088444,0.088444
2,2,214976,146278,25.660432,0.141478,0.053034
3,3,139286,75690,39.604677,0.182641,0.041163
4,4,99303,39983,55.55096,0.211633,0.028992
5,5,77804,21499,70.900943,0.23112,0.019487
6,6,65202,12602,84.604414,0.244826,0.013707
7,7,57228,7974,96.392972,0.254945,0.010119
8,8,51686,5542,106.72865,0.262982,0.008037
9,9,47468,4218,116.212543,0.269864,0.006882


## rip : N / P ratio

In [18]:
# Threshold analysis for the rip deltas.
# For each threshold t in 0..49, every delta value (category) that occurs at most t
# times is removed from the category count and one extra category is added
# (presumably a shared bucket for the removed ones - confirm). Columns:
#   threshold  - t
#   p          - number of categories left (for t = 0 nothing is removed or added)
#   p'         - decrease of p compared with the previous threshold
#   n/p ratio  - number of samples divided by p
#   upper limit for train/val accuracy - share of all samples that belong to the
#                removed categories
#   accuracy'  - increase of that share compared with the previous threshold
# NOTE(review): the label "upper limit for train/val accuracy" reads like an accuracy
# bound, but the value is the share of samples in the removed categories - confirm.
rip_thresholds = range(50)
rip_sample_count = processed_train_val_rip.shape[0]
rip_category_counts = pd.Series(processed_train_val_rip).value_counts()
rip_total_categories = len(rip_category_counts)

# index = occurrence count k, value = number of categories occurring exactly k times.
# reindex(..., fill_value=0) supplies a 0 for occurrence counts that never appear;
# selecting absent labels with .loc[list] raises KeyError on pandas >= 1.0.
rip_occurrence_hist = rip_category_counts.value_counts().reindex(rip_thresholds[1:], fill_value=0)
# Running totals over k = 1..t, computed once instead of recounting in every iteration.
rip_removed_categories = rip_occurrence_hist.cumsum()
rip_removed_samples = (rip_occurrence_hist * rip_occurrence_hist.index.to_numpy()).cumsum()

rip_data = []
previous_rip_p = rip_total_categories
previous_accuracy = 0
for rip_threshold in rip_thresholds:
    if rip_threshold == 0:
        rip_p = rip_total_categories
        accuracy = 0.0
    else:
        rip_p = rip_total_categories - int(rip_removed_categories.loc[rip_threshold]) + 1
        accuracy = int(rip_removed_samples.loc[rip_threshold]) / rip_sample_count

    rip_data.append([
        rip_threshold,
        rip_p,
        previous_rip_p - rip_p,
        rip_sample_count / rip_p,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_rip_p = rip_p
    previous_accuracy = accuracy

df_rip_threshold = pd.DataFrame(data=rip_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [19]:
# Show the per-threshold summary table for the rip deltas.
df_rip_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,67054,0,82.26768,0.0,0.0
1,1,16021,51033,344.32164,0.009251,0.009251
2,2,9780,6241,564.046728,0.011514,0.002263
3,3,7422,2358,743.246699,0.012796,0.001282
4,4,6033,1389,914.367147,0.013804,0.001007
5,5,5152,881,1070.725349,0.014602,0.000799
6,6,4500,652,1225.861556,0.015311,0.000709
7,7,3956,544,1394.433013,0.016002,0.00069
8,8,3515,441,1569.381792,0.016641,0.00064
9,9,3190,325,1729.271787,0.017171,0.00053


## Save Analysis 

In [21]:
# Write both threshold tables as CSV (without the index column) into ./analysis.
# The directory is created first because to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs("analysis", exist_ok=True)
df_gpa_threshold.to_csv(f"analysis/{model_name}_gpa_analysis.csv", index=False)
df_rip_threshold.to_csv(f"analysis/{model_name}_rip_analysis.csv", index=False)