In [1]:
import numpy as np
import pandas as pd
import json
import dill         # 0.3.2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

## Set Global/Environment Variables

In [2]:
# Identifier of this analysis run; used as the file-name prefix of the CSV files
# written at the end of the notebook.
model_name = "GEMM_STREAM"

## Load Dataset/Static Param List

In [3]:
# Read the nine trace files (suffixes 1..9) and stack them row-wise.
# dtype=object keeps every column unparsed; the numeric conversion of gpa/rip is
# done explicitly later. The builtin `object` replaces the `np.object` alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
# Row labels are not renumbered (ignore_index is not set), so each file's labels
# restart from 0 in the concatenated frame.
original_dataset = pd.concat(
    [
        pd.read_csv(
            "../로그 데이터/GEMM_STREAM/GEMM_STREAM/gemm_stream_generic_generic2_{}.csv".format(i),
            dtype=object,
        )
        for i in range(1, 10)
    ],
    axis=0,
)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[861367.505406],05:46:34:647695,PF,988827648,18446744072452043863,4034
1,[861367.505455],05:46:34:647744,PF,705122304,18446744072452043863,4034
2,[861367.505459],05:46:34:647748,PF,705122304,18446744072452043863,4034
3,[861367.505464],05:46:34:647754,PF,1227321344,18446744072452043863,4034
4,[861367.505468],05:46:34:647757,PF,1227321344,18446744072452043863,4034
...,...,...,...,...,...,...
407242,[957446.819051],08:27:53:959834,PF,20453990400,18446744072506569815,17926
407243,[957446.865342],08:27:54:006124,PF,20453994496,18446744072506569815,17926
407244,[957446.865354],08:27:54:006138,PF,20453998592,18446744072506569815,17926
407245,[957446.865359],08:27:54:006143,PF,20454002688,18446744072506569815,17926


In [4]:
# Keep only the two columns analysed below (`rip` serves as the program counter)
# and discard every row where either of them is missing.
original_dataset = original_dataset.loc[:, ["gpa", "rip"]].dropna()
original_dataset

Unnamed: 0,gpa,rip
0,988827648,18446744072452043863
1,705122304,18446744072452043863
2,705122304,18446744072452043863
3,1227321344,18446744072452043863
4,1227321344,18446744072452043863
...,...,...
407242,20453990400,18446744072506569815
407243,20453994496,18446744072506569815
407244,20453998592,18446744072506569815
407245,20454002688,18446744072506569815


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that turns a sequence into consecutive differences.

    Element ``i`` of the output is ``X[i] - X[i + 1]`` (current minus next), so
    the result is one element shorter than the input and a monotonically
    increasing input yields negative deltas.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the differences between neighbouring elements of ``X``.

        Parameters
        ----------
        X : array-like
            Values to difference along the first axis.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``X[:-1] - X[1:]``. When the result has an unsigned integer dtype it
            is converted to a byte-string array, so each (wrapped) difference
            becomes a categorical label instead of a number.
        """
        # Work on a plain ndarray so the subtraction is positional; a pandas
        # object would align the two slices on their index labels instead.
        X = np.asarray(X)
        X_transformed = X[:-1] - X[1:]
        # Unsigned subtraction cannot represent negative results (it wraps
        # around), so expose the values as byte strings. np.bytes_ is the
        # spelling that exists in both NumPy 1.x and 2.x (np.string_ was removed
        # in NumPy 2.0).
        if np.issubdtype(X_transformed.dtype, np.unsignedinteger):
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Identity placeholder: returns ``X`` unchanged (deltas are not undone)."""
        return X

## Process Train/Validation Dataset

In [6]:
# Hold out the last 15% of the rows as the test set; shuffle=False keeps the rows
# in their original order. The remaining 85% stays combined as train+validation.
# NOTE(review): the ratio was originally documented as 70% / 15% / 15%, but the
# train/validation split below is disabled, and test_size=0.2 applied to the 85%
# would give 68% / 17% / 15% rather than 70% / 15% / 15%.
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [7]:
# gpa is converted to signed 64-bit integers for both splits.
train_val_gpa, test_gpa = (
    subset["gpa"].values.astype(np.int64) for subset in (train_val_set, test_set)
)

# rip holds values above the int64 maximum (e.g. 18446744072452043863), so it is
# converted to unsigned 64-bit integers instead.
train_val_rip, test_rip = (
    subset["rip"].values.astype(np.uint64) for subset in (train_val_set, test_set)
)

In [8]:
# One transformer instance is shared by the gpa and rip series; CalculateDelta
# keeps no fitted state, so reusing it is safe.
calculate_delta = CalculateDelta()

In [9]:
# Differences between consecutive gpa values of the train+validation split.
# CalculateDelta.fit() is a no-op, so fit_transform() yields the same result as
# transform(); the transformer receives a copy of the array.
processed_train_val_gpa = calculate_delta.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([ 283705344,          0, -522199040, ...,  202375168,          0,
       -143433728], dtype=int64)

In [10]:
# Differences between consecutive rip values. The input is uint64, so
# CalculateDelta returns the differences as byte strings; a negative difference
# shows up in its wrapped (modulo 2**64) form, e.g. b'18446744073709551604' is -12.
processed_train_val_rip = calculate_delta.transform(train_val_rip)
processed_train_val_rip

array([b'0', b'0', b'0', ..., b'0', b'0', b'0'], dtype='|S20')

In [11]:
# Number of occurrences of every distinct delta value, shown as (gpa, rip).
tuple(
    pd.Series(deltas).value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

(-4096          2279684
  0             1414252
 -1638400        169636
 -1896448         72617
 -8192            41617
                 ...   
 -3494043644          1
 -490897392           1
 -7745351680          1
  4063200             1
  1425715200          1
 Length: 1124931, dtype: int64,
 b'0'                       5424746
 b'18446744073709551604'     241034
 b'18446744073709551605'     131801
 b'35'                       114156
 b'12'                        85491
                             ...   
 b'18446744073574749381'          1
 b'45586976695614'                1
 b'18446744073709196551'          1
 b'18446744073709542812'          1
 b'865040'                        1
 Length: 63512, dtype: int64)

In [12]:
# Histogram of the occurrence counts, shown as (gpa, rip):
#   index = how many times a delta value occurs
#   value = how many distinct delta values occur exactly that many times
tuple(
    pd.Series(deltas).value_counts().value_counts()
    for deltas in (processed_train_val_gpa, processed_train_val_rip)
)

(1        865165
 2        114501
 3         44038
 4         23992
 5         15488
           ...  
 310           1
 13616         1
 6452          1
 311           1
 1019          1
 Length: 869, dtype: int64,
 1          46006
 2           6986
 3           2698
 4           1480
 5            983
            ...  
 1074           1
 5424746        1
 1650           1
 2002           1
 303            1
 Length: 608, dtype: int64)

## gpa : N / P ratio

In [13]:
# Per-threshold statistics for the gpa deltas. For each threshold t, every delta
# value that occurs at most t times is counted as folded into one extra category.
#
# The two value_counts() passes are loop-invariant and are computed once here
# instead of several times per iteration.
#   gpa_category_counts : delta value -> number of occurrences
#   gpa_occurrence_hist : number of occurrences -> number of delta values seen that often
gpa_category_counts = pd.Series(processed_train_val_gpa).value_counts()
gpa_occurrence_hist = gpa_category_counts.value_counts()
gpa_n = processed_train_val_gpa.shape[0]

gpa_data = []
previous_gpa_p = len(gpa_category_counts)   # makes p' equal to 0 for threshold 0
previous_accuracy = 0
for gpa_threshold in range(30):
    # Occurrence counts in 1..gpa_threshold (counts are always >= 1). A boolean
    # mask is used instead of .loc[list(range(...))] so that an occurrence count
    # that never appears in the data does not raise KeyError.
    rare_hist = gpa_occurrence_hist[gpa_occurrence_hist.index <= gpa_threshold]

    if gpa_threshold == 0:
        gpa_p = len(gpa_category_counts)
    else:
        # Remove the folded categories and add one category that replaces them.
        gpa_p = len(gpa_category_counts) - rare_hist.values.sum() + 1

    # Fraction of all samples that belong to the folded (rare) categories.
    accuracy = (rare_hist.values * rare_hist.index.values).sum() / gpa_n

    gpa_data.append([
        gpa_threshold,
        gpa_p,
        previous_gpa_p - gpa_p,          # categories removed relative to the previous threshold
        gpa_n / gpa_p,                   # average number of samples per category
        accuracy,
        accuracy - previous_accuracy,    # increase relative to the previous threshold
    ])
    previous_gpa_p = gpa_p
    previous_accuracy = accuracy

df_gpa_threshold = pd.DataFrame(data=gpa_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [14]:
# Show the per-threshold table for the gpa deltas.
df_gpa_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,1124931,0,6.899855,0.0,0.0
1,1,259767,865164,29.880089,0.111464,0.111464
2,2,145266,114501,53.432056,0.140967,0.029503
3,3,101228,44038,76.677016,0.157988,0.017021
4,4,77236,23992,100.495378,0.170352,0.012364
5,5,61748,15488,125.702225,0.180329,0.009977
6,6,50455,11293,153.837301,0.189059,0.00873
7,7,41906,8549,185.220756,0.196769,0.00771
8,8,35364,6542,219.484815,0.203511,0.006743
9,9,30289,5075,256.260061,0.209396,0.005885


## rip : N / P ratio

In [15]:
# Per-threshold statistics for the rip deltas. For each threshold t, every delta
# value that occurs at most t times is counted as folded into one extra category.
#
# The two value_counts() passes are loop-invariant and are computed once here
# instead of several times per iteration.
#   rip_category_counts : delta value -> number of occurrences
#   rip_occurrence_hist : number of occurrences -> number of delta values seen that often
rip_category_counts = pd.Series(processed_train_val_rip).value_counts()
rip_occurrence_hist = rip_category_counts.value_counts()
rip_n = processed_train_val_rip.shape[0]

rip_data = []
previous_rip_p = len(rip_category_counts)   # makes p' equal to 0 for threshold 0
previous_accuracy = 0
for rip_threshold in range(30):
    # Occurrence counts in 1..rip_threshold (counts are always >= 1). A boolean
    # mask is used instead of .loc[list(range(...))] so that an occurrence count
    # that never appears in the data does not raise KeyError.
    rare_hist = rip_occurrence_hist[rip_occurrence_hist.index <= rip_threshold]

    if rip_threshold == 0:
        rip_p = len(rip_category_counts)
    else:
        # Remove the folded categories and add one category that replaces them.
        rip_p = len(rip_category_counts) - rare_hist.values.sum() + 1

    # Fraction of all samples that belong to the folded (rare) categories.
    accuracy = (rare_hist.values * rare_hist.index.values).sum() / rip_n

    rip_data.append([
        rip_threshold,
        rip_p,
        previous_rip_p - rip_p,          # categories removed relative to the previous threshold
        rip_n / rip_p,                   # average number of samples per category
        accuracy,
        accuracy - previous_accuracy,    # increase relative to the previous threshold
    ])
    previous_rip_p = rip_p
    previous_accuracy = accuracy

df_rip_threshold = pd.DataFrame(data=rip_data, columns=["threshold", "p", "p'", "n/p ratio", "upper limit for train/val accuracy", "accuracy'"])

In [16]:
# Show the per-threshold table for the rip deltas.
df_rip_threshold

Unnamed: 0,threshold,p,p',n/p ratio,upper limit for train/val accuracy,accuracy'
0,0,63512,0,122.210937,0.0,0.0
1,1,17507,46005,443.357571,0.005927,0.005927
2,2,10521,6986,737.749358,0.007727,0.0018
3,3,7823,2698,992.184712,0.00877,0.001043
4,4,6343,1480,1223.689264,0.009533,0.000763
5,5,5360,983,1448.108396,0.010166,0.000633
6,6,4661,699,1665.278052,0.010706,0.00054
7,7,4174,487,1859.57379,0.011146,0.000439
8,8,3800,374,2042.595,0.011531,0.000385
9,9,3490,310,2224.02894,0.01189,0.000359


## Save Analysis 

In [17]:
# Write both threshold tables to CSV without the row index; the file names are
# prefixed with model_name.
# NOTE(review): to_csv does not create missing directories, so "analysis/"
# presumably has to exist before this cell runs — confirm.
df_gpa_threshold.to_csv("analysis/{}_gpa_analysis.csv".format(model_name), index=False)
df_rip_threshold.to_csv("analysis/{}_rip_analysis.csv".format(model_name), index=False)