In [1]:
import json
import os

import dill         # 0.3.2
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [2]:
# Name of the workload being analysed; used as the filename prefix of the analysis CSVs saved at the end.
model_name = "STREAM"

## Load Dataset/Static Param List

In [3]:
# Read the ten STREAM log CSVs (file numbers 10 down to 1) and stack them row-wise.
# Every column is loaded as a generic Python object; the numeric casts happen later,
# once the columns of interest have been selected.
# `dtype=object` replaces `np.object`, an alias that was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (the builtin `object` is what the alias always pointed to).
original_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/STREAM/STREAM/stream_4034_generic2_{}.csv".format(i), dtype=object)
        for i in reversed(range(1, 11))
    ],
    axis=0,
)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[336633.698810],04:01:00:854206,PF,251797631,18446744072449302655,4034
1,[336633.739463],04:01:00:894857,PF,1007005696,18446744072452043863,4034
2,[336633.867033],04:01:01:022428,PF,906555392,18446744072452043863,4034
3,[336633.901503],04:01:01:056898,PF,1914261504,18446744072452043863,4034
4,[336633.904886],04:01:01:060282,PF,259330048,18446744072452043863,4034
...,...,...,...,...,...,...
180309,[ 1806.225725],07:00:33:381360,PF,602140104,139790387006578,4034
180310,[ 1806.225827],07:00:33:381465,PF,597469140,18446744072443263295,4034
180311,[ 1806.240176],07:00:33:395810,PF,596315344,18446744072442219334,4034
180312,[ 1806.240198],07:00:33:395836,PF,597544452,18446744072441579461,4034


In [4]:
# Keep only the two columns analysed below and drop rows where either one is missing.
# rip is used for the PCs (presumably program counters -- confirm).
original_dataset = original_dataset[["gpa", "rip"]].dropna()
original_dataset

Unnamed: 0,gpa,rip
0,251797631,18446744072449302655
1,1007005696,18446744072452043863
2,906555392,18446744072452043863
3,1914261504,18446744072452043863
4,259330048,18446744072452043863
...,...,...
180309,602140104,139790387006578
180310,597469140,18446744072443263295
180311,596315344,18446744072442219334
180312,597544452,18446744072441579461


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer producing the difference between neighbouring elements.

    For an input of length ``n`` the result has length ``n - 1`` and element ``i``
    equals ``X[i] - X[i + 1]`` (each element minus the one that follows it).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the neighbour differences of ``X``.

        Parameters
        ----------
        X : array-like
            Sequence of numeric values. It is converted with ``np.asarray`` first.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``X[:-1] - X[1:]``. When the result has an unsigned integer dtype it is
            returned as a byte-string array (``np.bytes_``) holding the decimal text
            of each (wrapped-around) difference.
        """
        # Subtract positionally. A pandas Series would otherwise align the two slices
        # on their index labels and yield NaN/0 instead of neighbour differences.
        X = np.asarray(X)
        X_transformed = X[:-1] - X[1:]
        # In case of unsigned types, change its type to string type.
        # np.bytes_ is the same type the removed (NumPy 2.0) alias np.string_ referred to.
        if X_transformed.dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Identity: returns ``X`` as-is (the delta is not actually undone)."""
        return X

## Process Train/Validation Dataset

In [6]:
# Train / Val / Test Ratio : 70% / 15% / 15%
train_val_set, test_set = train_test_split(original_dataset, test_size=0.15, shuffle=False)
#train_set, val_set = train_test_split(train_val_set, test_size=0.2, shuffle=False)

In [7]:
# Convert the object (string) columns to NumPy integer arrays.
# gpa is cast to signed int64; its deltas are later computed as signed values.
train_val_gpa = train_val_set["gpa"].values.astype(np.int64)
# rip holds values above the int64 maximum (e.g. 18446744072449302655), so it needs uint64.
train_val_rip = train_val_set["rip"].values.astype(np.uint64)

test_gpa = test_set["gpa"].values.astype(np.int64)
test_rip = test_set["rip"].values.astype(np.uint64)      # same reason as train_val_rip: values exceed int64

In [8]:
# CalculateDelta keeps no fitted state, so this one instance is reused for both gpa and rip.
calculate_delta = CalculateDelta()

In [9]:
# Neighbour differences of gpa (element i minus element i+1); a copy is passed in,
# and the result is one element shorter than the input.
processed_train_val_gpa = calculate_delta.fit_transform(train_val_gpa.copy())
processed_train_val_gpa

array([ -755208065,   100450304, -1007706112, ...,   -31219712,
          -8601600,   507019264], dtype=int64)

In [10]:
# Neighbour differences of rip. The input is uint64, so CalculateDelta returns the
# differences as byte strings rather than as numbers.
processed_train_val_rip = calculate_delta.transform(train_val_rip)
processed_train_val_rip

array([b'18446744073706810408', b'0', b'0', ..., b'0', b'0', b'0'],
      dtype='|S20')

In [11]:
# How often each distinct delta value occurs, for gpa and rip respectively (most frequent first).
pd.Series(processed_train_val_gpa).value_counts(), pd.Series(processed_train_val_rip).value_counts()

(-4096          713760
  0             405516
 -12288          36270
 -8192           21875
 -643931498       9063
                 ...  
  860061696          1
  742146048          1
 -425086271          1
 -1042427904         1
  3033688            1
 Length: 674207, dtype: int64,
 b'0'                       2591789
 b'18446744073709551579'      34063
 b'37'                        32504
 b'18446744073709551612'      28812
 b'4'                         28736
                             ...   
 b'139723050761760'               1
 b'18446744073708997824'          1
 b'18446744073709547792'          1
 b'282921'                        1
 b'6484272'                       1
 Length: 49547, dtype: int64)

In [12]:
# Count-of-counts for gpa and rip.
# Index = number of occurrences | Value = number of distinct delta values (categories) occurring that many times
pd.Series(processed_train_val_gpa).value_counts().value_counts(), pd.Series(processed_train_val_rip).value_counts().value_counts()

(1      385925
 2      129019
 3       58791
 4       27044
 5       13694
         ...  
 735         1
 223         1
 204         1
 203         1
 995         1
 Length: 264, dtype: int64,
 1        37698
 2         4610
 3         1890
 4         1120
 5          703
          ...  
 5485         1
 207          1
 251          1
 4377         1
 34063        1
 Length: 375, dtype: int64)

## gpa : N / P ratio

In [13]:
# Threshold analysis for the gpa deltas: for each threshold t in 0..49, report how many
# categories (distinct delta values) remain when every category occurring at most t times
# is removed, plus the resulting samples-per-category ratio.

# Loop-invariant statistics, computed once instead of on every iteration.
gpa_value_counts = pd.Series(processed_train_val_gpa).value_counts()     # occurrences of each distinct delta
gpa_count_of_counts = gpa_value_counts.value_counts()                    # index: occurrences, value: number of categories
gpa_n_samples = processed_train_val_gpa.shape[0]
gpa_n_categories = len(gpa_value_counts)

gpa_data = []
previous_gpa_p = gpa_n_categories
previous_ratio = 0
previous_accuracy = 0
for gpa_threshold in range(50):
    # Categories occurring 1..threshold times. reindex() treats an occurrence count that
    # never appears as 0 categories, whereas .loc[list] raises KeyError on a missing label.
    rare = gpa_count_of_counts.reindex(range(1, gpa_threshold + 1), fill_value=0)

    if gpa_threshold == 0:
        gpa_p = gpa_n_categories
    else:
        # +1: presumably the single catch-all category that stands in for the removed ones.
        gpa_p = gpa_n_categories - int(rare.values.sum()) + 1
    ratio = gpa_n_samples / gpa_p
    # Share of all samples that belong to the rare categories
    # (occurrences x number of categories, summed, over the sample count).
    # NOTE(review): the column is labelled "upper limit for train/val accuracy" -- confirm
    # that this share is the intended quantity; the label is kept for compatibility.
    accuracy = int((rare.index.values * rare.values).sum()) / gpa_n_samples

    gpa_data.append([
        gpa_threshold,
        gpa_p,
        previous_gpa_p - gpa_p,
        ratio,
        ratio - previous_ratio,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_gpa_p = gpa_p
    previous_ratio = ratio
    previous_accuracy = accuracy

df_gpa_threshold = pd.DataFrame(data=gpa_data, columns=["threshold", "p", "p'", "n/p ratio", "ratio'", "upper limit for train/val accuracy", "accuracy'"])

In [14]:
# Show the gpa threshold table (one row per threshold).
df_gpa_threshold

Unnamed: 0,threshold,p,p',n/p ratio,ratio',upper limit for train/val accuracy,accuracy'
0,0,674207,0,5.24088,5.24088,0.0,0.0
1,1,288283,385924,12.256838,7.015958,0.109221,0.109221
2,2,159264,129019,22.186043,9.929205,0.182248,0.073027
3,3,100473,58791,35.168035,12.981992,0.232164,0.049915
4,4,73429,27044,48.12047,12.952435,0.262779,0.030615
5,5,59735,13694,59.151888,11.031417,0.282156,0.019378
6,6,51757,7978,68.269761,9.117873,0.295704,0.013547
7,7,46554,5203,75.899772,7.630012,0.306011,0.010308
8,8,42579,3975,82.985462,7.08569,0.315011,0.009
9,9,39380,3199,89.726714,6.741252,0.323159,0.008148


## rip : N / P ratio

In [15]:
# Threshold analysis for the rip deltas: for each threshold t in 0..49, report how many
# categories (distinct delta values) remain when every category occurring at most t times
# is removed, plus the resulting samples-per-category ratio.

# Loop-invariant statistics, computed once instead of on every iteration.
rip_value_counts = pd.Series(processed_train_val_rip).value_counts()     # occurrences of each distinct delta
rip_count_of_counts = rip_value_counts.value_counts()                    # index: occurrences, value: number of categories
rip_n_samples = processed_train_val_rip.shape[0]
rip_n_categories = len(rip_value_counts)

rip_data = []
previous_rip_p = rip_n_categories
previous_ratio = 0
previous_accuracy = 0
for rip_threshold in range(50):
    # Categories occurring 1..threshold times. reindex() treats an occurrence count that
    # never appears as 0 categories, whereas .loc[list] raises KeyError on a missing label.
    rare = rip_count_of_counts.reindex(range(1, rip_threshold + 1), fill_value=0)

    if rip_threshold == 0:
        rip_p = rip_n_categories
    else:
        # +1: presumably the single catch-all category that stands in for the removed ones.
        rip_p = rip_n_categories - int(rare.values.sum()) + 1
    ratio = rip_n_samples / rip_p
    # Share of all samples that belong to the rare categories
    # (occurrences x number of categories, summed, over the sample count).
    # NOTE(review): the column is labelled "upper limit for train/val accuracy" -- confirm
    # that this share is the intended quantity; the label is kept for compatibility.
    accuracy = int((rare.index.values * rare.values).sum()) / rip_n_samples

    rip_data.append([
        rip_threshold,
        rip_p,
        previous_rip_p - rip_p,
        ratio,
        ratio - previous_ratio,
        accuracy,
        accuracy - previous_accuracy,
    ])
    previous_rip_p = rip_p
    previous_ratio = ratio
    previous_accuracy = accuracy

df_rip_threshold = pd.DataFrame(data=rip_data, columns=["threshold", "p", "p'", "n/p ratio", "ratio'", "upper limit for train/val accuracy", "accuracy'"])

In [16]:
# Show the rip threshold table (one row per threshold).
df_rip_threshold

Unnamed: 0,threshold,p,p',n/p ratio,ratio',upper limit for train/val accuracy,accuracy'
0,0,49547,0,71.314873,71.314873,0.0,0.0
1,1,11850,37697,298.180422,226.865549,0.010669,0.010669
2,2,7240,4610,488.043923,189.863501,0.013278,0.002609
3,3,5350,1890,660.455701,172.411778,0.014883,0.001605
4,4,4230,1120,835.328132,174.872431,0.016151,0.001268
5,5,3527,703,1001.825347,166.497215,0.017146,0.000995
6,6,2977,550,1186.912328,185.086981,0.01808,0.000934
7,7,2538,439,1392.213554,205.301226,0.018949,0.00087
8,8,2215,323,1595.231603,203.018049,0.019681,0.000731
9,9,1959,256,1803.694742,208.46314,0.020333,0.000652


## Save Analysis 

In [17]:
# Write both threshold tables to analysis/<model_name>_{gpa,rip}_analysis.csv.
# The directory is created first: to_csv() does not create missing parent directories,
# so the export would otherwise fail on a checkout where analysis/ does not exist yet.
os.makedirs("analysis", exist_ok=True)
df_gpa_threshold.to_csv("analysis/{}_gpa_analysis.csv".format(model_name), index=False)
df_rip_threshold.to_csv("analysis/{}_rip_analysis.csv".format(model_name), index=False)