In [1]:
import json
import os

import dill         # 0.3.2
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Set Global/Environment Variables

In [2]:
# Used as the file-name prefix of the dataset CSVs written by the export loops below.
model_name = "STREAM"

## Load Dataset/Static Param List

In [3]:
# Read the ten log CSVs (file suffix 10 down to 1) and stack them row-wise.
# Every column is loaded with dtype=object, i.e. values stay as the raw text
# from the file; numeric conversion happens later, per column.
# The builtin `object` is used instead of the `np.object` alias, which newer
# NumPy releases no longer provide.
original_dataset = pd.concat(
    [
        pd.read_csv("../로그 데이터/STREAM/STREAM/stream_4034_generic2_{}.csv".format(i), dtype=object)
        for i in reversed(range(1, 11))
    ],
    axis=0,
)
original_dataset

Unnamed: 0,extra,time,address_type,gpa,rip,vmid
0,[336633.698810],04:01:00:854206,PF,251797631,18446744072449302655,4034
1,[336633.739463],04:01:00:894857,PF,1007005696,18446744072452043863,4034
2,[336633.867033],04:01:01:022428,PF,906555392,18446744072452043863,4034
3,[336633.901503],04:01:01:056898,PF,1914261504,18446744072452043863,4034
4,[336633.904886],04:01:01:060282,PF,259330048,18446744072452043863,4034
...,...,...,...,...,...,...
180309,[ 1806.225725],07:00:33:381360,PF,602140104,139790387006578,4034
180310,[ 1806.225827],07:00:33:381465,PF,597469140,18446744072443263295,4034
180311,[ 1806.240176],07:00:33:395810,PF,596315344,18446744072442219334,4034
180312,[ 1806.240198],07:00:33:395836,PF,597544452,18446744072441579461,4034


In [4]:
# Keep only the two columns used downstream and drop rows where either is missing.
# rip is kept for the PCs (presumably program-counter values -- TODO confirm).
original_dataset = (
    original_dataset
    .loc[:, ["gpa", "rip"]]
    .dropna()
)
original_dataset

Unnamed: 0,gpa,rip
0,251797631,18446744072449302655
1,1007005696,18446744072452043863
2,906555392,18446744072452043863
3,1914261504,18446744072452043863
4,259330048,18446744072452043863
...,...,...
180309,602140104,139790387006578
180310,597469140,18446744072443263295
180311,596315344,18446744072442219334
180312,597544452,18446744072441579461


## Dataset Processing Functions

In [5]:
class CalculateDelta(TransformerMixin):
    """Stateless transformer that replaces a 1-D sequence by its consecutive differences.

    ``transform`` returns ``X[:-1] - X[1:]`` (each element minus its successor),
    so the output is one element shorter than the input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface.

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """Compute ``X[:-1] - X[1:]``.

        Args:
            X: 1-D array supporting slicing and element-wise subtraction.
            y: Ignored.

        Returns:
            The differences. When the result has an unsigned integer dtype it is
            converted to a byte-string array; otherwise it is returned as computed.
        """
        X_transformed = X[:-1] - X[1:]
        # In case of unsigned types, change its type to a (byte) string type so
        # the values are handled as opaque category labels by the later steps.
        # NOTE(review): unsigned NumPy subtraction wraps around instead of going
        # negative, so a "negative" delta appears here as a large positive number.
        # `dtype.kind == "u"` covers every unsigned integer width, and `np.bytes_`
        # is the type the older `np.string_` alias referred to.
        if X_transformed.dtype.kind == "u":
            X_transformed = X_transformed.astype(np.bytes_)
        return X_transformed

    def inverse_transform(self, X, y=None):     # Just for test_pipeline.inverse_transform()
        """Return ``X`` unchanged; the original sequence is not reconstructed."""
        return X

In [6]:
class NoiseTokenizer(TransformerMixin):
    """Replace rarely occurring categories by a single out-of-vocabulary (OOV) token.

    A category counts as noise when it occurs at most
    ``minimum_category_occurence`` times in the data passed to ``fit``.

    Args:
        minimum_category_occurence: Inclusive upper bound on the occurrence count
            of a noise category (categories with ``count <= bound`` are replaced).
        oov_token: Value written in place of every noise category.

    NOTE(review): with the default ``oov_token=-1`` and signed integer input, a
    genuine value of -1 cannot be told apart from the OOV token.
    """

    def __init__(self, minimum_category_occurence=2, oov_token=-1):
        self.minimum_category_occurence = minimum_category_occurence
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Record which categories of ``X`` are noise. ``X`` itself is not modified.

        Args:
            X: 1-D array of categories.
            y: Ignored.

        Returns:
            self
        """
        counts = pd.Series(X).value_counts()
        is_noise = counts <= self.minimum_category_occurence
        # Categories to be replaced by the OOV token in transform().
        self.noise_categories_ = counts.index[is_noise.values]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every recorded noise category replaced.

        Values that were not seen during ``fit`` are left untouched.
        If ``fit`` was never called, or found no noise, ``X`` is returned as is.
        """
        noise_categories = getattr(self, "noise_categories_", None)
        if noise_categories is None or len(noise_categories) == 0:
            return X

        X_transformed = X.copy()
        noise_mask = np.isin(X_transformed, noise_categories)
        X_transformed[noise_mask] = self._oov_fill_value(X_transformed)
        return X_transformed

    def inverse_transform(self, X, y=None):
        """Return ``X`` unchanged; replaced categories cannot be recovered."""
        return X

    def _oov_fill_value(self, X):
        """Return the OOV token in a form that can be stored in ``X``."""
        # Byte-string arrays of any width have dtype kind "S"; they need the
        # token as bytes (e.g. -1 -> b"-1").
        if X.dtype.kind == "S":
            return str(self.oov_token).encode()
        return self.oov_token

In [7]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    """Encode categories as integer codes assigned in ``value_counts()`` order.

    After ``fit`` the instance exposes:
        vocab_size: number of distinct categories seen.
        word_index: pandas Index of the categories; position == code.
        vocabulary: dict mapping category -> code. For byte-string input the
            keys are the decoded ``str`` form of each category.

    Args:
        oov_token: Category whose code is used for values not seen during
            ``fit``. It must itself be present in the fitted data.
    """

    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Build the category -> code mapping from ``X``.

        Args:
            X: 1-D array of categories.
            y: Ignored.

        Returns:
            self
        """
        X_counts = pd.Series(X).value_counts()
        self.vocab_size = len(X_counts)
        self.word_index = X_counts.index

        if X.dtype.kind == "S":  # byte-string array of any width
            # Byte strings are bytes, not str, so they are decoded for the keys.
            self.vocabulary = {self.word_index[i].decode(): i for i in range(self.vocab_size)}
        else:
            self.vocabulary = {self.word_index[i]: i for i in range(self.vocab_size)}
        return self

    def transform(self, X, y=None):
        """Map every element of ``X`` to its integer code.

        Elements missing from the vocabulary get the code of ``oov_token``.

        Args:
            X: 1-D array of categories.
            y: Ignored.

        Returns:
            ``np.ndarray`` of codes, same length as ``X``.

        Raises:
            KeyError: If an unseen element is met and ``oov_token`` is not part
                of the fitted vocabulary.
        """
        is_bytes = X.dtype.kind == "S"
        # Vocabulary keys are str for byte-string input, so compare like with like.
        # The constructor parameter itself is left untouched.
        oov_key = str(self.oov_token) if is_bytes else self.oov_token
        oov_code = self.vocabulary.get(oov_key)

        X_transformed = []
        for item in X:
            key = item.decode() if is_bytes else item
            code = self.vocabulary.get(key, oov_code)
            if code is None:
                raise KeyError(
                    f"{key!r} is not in the vocabulary and the OOV token "
                    f"{oov_key!r} was not seen during fit"
                )
            X_transformed.append(code)

        return np.array(X_transformed)

    def inverse_transform(self, X, y=None):
        """Map integer codes back to the categories stored in ``word_index``."""
        return np.array([self.word_index[code] for code in X])

## Process Train/Validation Dataset

In [8]:
# Hold out the trailing 15% of rows as the test portion (shuffle=False keeps the
# original row order). The remaining 85% is split 80/20 into train/val in the
# export loops below, i.e. roughly 68% / 17% / 15% overall.
train_val_set, _ = train_test_split(original_dataset, shuffle=False, test_size=0.15)

# gpa is parsed as int64. rip holds values above the int64 maximum, so it is kept
# in a NumPy uint64 array.
train_val_gpa = train_val_set["gpa"].to_numpy().astype(np.int64)
train_val_rip = train_val_set["rip"].to_numpy().astype(np.uint64)

## Process Dataset per given Dataset Hyperparameters

In [9]:
# Sweep definition: each export loop iterates i over range(*<name>_threshold_range)
# and uses <name>_threshold_interval * i as NoiseTokenizer's minimum_category_occurence.
# gpa: i = 1..8 with interval 5 -> thresholds 5, 10, ..., 40
gpa_threshold_range = (1, 9)
gpa_threshold_interval = 5

# rip: i = 1..5 with interval 3 -> thresholds 3, 6, ..., 15
rip_threshold_range = (1, 6)
rip_threshold_interval = 3

In [10]:
# For every gpa noise threshold: fit the preprocessing pipeline on the train+val
# portion, split the result 80/20 (order preserved) and write the datasets,
# the fitted pipeline and its vocabulary to disk.
for gpa_threshold_i in range(*gpa_threshold_range):
    gpa_threshold = gpa_threshold_interval * gpa_threshold_i

    # Output locations for this threshold.
    gpa_data_dir = f"experiment/data/gpa/gpa_threshold={gpa_threshold}"
    gpa_static_dir = f"experiment/static/gpa/gpa_threshold={gpa_threshold}"

    # Make dirs. The static directory must be the one the pipeline/vocabulary are
    # written to below; exist_ok lets the cell be re-run without failing.
    os.makedirs(gpa_data_dir, exist_ok=True)
    os.makedirs(gpa_static_dir, exist_ok=True)

    gpa_train_pipeline = Pipeline([
        ('calculate_delta', CalculateDelta()),
        ('noise_tokenizer', NoiseTokenizer(minimum_category_occurence=gpa_threshold)),
        ('sparse_category_encoder', SparseCategoryEncoder())
    ])

    # Process (on a copy, so train_val_gpa stays intact for the raw split below)
    processed_train_val_gpa = gpa_train_pipeline.fit_transform(train_val_gpa.copy())

    # train/val split
    processed_train_gpa, processed_val_gpa = train_test_split(processed_train_val_gpa, test_size=0.2, shuffle=False)

    # train/val original split: n deltas are built from n + 1 raw values, so the
    # raw train slice has one extra element and the raw val slice starts at the
    # value shared by the last train delta.
    n_train_deltas = processed_train_gpa.shape[0]
    train_gpa = train_val_gpa[:n_train_deltas + 1]
    val_gpa = train_val_gpa[n_train_deltas:]

    # to dataframe
    processed_train_gpa = pd.DataFrame(processed_train_gpa, columns=["gpa"])
    processed_val_gpa = pd.DataFrame(processed_val_gpa, columns=["gpa"])

    train_gpa = pd.DataFrame(train_gpa, columns=["gpa"])
    val_gpa = pd.DataFrame(val_gpa, columns=["gpa"])

    # Save Dataset (without the DataFrame index column)
    processed_train_gpa.to_csv(f"{gpa_data_dir}/{model_name}_train_gpa.csv", index=False)
    processed_val_gpa.to_csv(f"{gpa_data_dir}/{model_name}_val_gpa.csv", index=False)

    train_gpa.to_csv(f"{gpa_data_dir}/{model_name}_train_gpa_original.csv", index=False)
    val_gpa.to_csv(f"{gpa_data_dir}/{model_name}_val_gpa_original.csv", index=False)

    # Save pipeline
    with open(f"{gpa_static_dir}/pipeline_gpa.pkl", 'wb') as f:
        dill.dump(gpa_train_pipeline, f)
    # Vocabulary keys are written one per line, in code order.
    np.savetxt(f"{gpa_static_dir}/vocabulary_gpa.csv", np.array(list(gpa_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%d", delimiter="\n")

In [10]:
# For every rip noise threshold: fit the preprocessing pipeline on the train+val
# portion, split the result 80/20 (order preserved) and write the datasets,
# the fitted pipeline and its vocabulary to disk.
for rip_threshold_i in range(*rip_threshold_range):
    rip_threshold = rip_threshold_interval * rip_threshold_i

    # Output locations for this threshold.
    rip_data_dir = f"experiment/data/rip/rip_threshold={rip_threshold}"
    rip_static_dir = f"experiment/static/rip/rip_threshold={rip_threshold}"

    # Make dirs. The static directory must be the one the pipeline/vocabulary are
    # written to below; exist_ok lets the cell be re-run without failing.
    os.makedirs(rip_data_dir, exist_ok=True)
    os.makedirs(rip_static_dir, exist_ok=True)

    rip_train_pipeline = Pipeline([
        ('calculate_delta', CalculateDelta()),
        ('noise_tokenizer', NoiseTokenizer(minimum_category_occurence=rip_threshold)),
        ('sparse_category_encoder', SparseCategoryEncoder())
    ])

    # Process (on a copy, so train_val_rip stays intact for the raw split below)
    processed_train_val_rip = rip_train_pipeline.fit_transform(train_val_rip.copy())

    # train/val split
    processed_train_rip, processed_val_rip = train_test_split(processed_train_val_rip, test_size=0.2, shuffle=False)

    # train/val original split: n deltas are built from n + 1 raw values, so the
    # raw train slice has one extra element and the raw val slice starts at the
    # value shared by the last train delta.
    n_train_deltas = processed_train_rip.shape[0]
    train_rip = train_val_rip[:n_train_deltas + 1]
    val_rip = train_val_rip[n_train_deltas:]

    # to dataframe
    processed_train_rip = pd.DataFrame(processed_train_rip, columns=["rip"])
    processed_val_rip = pd.DataFrame(processed_val_rip, columns=["rip"])

    train_rip = pd.DataFrame(train_rip, columns=["rip"])
    val_rip = pd.DataFrame(val_rip, columns=["rip"])

    # Save Dataset (without the DataFrame index column)
    processed_train_rip.to_csv(f"{rip_data_dir}/{model_name}_train_rip.csv", index=False)
    processed_val_rip.to_csv(f"{rip_data_dir}/{model_name}_val_rip.csv", index=False)

    train_rip.to_csv(f"{rip_data_dir}/{model_name}_train_rip_original.csv", index=False)
    val_rip.to_csv(f"{rip_data_dir}/{model_name}_val_rip_original.csv", index=False)

    # Save pipeline
    with open(f"{rip_static_dir}/pipeline_rip.pkl", 'wb') as f:
        dill.dump(rip_train_pipeline, f)
    # Vocabulary keys (decoded strings for rip) are written one per line, in code order.
    np.savetxt(f"{rip_static_dir}/vocabulary_rip.csv", np.array(list(rip_train_pipeline["sparse_category_encoder"].vocabulary.keys())), fmt="%s", delimiter="\n")