In [2]:
import tensorflow as tf
from tensorflow import keras
import kerastuner
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime
import dill
import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Use a wide default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all.
# Looping over the device list (rather than indexing [0]) keeps the cell
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every GPU, which TensorFlow requires on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [3]:
# Dataset prefix interpolated into the names of the CSV files under data/
# that this notebook reads and writes.
dataset_name = "SEG_Wavenet"

In [4]:
# Resolve the three input files for the selected dataset.
train_original_path = "data/{}_train_set_original.csv".format(dataset_name)
test_original_path = "data/{}_test_set_original.csv".format(dataset_name)
val_original_path = "data/{}_val_set_original.csv".format(dataset_name)

# Each file holds one value per line; parse every split as 64-bit integers.
train_original = np.genfromtxt(train_original_path, delimiter="\n", dtype=np.int64)
test_original = np.genfromtxt(test_original_path, delimiter="\n", dtype=np.int64)
val_original = np.genfromtxt(val_original_path, delimiter="\n", dtype=np.int64)

In [5]:
# Append the validation values after the training values in a single array.
train_val_original = np.concatenate((train_original, val_original))

In [6]:
class CalculateDelta(TransformerMixin, BaseEstimator):
    """Stateless transformer that replaces a sequence by its consecutive differences.

    Each output element is ``X[i] - X[i + 1]`` (the current value minus its
    successor), so the result is one element shorter than the input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X[:-1] - X[1:]`` computed positionally.

        Parameters
        ----------
        X : array-like
            Sequence of numeric values.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            The differences; empty when ``X`` has fewer than two elements.
        """
        # Coerce to an ndarray so plain lists are accepted and so label-aligned
        # containers (e.g. pandas Series) are subtracted by position.
        values = np.asarray(X)
        return values[:-1] - values[1:]

In [7]:
class SimpleSparseCategoryEncoder(TransformerMixin, BaseEstimator):
    """Encode values as their integer position in a fixed vocabulary list.

    Values that do not appear in the vocabulary are encoded as index 0, so
    ``vocabulary_list[0]`` acts as the out-of-vocabulary entry.

    Parameters
    ----------
    vocabulary_list : sequence
        Known values; the position of a value in this sequence is its code.
    """

    def __init__(self, vocabulary_list):
        self.vocabulary_list = vocabulary_list

    def fit(self, X, y=None):
        """Do nothing (the vocabulary is supplied up front) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Map every element of ``X`` to its vocabulary index.

        Parameters
        ----------
        X : sequence
            Indexable sequence of hashable values to encode.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            One integer code per input element; unknown values become 0.
        """
        vocabulary = {v: i for i, v in enumerate(self.vocabulary_list)}

        # A single dict lookup replaces the former `in self.vocabulary_list`
        # test, which scanned the whole list for every element.
        X_transformed = [vocabulary.get(X[i], 0) for i in tqdm.trange(len(X))]

        # Explicit dtype keeps the result integral even when X is empty.
        return np.array(X_transformed, dtype=np.int64)

    def inverse_transform(self, X, y=None):
        """Map integer codes back to the vocabulary values at those positions."""
        return np.array([self.vocabulary_list[x] for x in X])

In [8]:
# Differences between each value and its successor over train + validation.
current_values = train_val_original[:-1]
successor_values = train_val_original[1:]
train_val_delta = current_values - successor_values

# Distinct deltas ordered from most to least frequent.
delta_counts = pd.Series(train_val_delta).value_counts()
deltas_by_frequency = delta_counts.index.to_numpy()

# Put -1 in front so that index 0 is the entry the encoder uses for unknown deltas.
train_val_vocabulary = list(np.concatenate(([-1], deltas_by_frequency)))
train_val_vocabulary[:10]

[-1, 0, -4096, -909517620, 909517620, -8192, 8, 4096, -8, -12288]

In [9]:
# Preprocessing chain: raw values -> consecutive deltas -> vocabulary indices.
delta_step = ('calculate_delta', CalculateDelta())
encoder_step = ('sparse_category_encoder', SimpleSparseCategoryEncoder(train_val_vocabulary))
SEG_Wavenet_pipeline = Pipeline([delta_step, encoder_step])

In [10]:
# Run every split through the same pipeline, in train / val / test order.
train_set, val_set, test_set = (
    SEG_Wavenet_pipeline.transform(original_split)
    for original_split in (train_original, val_original, test_original)
)

100%|██████████| 161291/161291 [00:19<00:00, 8207.89it/s]
100%|██████████| 40323/40323 [00:04<00:00, 8743.01it/s]
100%|██████████| 35579/35579 [00:05<00:00, 6423.95it/s]


In [17]:
# Write each encoded split to its own file, one integer code per line.
for path_template, encoded_split in (
    ("data/{}_train_set_2.csv", train_set),
    ("data/{}_val_set_2.csv", val_set),
    ("data/{}_test_set_2.csv", test_set),
):
    np.savetxt(path_template.format(dataset_name), encoded_split, fmt="%d", delimiter="\n")

In [18]:
# Persist the vocabulary, one value per line, so codes can be decoded later.
vocabulary_array = np.array(train_val_vocabulary)
np.savetxt("data/vocabulary_2.csv", vocabulary_array, fmt="%d", delimiter="\n")