In [14]:
import tensorflow as tf
from tensorflow import keras
import kerastuner
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime
import dill
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (rather than indexing [0]) keeps this cell
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every GPU when more than one is visible.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [2]:
# Prefix of the data file names: substituted into the "data/{}_..." path templates
# used when loading the raw splits and when saving the encoded ones.
dataset_name = "SEG_Wavenet"

In [58]:
# Raw splits, one integer per line. Listed in load order: train, test, val.
_ORIGINAL_SPLIT_TEMPLATES = (
    "data/{}_train_set_original.csv",
    "data/{}_test_set_original.csv",
    "data/{}_val_set_original.csv",
)

# Values are parsed as int64 (np.float32 was the alternative noted by the author).
train_original, test_original, val_original = (
    np.genfromtxt(_template.format(dataset_name), delimiter="\n", dtype=np.int64)
    for _template in _ORIGINAL_SPLIT_TEMPLATES
)

In [15]:
class CalculateDelta(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces a sequence by its first differences.

    Inheriting from ``BaseEstimator`` (as ``SparseCategoryEncoder`` does) gives
    the step ``get_params``/``set_params`` and a readable ``repr``.
    """

    def fit(self, X, y=None):
        """Learn nothing from ``X``; present only to satisfy the transformer API.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Return the difference between each element and its predecessor.

        Parameters
        ----------
        X : array-like
            Input sequence; differences are taken along the first axis.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``X[i + 1] - X[i]`` for every consecutive pair, i.e. one element
            fewer than ``X`` along the first axis (empty when ``X`` has fewer
            than two elements).
        """
        # np.diff is the vectorised equivalent of the element-by-element loop;
        # axis=0 matches indexing X[i] on the first axis.
        return np.diff(X, axis=0)

In [50]:
class Quantizer(BaseEstimator, TransformerMixin):
    """Keep the most frequent values of a sequence and collapse the rest into one token.

    The ``quantization_channels - 1`` most frequent values seen by ``fit`` are
    kept unchanged; every other value is replaced by ``oov_token``, so the
    output holds at most ``quantization_channels`` distinct values.

    Parameters
    ----------
    quantization_channels : int, default=2**8
        Maximum number of distinct output values, counting ``oov_token``.
    oov_token : int, default=-1
        Value written in place of every value that is not kept.
        NOTE(review): an input value that already equals ``oov_token`` cannot be
        told apart from a replaced one -- confirm this cannot occur in the data.

    Attributes
    ----------
    kept_values_ : numpy.ndarray
        Values left unchanged by ``transform``; set by ``fit``.
    """

    def __init__(self, quantization_channels=2**8, oov_token=-1):
        self.quantization_channels = quantization_channels
        self.oov_token = oov_token

    def _most_frequent_values(self, X):
        """Return the ``quantization_channels - 1`` most frequent values of ``X``."""
        # value_counts() sorts by descending frequency, so the head of its
        # index holds the values to keep. Slicing the index is positional.
        ranked_values = pd.Series(X).value_counts().index
        return ranked_values[:self.quantization_channels - 1].to_numpy()

    def fit(self, X, y=None):
        """Remember which values of ``X`` are frequent enough to keep.

        Learning this here means ``transform`` applies the same mapping to every
        array it is given, instead of re-ranking the values of each array.

        Returns
        -------
        self
        """
        self.kept_values_ = self._most_frequent_values(X)
        return self

    def transform(self, X, y=None):
        """Replace every value that is not kept with ``oov_token``.

        Parameters
        ----------
        X : array-like
            Sequence to quantize. It is not modified.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Copy of ``X`` in which values outside ``kept_values_`` (including
            values never seen during ``fit``) equal ``oov_token``.
        """
        kept_values = getattr(self, "kept_values_", None)
        if kept_values is None:
            # Backward compatibility: when called without a prior fit, rank the
            # values of X itself.
            kept_values = self._most_frequent_values(X)

        # Work on a copy so the caller's array is never overwritten.
        quantized = np.array(X)
        quantized[~np.isin(quantized, kept_values)] = self.oov_token
        return quantized

In [51]:
class SparseCategoryEncoder(BaseEstimator, TransformerMixin):
    """Map the distinct values of a sequence to integer codes and back.

    Codes follow descending frequency in the fitted data: the most frequent
    value gets code 0, the next one code 1, and so on.

    Parameters
    ----------
    oov_token : int, default=-1
        Value whose code is used for any input not seen during ``fit``.

    Attributes
    ----------
    word_index : pandas.Index
        Known values; position ``i`` holds the value encoded as ``i``.
    vocabulary : dict
        Mapping from value to its integer code.
    vocab_size : int
        Number of entries in ``word_index``.
    """

    def __init__(self, oov_token=-1):
        self.oov_token = oov_token

    def fit(self, X, y=None):
        """Build the vocabulary from the distinct values of ``X``.

        Returns
        -------
        self
        """
        categories = pd.Series(X).value_counts().index
        if self.oov_token not in categories:
            # Without an entry for the OOV token, transform() would have no code
            # to fall back on for unseen values. Appending it leaves the codes
            # of all observed values unchanged.
            categories = categories.append(pd.Index([self.oov_token]))

        self.vocab_size = len(categories)
        self.word_index = categories
        self.vocabulary = {value: code for code, value in enumerate(categories)}

        return self

    def transform(self, X, y=None):
        """Encode each element of ``X`` as its integer code.

        Parameters
        ----------
        X : array-like, one-dimensional
            Values to encode.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Integer codes; values absent from the vocabulary receive the code
            of ``oov_token``.
        """
        # get_indexer returns the position of every value in word_index and -1
        # for values it does not contain.
        codes = self.word_index.get_indexer(np.asarray(X))
        return np.where(codes == -1, self.vocabulary[self.oov_token], codes)

    def inverse_transform(self, X, y=None):
        """Decode integer codes back to the values they stand for.

        Parameters
        ----------
        X : array-like of int
            Codes as produced by ``transform``.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``word_index[code]`` for every code in ``X``.
        """
        known_values = np.asarray(self.word_index)
        codes = np.asarray(X)
        if codes.size == 0:
            # An empty list becomes a float array, which cannot be used as an index.
            return known_values[:0]
        return known_values[codes]

In [67]:
# Preprocessing chain, applied in this order: consecutive differences ->
# infrequent values replaced by the OOV token -> integer category codes.
_preprocessing_steps = [
    ("calculate_delta", CalculateDelta()),
    ("quantizer", Quantizer()),
    ("sparse_category_encoder", SparseCategoryEncoder()),
]
SEG_Wavenet_pipeline = Pipeline(steps=_preprocessing_steps)

In [68]:
# Fit every pipeline step on the training split and encode it in the same pass.
# The pipeline is given a copy so that train_original itself is left untouched.
train_set = SEG_Wavenet_pipeline.fit_transform(train_original.copy())
train_set

array([0, 0, 0, ..., 0, 0, 0])

In [69]:
# Encode the validation split with transform() only -- fit is not called again.
# The pipeline is given a copy so that val_original itself is left untouched.
val_set = SEG_Wavenet_pipeline.transform(val_original.copy())
val_set

array([0, 0, 0, ..., 0, 0, 0])

In [70]:
# Encode the test split with transform() only -- fit is not called again.
# The pipeline is given a copy so that test_original itself is left untouched.
test_set = SEG_Wavenet_pipeline.transform(test_original.copy())
test_set

array([188,   0,   0, ...,   0,   6,   0])

In [72]:
# Write each encoded split to disk, one integer code per line (train, val, test).
for _path_template, _encoded_split in (
    ("data/{}_train_set_quantized.csv", train_set),
    ("data/{}_val_set_quantized.csv", val_set),
    ("data/{}_test_set_quantized.csv", test_set),
):
    np.savetxt(_path_template.format(dataset_name), _encoded_split, fmt="%d", delimiter="\n")