In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras

%matplotlib notebook

from IPython.display import display, HTML

# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [40]:
from pandas.api.types import CategoricalDtype

In [4]:
class RNNConfig(object):
    """Plain container for the hyper-parameters of an RNN experiment.

    Every constructor argument is stored unchanged as an attribute of the
    same name, except ``n_hidden_cells`` which is normalised to a tuple.

    Parameters
    ----------
    cell_type : str
        Name of the recurrent cell to use (default ``'lstm'``).
    window : int
        Look-back window length (default 20).
    forget_bias : float
        Stored as given (default 1.0).
    n_hidden_cells : int or iterable of int
        Hidden sizes. A bare integer is wrapped into a one-element tuple,
        any other iterable is converted with ``tuple()``.
        Default ``(100,)``.
    keep_prob : float
        Stored as given (default 1.0).
    batch_size : int
        Stored as given (default 64).
    epoch_num : int
        Stored as given (default 100).
    learning_rate : float
        Stored as given (default 0.01).
    max_grad_norm : float
        Stored as given (default 1.0).
    init_scale : float
        Stored as given (default 0.1).
    """

    def __init__(
        self,
        cell_type='lstm', window=20, forget_bias=1.0,
        n_hidden_cells=(100,), keep_prob=1.0, batch_size=64, epoch_num=100,
        learning_rate=0.01, max_grad_norm=1.0, init_scale=0.1,
    ):
        self.cell_type = cell_type
        self.window = window
        self.forget_bias = forget_bias
        # `(100)` is just the integer 100 in Python; always expose a tuple so
        # callers can iterate over the sizes regardless of how it was passed.
        try:
            self.n_hidden_cells = tuple(n_hidden_cells)
        except TypeError:
            # Not iterable: a single size was given as a scalar.
            self.n_hidden_cells = (n_hidden_cells,)
        self.keep_prob = keep_prob
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.learning_rate = learning_rate
        self.max_grad_norm = max_grad_norm
        self.init_scale = init_scale

In [82]:
# Only the first 100K rows are read for now: the point of this cell is to
# work out the loading code, not to process the whole file.
data = pd.read_csv(
    "data/raw/train_values.csv",
    nrows=100000,
    parse_dates=['timestamp'],
    index_col='row_id',
)

# Data preprocessing

## Encoding data

Let's figure out how to preprocess the data into RNN-feedable form.

* `phase` will NOT be one-hot-encoded: the phases have a natural order, so we can simply encode them as consecutive integers (0, 1, ..., 4 in the code below).

In [83]:
# Ordinal-encode `phase`. The category list fixes the order of the phases.
phase_categorical = CategoricalDtype(
    categories=['pre_rinse', 'caustic', 'intermediate_rinse',
                'acid', 'final_rinse'],
    ordered=True
)
data.phase = data.phase.astype(phase_categorical)

# Take the integer codes from the categorical itself instead of `factorize()`:
# factorize numbers values in order of first appearance, so the integers only
# follow the phase order if the rows happen to appear in that order.
# `.cat.codes` is always the position in `categories`
# (0 = 'pre_rinse', ..., 4 = 'final_rinse'; -1 for values outside the list).
phase_mapping_idx = data.phase.cat.categories  # phase_mapping_idx[code] -> phase name
data.phase = data.phase.cat.codes.astype(int)

* Ensure each sequence is in the right order using the `timestamp` column, but once sequences are set up, discard the column.
  * The column is only useful insofar as it tells us which data point comes before or after others.
  * But then again, perhaps there is some signal in the absolute passage of time, so we can consider encoding the timestamps as integer UNIX-epoch times. Something to try out.

In [84]:
# Replace the datetimes by their integer epoch representation.
# An explicit int64 cast is used instead of `Series.view(int)`: `view` is
# deprecated in recent pandas, and plain `int` is only 32 bits wide on some
# platforms. For datetime64[ns] data the values are NANOseconds since the
# UNIX epoch, not seconds.
data.timestamp = data.timestamp.astype('int64')

* Ensure each sequence is from single process using `process_id`, but don't include it in data since it really is just an ID column.

* One-hot encode `object_id` and `pipeline`.
  * Although `object_id` is an ID column, there aren't that many objects (~100? **NOT SURE**) and there are multiple processes over each object. So `object_id` may carry valuable information.
  * I'm not sure what `pipeline` is, but we'll treat it as a categorical column

In [47]:
# Columns that get one-hot encoded further down.
categorical_features = [
    'object_id',
    'pipeline',
]

In [85]:
# Integer-encode the categorical columns and build a column mask for the encoder.
# The mask has to be boolean: an integer 0/1 array passed as
# `categorical_features` is read as a list of column *indices* (i.e. columns
# 0 and 1), not as a mask.
cat_feature_mask = np.zeros(shape=data.shape[1], dtype=bool)
feature_categories = dict()
for i, col in enumerate(data.columns):
    if col in categorical_features:
        # Encode to integer: each value becomes its rank among the sorted
        # unique values of the column.
        categories = np.sort(data[col].unique())
        data[col] = categories.searchsorted(data[col])
        # Keep the sorted values so a code can be mapped back to its label.
        feature_categories[col] = categories

        # update mask
        cat_feature_mask[i] = True

# Create and fit the one-hot encoder.
# NOTE(review): `OneHotEncoder` is not imported in any visible cell; presumably
# `sklearn.preprocessing.OneHotEncoder` -- add the import to the imports cell.
# NOTE(review): the `categorical_features` argument only exists in older
# scikit-learn releases; newer ones need `ColumnTransformer` -- confirm version.
# `fit` returns the encoder itself, so the result must NOT be assigned to
# `data` (that would replace the DataFrame by the encoder).
ohe = OneHotEncoder(categorical_features=cat_feature_mask).fit(data)

Note that we've only fitted the one-hot encoder. Using `transform` returns a numpy output. Will deal with this later.

## Reshaping data

Say our lookback window size is $L$. We want to reshape data so that each process is expressed in $n$ sequences of length $L$ where $n= \text{length of process} - L + 1$. 

At the end of each process, i.e. after its $n$th (last) sequence, the RNN's state must be reset.

It is probably best to create an iterator rather than materialising an actual array of all of this: an iterator that manages each process and the lookback length, and also signals when to reset the RNN state.

In [None]:
def batch_iterator(dataX, dataY, batch_size, num_steps):
    """Yield consecutive ``(x, y)`` mini-batches sliced from two parallel sequences.

    Parameters
    ----------
    dataX, dataY : sliceable sequences
        Inputs and targets. Only ``len(dataY)`` is used to size the iteration.
    batch_size : int
        Number of items per batch.
    num_steps : int
        Accepted but not used by this function.

    Yields
    ------
    tuple
        ``(dataX[a:b], dataY[a:b])`` for each full batch, in order. Items
        left over after the last full batch are not yielded.

    Raises
    ------
    ValueError
        When not even one full batch fits. Because this is a generator, the
        error surfaces on the first iteration, not at call time.
    """
    n_batches = int(len(dataY) / batch_size)
    if not n_batches:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    start = 0
    for _ in range(n_batches):
        stop = start + batch_size
        yield (dataX[start:stop], dataY[start:stop])
        start = stop

In [None]:
def many_to_one_model_reproduce(output_dim=1):
    """Build and compile a two-layer stacked-LSTM many-to-one classifier.

    The first LSTM returns the full sequence so that the second LSTM can
    consume it; the second returns only its final output, which feeds a
    dense output layer.

    The function relies on names that must already exist in the notebook
    namespace and are not defined in the visible cells:
    ``rnn_cell_hidden_dim``, ``learning_rate``, ``precision``, ``recall``
    and ``f1_score``.

    Parameters
    ----------
    output_dim : int
        Number of output units. ``1`` builds a binary classifier
        (sigmoid + binary cross-entropy); any other value builds a
        multi-class classifier (softmax + categorical cross-entropy).

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    # Pick activation and loss from the SAME condition so they always agree
    # (previously output_dim >= 3 paired a sigmoid with categorical CE).
    is_binary = output_dim == 1

    # `keras` is the name imported at the top of the notebook
    # (`from tensorflow import keras`); `tf` itself is never imported.
    model = keras.models.Sequential(
        layers=[
            keras.layers.LSTM(
                # input_shape is hard-coded: 14 time steps x 17 features.
                rnn_cell_hidden_dim, input_shape=[14, 17], return_sequences=True,
                activation='softsign'),
            keras.layers.LSTM(
                rnn_cell_hidden_dim, activation='softsign'),
            keras.layers.Dense(
                output_dim,
                activation='sigmoid' if is_binary else 'softmax'),
        ]
    )
    loss = 'binary_crossentropy' if is_binary else 'categorical_crossentropy'
    # NOTE(review): newer Keras releases spell this argument `learning_rate=`;
    # kept as `lr=` to match the version this notebook was written for.
    adam = keras.optimizers.Adam(lr=learning_rate)
    model.compile(
        loss=loss, optimizer=adam,
        metrics=['accuracy', precision, recall, f1_score]
    )

    return model