In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
import tensorflow as tf

%matplotlib notebook

from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
def RNNConfig(object):
    def __init__(
        cell_type='lstm', window=20, forget_bias=1.0, 
        n_hidden_cells=(100), keep_prob=1.0, batch_size=64, epoch_num=100,
        learning_rate=0.01, max_grad_norm=1.0, init_scale=0.1,
    ):
        self.cell_type = cell_type
        self.window = window
        self.forget_bias = forget_bias
        self.n_hidden_cells = n_hidden_cells
        self.keep_prob = keep_prob
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.learning_rate = learning_rate
        self.max_grad_norm = max_grad_norm
        self.init_scale = init_scale

In [3]:
data = pd.read_csv(
    "../data/raw/train_values.csv", 
    index_col='row_id', 
    parse_dates=['timestamp'], 
    nrows=100000  
    # We'll just load the first 100K so we know
    # how to load the data using code
)

In [119]:
data.columns

Index(['process_id', 'object_id', 'phase', 'timestamp', 'pipeline',
       'supply_flow', 'supply_pressure', 'return_temperature',
       'return_conductivity', 'return_turbidity', 'return_flow', 'supply_pump',
       'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',
       'return_acid', 'supply_clean_water', 'return_recovery_water',
       'return_drain', 'object_low_level', 'tank_level_pre_rinse',
       'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water',
       'tank_temperature_pre_rinse', 'tank_temperature_caustic',
       'tank_temperature_acid', 'tank_concentration_caustic',
       'tank_concentration_acid', 'tank_lsh_caustic', 'tank_lsh_acid',
       'tank_lsh_clean_water', 'tank_lsh_pre_rinse', 'target_time_period'],
      dtype='object')

# Data preprocessing

## Encoding data

Let's figure out how to preprocess the data into RNN-feedable form.

* Let's separate out `process_id`. We don't want to encode or scale this information. We'll just reattach after encoding and scaling the rest.

In [122]:
process_id_array = data.process_id.values
data.drop(['process_id'], axis=1, inplace=True)

* `phase` will NOT be one-hot-encoded since there is an order to it, so we can simply encode it using 1, 2, .., 5.

In [5]:
phase_categorical = CategoricalDtype(
    categories=['pre_rinse', 'caustic', 'intermediate_rinse', 
                'acid', 'final_rinse'],
    ordered=True
)
data.phase = data.phase.astype(phase_categorical)

data.phase, phase_mapping_idx = data.phase.factorize()

In [131]:
phase_mapping_idx.categories

Index(['pre_rinse', 'caustic', 'intermediate_rinse', 'acid', 'final_rinse'], dtype='object')

* Ensure each sequence is in the right order using the `timestamp` column, but once sequences are set up, discard the column.
  * The column is only useful insofar as it tells us which data point comes before or after others.
  * But then again, **perhaps there is some signal from absoluate passage of time, so we can consider encoding this into UNIX times.** 
 Something to try out.  

In [6]:
data.timestamp = data.timestamp.view(int)

* Ensure each sequence is from single process using `process_id`, but don't include it in data since it really is just an ID column.

* One-hot encode `object_id` and `pipeline`.
  * Although `object_id` is an ID column, there aren't that many objects (~100? **NOT SURE**) and there are multiple processes over each object. So `object_id` may carry valuable information.
  * I'm not sure what `pipeline` is, but we'll treat it as a categorical column
  * Below we first encode categorical columns into integers (and save that mapping in DataframeLabelEncoder.feature_encoder dictionary) and one-hot-encode them. (sklearn's OneHotEncoder requires that input is alreay integer-encoded.)

In [16]:
categorical_features = ['object_id', 'pipeline',]

In [100]:
from sklearn.preprocessing import LabelEncoder


class DataframeLabelEncoder(object):
    def __init__(self, categorical_features):
        assert isinstance(categorical_features, (list, np.ndarray))
        self.categorical_features = categorical_features
        self.feature_encoder = dict()
        self.cat_feature_mask = None
        
    def fit(self, dataframe):
        assert isinstance(dataframe, pd.DataFrame)
        
        if self.cat_feature_mask is None:
            self.cat_feature_mask = np.zeros(shape=dataframe.shape[1], dtype=bool)
        
        for i, feature in enumerate(categorical_features):
            le = LabelEncoder()
            le.fit(dataframe[feature])
            self.feature_encoder[feature] = le
            
            index = np.where(dataframe.columns == feature)[0][0]
            self.cat_feature_mask[index] = True
        
    def transform(self, dataframe):
        assert dataframe.shape[1] == self.cat_feature_mask.shape[-1]
        
        array = dataframe.values.copy()
        for feature in categorical_features:
            encoded = self.feature_encoder[feature].transform(
                dataframe[feature]
            )
            index = np.where(dataframe.columns == feature)[0][0]
            array[:, index] = encoded
            
        return array
    
    def fit_transform(self, dataframe):
        self.fit(dataframe)
        return self.transform(dataframe)

In [137]:
label_encoder = DataframeLabelEncoder(categorical_features)
label_encoder.fit(data)
encoded_data = label_encoder.transform(data)

In [138]:
encoded_data.shape

(100000, 34)

In [139]:
# Create One-hot encoder
ohe = OneHotEncoder(categorical_features=label_encoder.cat_feature_mask, sparse=False)

In [140]:
encoded_data = ohe.fit_transform(encoded_data)

In [141]:
encoded_data.shape

(100000, 84)

It is the expected shape: # non categorical features = 34-2 = 32, + # unique objects = 43, + # unique pipelines = 9. Thats 84.

## Scaling data

From EDA, it looks like some are normally distributed, some exponential... we'll just standardize.

I created the following thinking that we should only standardize columns that were originally numerical (and not boolean or categorical), but it's too much work.. I think just standardizing everything is fine!!

In [118]:
class NumericColumnsStandardScaler(object):
    def __init__(self, ohe_feature_indices):
        assert isinstance(ohe_feature_indices, (list, np.ndarray))
        self._ohe_feature_indices = ohe_feature_indices
        self._scaler = StandardScaler()
        self.numerical_mask = None
        
    def fit(self, data):
        assert isinstance(data, np.array)
        assert data.shape[1] == self.num_feature_mask.shape[-1]
        
        # we only want features that were numerical from the beginning
        # So we create a mask that excludes all columns that are results of
        # one-hot-encoding categorical columns.
        mask = np.ones(data.shape[1], type=bool)
        mask[:self._ohe_feature_indices[-1]] = 0
        self.numerical_mask = mask
        
        self._scaler.fit(data[self.numerical_mask])
        
    def transform(self, data):
        assert isinstance(data, np.array)
        assert data.shape[1] == self.num_feature_mask.shape[-1]
        scaled_num_data = self._scaler.transform(data[self.numerical_mask])
        res = np.concatenate((data[~self.numerical_mask], scaled_num_data), axis=1)
        print(res.shape)
        return res
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [9]:
labels = pd.read_csv("../data/raw/train_labels.csv")

In [142]:
preprocessor = Pipeline(
    steps=[
        ('label_encoder', label_encoder),
        ('one_hot_encoder', ohe),
        ('scaler', StandardScaler().fit(encoded_data))
    ]
)

NOW THAT's OUR PREPROCESSOR!!! HOORAY!

Ok let's just create the padded array now. (What Holden did!) 

From our EDA, we know the maximum length of a process is **15107**.
We also know there are 34 columns (excluding process id), but that gets encoded to **84 columns.** Following Holdens' work, 

> (m, T, f) - where m is the number of unique processes, T is the number of time-sequences, and f is the number of features

Our final array shall have shape $(82, 15107, 84)$

In [145]:
len(np.unique(process_id_array))

82

**Note that we've only loaded first 100K rows. Update this part if you're using your own sampled data or the whole dataset!! # of unique process ids, max lengt will be different.**

In [150]:
(process_id_array == pid).shape

(100000,)

In [147]:
arr = np.zeros((82, 15107, 84))
for i, pid in enumerate(np.unique(process_id_array)):
    pid_mask = process_id_array == pid  # mask for this process id
    turbidity = labels[pid_mask]
    preprocessed_data = preprocessor.transform(data[pid_mask])
    
    nrows = pid_mask.sum()
    arr[i, :nrows, :] = preprocessed_data

ValueError: Item wrong length 100000 instead of 5021.

## Reshaping data

But first, we need to figure out how we're going to approach this problem.

Key things to consider:

* **Sequence length**: the processes have different lengths
  * Not only that, the longest process has 15107 data points---very long!!
* **Prediction output**: for each process (=sequence), predict **one** value.

### Turning variable-length sequences into fixed-length?
Say our lookback window size is $L$. We want to reshape data so that each process is expressed in $n$ sequences of length $L$ where $n= \text{length of process} - L + 1$. 

At the end of each process, i.e. at $n$th sequence, the RNN's state must be refreshed.

It is probably the best to create an iterator rather than making and actual array of all of this. Some iterator that managers each process and lookback length as well as signal to refresh RNN states.

Following suggestions from https://danijar.com/variable-sequence-lengths-in-tensorflow/

In [None]:
def length(sequence):
  used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
  length = tf.reduce_sum(used, 1)
  length = tf.cast(length, tf.int32)
  return length

In [None]:
def batch_iterator(dataX, dataY, batch_size, num_steps):
    data_len = len(dataY)
    batch_len = int(data_len / batch_size)

    if batch_len == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(batch_len):
        input_x = dataX[i * batch_size: (i + 1) * batch_size]
        input_y = dataY[i * batch_size: (i + 1) * batch_size]

        yield (input_x, input_y)

In [None]:
def many_to_one_model_reproduce(output_dim=1):
    model = tf.keras.models.Sequential(
        layers=[
            tf.keras.layers.LSTM(
                rnn_cell_hidden_dim, input_shape=[14, 17], return_sequences=True,
                activation='softsign'),
            tf.keras.layers.LSTM(
                rnn_cell_hidden_dim, activation='softsign'),
            tf.keras.layers.Dense(
                output_dim,
                activation='softmax' if output_dim == 2 else 'sigmoid'),
        ]
    )
    loss = 'binary_crossentropy' if output_dim == 1 else 'categorical_crossentropy'
    adam = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(
        loss=loss, optimizer=adam,
        metrics=['accuracy', precision, recall, f1_score]
    )

    return model