In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
import tensorflow as tf
import tqdm

%matplotlib notebook

from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
def RNNConfig(object):
    def __init__(
        cell_type='lstm', window=20, forget_bias=1.0, 
        n_hidden_cells=(100), keep_prob=1.0, batch_size=64, epoch_num=100,
        learning_rate=0.01, max_grad_norm=1.0, init_scale=0.1,
    ):
        self.cell_type = cell_type
        self.window = window
        self.forget_bias = forget_bias
        self.n_hidden_cells = n_hidden_cells
        self.keep_prob = keep_prob
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.learning_rate = learning_rate
        self.max_grad_norm = max_grad_norm
        self.init_scale = init_scale

In [4]:
!ls 'Data'

submission_format.csv  test_values.csv	train_labels.csv  train_values.csv


In [5]:
data = pd.read_csv(
    "Data/train_values.csv", 
    index_col='row_id', 
    parse_dates=['timestamp'], 
    nrows=100000  
    # We'll just load the first 100K so we know
    # how to load the data using code
)

In [6]:
data.columns

Index(['process_id', 'object_id', 'phase', 'timestamp', 'pipeline',
       'supply_flow', 'supply_pressure', 'return_temperature',
       'return_conductivity', 'return_turbidity', 'return_flow', 'supply_pump',
       'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',
       'return_acid', 'supply_clean_water', 'return_recovery_water',
       'return_drain', 'object_low_level', 'tank_level_pre_rinse',
       'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water',
       'tank_temperature_pre_rinse', 'tank_temperature_caustic',
       'tank_temperature_acid', 'tank_concentration_caustic',
       'tank_concentration_acid', 'tank_lsh_caustic', 'tank_lsh_acid',
       'tank_lsh_clean_water', 'tank_lsh_pre_rinse', 'target_time_period'],
      dtype='object')

# Data PreProcessing

## Encoding



Let's figure out how to preprocess the data into RNN-feedable form.

    Let's separate out process_id. We don't want to encode or scale this information. We'll just reattach after encoding and scaling the rest.



In [7]:
process_id_array = data.process_id.values
data.drop(['process_id'], axis=1, inplace=True)



    phase will NOT be one-hot-encoded since there is an order to it, so we can simply encode it using 1, 2, .., 5.



In [8]:
phase_categorical = CategoricalDtype(
    categories=['pre_rinse', 'caustic', 'intermediate_rinse', 
                'acid', 'final_rinse'],
    ordered=True
)
data.phase = data.phase.astype(phase_categorical)

data.phase, phase_mapping_idx = data.phase.factorize()

In [9]:
data.phase.astype(phase_categorical)

row_id
0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
99970    NaN
99971    NaN
99972    NaN
99973    NaN
99974    NaN
99975    NaN
99976    NaN
99977    NaN
99978    NaN
99979    NaN
99980    NaN
99981    NaN
99982    NaN
99983    NaN
99984    NaN
99985    NaN
99986    NaN
99987    NaN
99988    NaN
99989    NaN
99990    NaN
99991    NaN
99992    NaN
99993    NaN
99994    NaN
99995    NaN
99996    NaN
99997    NaN
99998    NaN
99999    NaN
Name: phase, Length: 100000, dtype: category
Categories (5, object): [pre_rinse < caustic < intermediate_rinse < acid < final_rinse]

In [10]:
phase_mapping_idx.categories

Index(['pre_rinse', 'caustic', 'intermediate_rinse', 'acid', 'final_rinse'], dtype='object')

Ensure each sequence is in the right order using the timestamp column, but once sequences are set up, discard the column.  
-The column is only useful insofar as it tells us which data point comes before or after others.  
-But then again, perhaps there is some signal from absoluate passage of time, so we can consider encoding this into UNIX times. Something to try out.

In [11]:
data.timestamp = data.timestamp.view(int)

    Ensure each sequence is from single process using process_id, but don't include it in data since it really is just an ID column.

    One-hot encode object_id and pipeline.
        Although object_id is an ID column, there aren't that many objects (~100? NOT SURE) and there are multiple processes over each object. So object_id may carry valuable information.
        I'm not sure what pipeline is, but we'll treat it as a categorical column
        Below we first encode categorical columns into integers (and save that mapping in DataframeLabelEncoder.feature_encoder dictionary) and one-hot-encode them. (sklearn's OneHotEncoder requires that input is alreay integer-encoded.)



In [12]:
categorical_features = ['object_id', 'pipeline',]

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
class DataframeLabelEncoder(object):
    def __init__(self, categorical_features):
        assert isinstance(categorical_features, (list, np.ndarray))
        self.categorical_features = categorical_features
        self.feature_encoder = dict()
        self.cat_feature_mask = None
        
    def fit(self, dataframe):
        assert isinstance(dataframe, pd.DataFrame)
        
        if self.cat_feature_mask is None:
            self.cat_feature_mask = np.zeros(shape=dataframe.shape[1], dtype=bool)
        
        for i, feature in enumerate(categorical_features):
            le = LabelEncoder()
            le.fit(dataframe[feature])
            self.feature_encoder[feature] = le
            
            index = np.where(dataframe.columns == feature)[0][0]
            self.cat_feature_mask[index] = True
        
    def transform(self, dataframe):
        assert dataframe.shape[1] == self.cat_feature_mask.shape[-1]
        
        array = dataframe.values.copy()
        for feature in categorical_features:
            encoded = self.feature_encoder[feature].transform(
                dataframe[feature]
            )
            index = np.where(dataframe.columns == feature)[0][0]
            array[:, index] = encoded
            
        return array
    
    def fit_transform(self, dataframe):
        self.fit(dataframe)
        return self.transform(dataframe)

In [15]:
label_encoder = DataframeLabelEncoder(categorical_features)
label_encoder.fit(data)
encoded_data = label_encoder.transform(data)

In [16]:
encoded_data.shape

(100000, 34)

In [17]:
# Create One-hot encoder
ohe = OneHotEncoder(categorical_features=label_encoder.cat_feature_mask, sparse=False)

In [18]:
encoded_data = ohe.fit_transform(encoded_data)

In [19]:
encoded_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
encoded_data.shape

(100000, 84)

It is the expected shape: # non categorical features = 34-2 = 32, + # unique objects = 43, + # unique pipelines = 9. Thats 84.

## Scaling Data

From EDA, it looks like some are normally distributed, some exponential... we'll just standardize.

I created the following thinking that we should only standardize columns that were originally numerical (and not boolean or categorical), but it's too much work.. I think just standardizing everything is fine!!

In [21]:
class NumericColumnsStandardScaler(object):
    def __init__(self, ohe_feature_indices):
        assert isinstance(ohe_feature_indices, (list, np.ndarray))
        self._ohe_feature_indices = ohe_feature_indices
        self._scaler = StandardScaler()
        self.numerical_mask = None
        
    def fit(self, data):
        assert isinstance(data, np.array)
        assert data.shape[1] == self.num_feature_mask.shape[-1]
        
        # we only want features that were numerical from the beginning
        # So we create a mask that excludes all columns that are results of
        # one-hot-encoding categorical columns.
        mask = np.ones(data.shape[1], type=bool)
        mask[:self._ohe_feature_indices[-1]] = 0
        self.numerical_mask = mask
        
        self._scaler.fit(data[self.numerical_mask])
        
    def transform(self, data):
        assert isinstance(data, np.array)
        assert data.shape[1] == self.num_feature_mask.shape[-1]
        scaled_num_data = self._scaler.transform(data[self.numerical_mask])
        res = np.concatenate((data[~self.numerical_mask], scaled_num_data), axis=1)
        print(res.shape)
        return res
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [22]:
labels = pd.read_csv("Data/train_labels.csv")

In [23]:
preprocessor = Pipeline(
    steps=[
        ('label_encoder', label_encoder),
        ('one_hot_encoder', ohe),
        ('scaler', StandardScaler().fit(encoded_data))
    ]
)



NOW THAT's OUR PREPROCESSOR!!! HOORAY!

Ok let's just create the padded array now. (What Holden did!)

From our EDA, we know the maximum length of a process is 15107. We also know there are 34 columns (excluding process id), but that gets encoded to 84 columns. Following Holdens' work,



    (m, T, f) - where m is the number of unique processes, T is the number of time-sequences, and f is the number of features

Our final array shall have shape $(82, 15107, 84)$

In [24]:
len(np.unique(process_id_array))

82

In [25]:
arr = np.zeros((82, 15107, 84))
for i, pid in enumerate(np.unique(process_id_array)):
    pid_mask = process_id_array == pid  # mask for this process id
    preprocessed_data = preprocessor.transform(data[pid_mask])
    
    nrows = pid_mask.sum()
    arr[i, :nrows, :] = preprocessed_data

In [26]:
(process_id_array == pid).shape

(100000,)

In [27]:
pid_mask.sum()

782

In [28]:
arr.shape

(82, 15107, 84)

In [29]:
arr[81]

array([[-0.19735816, -0.16218562, -0.14506451, ...,  0.        ,
         0.        , -0.36748879],
       [-0.19735816, -0.16218562, -0.14506451, ...,  0.        ,
         0.        , -0.36748879],
       [-0.19735816, -0.16218562, -0.14506451, ...,  0.        ,
         0.        , -0.36748879],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Prep Labels

In [30]:
subset_labels = labels[0:arr.shape[0]]

In [31]:
subset_labels.shape

(82, 2)

In [32]:
subset_labels.drop(['process_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [33]:
subset_labels

Unnamed: 0,final_rinse_total_turbidity_liter
0,4.318275e+06
1,4.375286e+05
2,4.271977e+05
3,7.197830e+05
4,4.133107e+05
5,4.100847e+06
6,3.058669e+06
7,2.706871e+06
8,5.217553e+05
9,1.037243e+05


### Train/Test Split

In [34]:
from sklearn.cross_validation import train_test_split



In [35]:
X_train, X_test, y_train, y_test = train_test_split(arr, subset_labels, test_size=0.1, random_state=1)

# Model

In [36]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

Using TensorFlow backend.


In [37]:
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

In [None]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=2, shuffle=False)

Train on 73 samples, validate on 9 samples
Epoch 1/50
 - 97s - loss: 1433786.2962 - val_loss: 2326203.0000
Epoch 2/50
 - 92s - loss: 1433786.2962 - val_loss: 2326203.0000
Epoch 3/50
