In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
def remove_outliers(readings):
    """
    Remove recordings which have ultimate gauge readings out of
    the realm of possibility.

    Parameters:
    readings: a dataframe of gauge data with an "expected" column
              representing the hourly measurement
    returns: a copy of the original dataframe
    """
    threshold = 70
    lower_map = {i: i.lower() for i in readings.columns}
    return (readings
            .rename(columns=lower_map)
            .query('expected <= @threshold')
            .copy()
            )


In [6]:
data = pd.read_csv('data/train.csv').pipe(remove_outliers)

In [7]:
data = data.loc[data.id.isin(np.random.choice(data.id.unique(), size=50000))].copy()

In [8]:
len(data)

579273

### RNN Structure

In [9]:
data.isnull().mean()

id                       0.000000
minutes_past             0.000000
radardist_km             0.000000
ref                      0.510785
ref_5x5_10th             0.591198
ref_5x5_50th             0.510752
ref_5x5_90th             0.419730
refcomposite             0.482727
refcomposite_5x5_10th    0.554951
refcomposite_5x5_50th    0.483499
refcomposite_5x5_90th    0.398631
rhohv                    0.618063
rhohv_5x5_10th           0.678835
rhohv_5x5_50th           0.617748
rhohv_5x5_90th           0.544722
zdr                      0.618063
zdr_5x5_10th             0.678835
zdr_5x5_50th             0.617748
zdr_5x5_90th             0.544722
kdp                      0.675856
kdp_5x5_10th             0.734017
kdp_5x5_50th             0.675462
kdp_5x5_90th             0.608912
expected                 0.000000
dtype: float64

In [10]:
data.groupby('id').expected.nunique().describe()

count    48825.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: expected, dtype: float64

In [11]:
id_vals = data.id.unique()
train_index = np.random.choice(id_vals,
                               size=np.ceil(len(id_vals) * (9/10.)).astype(int),
                               replace=False)

In [12]:
train_data = data.loc[data.id.isin(train_index), data.columns != 'expected'].fillna(0)
test_data = data.loc[~data.id.isin(train_index), data.columns != 'expected'].fillna(0)
train_target = data.loc[data.id.isin(train_index)].groupby('id').expected.last().values
test_target = data.loc[~data.id.isin(train_index)].groupby('id').expected.last().values

In [13]:
len(train_data), len(test_data)

(521058, 58215)

In [24]:
train_array = train_data.groupby('id').apply(lambda x: x.values.astype('float')).values
test_array = test_data.groupby('id').apply(lambda x: x.values.astype('float')).values

In [54]:
train_lens = train_data.groupby('id').size().values
test_lens = test_data.groupby('id').size().values

In [55]:
n_inputs = train_array[0].shape[1]
n_steps = max([train_lens.max(), test_lens.max()])
n_neurons = [500, 100]
n_outputs = 1

In [33]:
def pad_array(array, n_vars, n_steps=n_steps):
    out_array = []
    for i in array:
        _ = i.copy()
        _.resize(n_steps,n_vars)
        out_array.append(_)
    return np.array(out_array)

In [39]:
test_pad = pad_array(test_array, n_vars=n_inputs)

In [34]:
tf.reset_default_graph()

In [35]:
X = tf.placeholder(dtype=tf.float32, shape=[None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None])
seq_length = tf.placeholder(dtype=tf.int16, shape=[None])

with tf.variable_scope('rnn', initializer=tf.contrib.layers.variance_scaling_initializer()):
    rnn_cell_1 = tf.contrib.rnn.LSTMCell(num_units=n_neurons[0])
    rnn_cell_2 = tf.contrib.rnn.LSTMCell(num_units=n_neurons[1])
    multi_layer_cell = tf.contrib.rnn.MultiRNNCell([rnn_cell_1, rnn_cell_2])
    outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32,
                                        sequence_length=seq_length)
    drop = tf.layers.dropout(inputs=states[1], rate=0.4)
    y_pred = tf.layers.dense(drop, n_outputs)
    

In [36]:
learning_rate = 0.05

error = (y-y_pred)
loss = tf.reduce_mean(tf.square(error), name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

mae = tf.reduce_mean(tf.abs(error))

In [37]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [57]:
n_samples = train_array.shape[0]
n_epochs = 5
batch_size = len(train_array) / 25
print batch_size


with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        rand_idx = np.random.permutation(np.arange(n_samples))
        for batch_i in range(n_samples // batch_size):
            batch_idx = rand_idx[(batch_i*batch_size):((batch_i+1)*batch_size)]
            X_batch = pad_array(train_array[batch_idx], n_vars=n_inputs)
            y_batch = train_target[batch_idx]
            len_batch = train_lens[batch_idx]
            loss_val, _ = sess.run(
                [loss, training_op],
                feed_dict={X: X_batch, seq_length: len_batch, y: y_batch})
        mae_train = mae.eval(feed_dict={X: X_batch, seq_length: len_batch, y: y_batch})
        mae_test = mae.eval(feed_dict={X: test_pad, seq_length: test_lens, y: test_target})
        print("{:4d}  Train loss: {:.4f}, Train MAE: {:.2f}  Test MAE: {:.2f}".format(
            epoch, loss_val, mae_train, mae_test))
        saver.save(sess, "my_reber_classifier")
        final_pred = y_pred.eval({X: test_pad, seq_length: test_lens, y: test_target})

1757
   0  Train loss: 50.2507, Train MAE: 3.41  Test MAE: 3.40
   1  Train loss: 35.5093, Train MAE: 3.04  Test MAE: 3.14
   2  Train loss: 41.5766, Train MAE: 3.30  Test MAE: 3.33


KeyboardInterrupt: 

In [36]:
np.abs((final_pred - test_target)).argmin()

186

In [37]:
final_pred[186]

array([ 183.58932495], dtype=float32)

In [38]:
test_target

109.47406000000001