In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
def remove_outliers(readings):
    """
    Remove recordings which have ultimate gauge readings out of
    the realm of possibility.

    Parameters:
    readings: a dataframe of gauge data with an "expected" column
              representing the hourly measurement
    returns: a copy of the original dataframe
    """
    threshold = 70
    lower_map = {i: i.lower() for i in readings.columns}
    return (readings
            .rename(columns=lower_map)
            .query('expected >= @threshold')
            .copy()
            )


In [4]:
data = pd.read_csv('data/train.csv').pipe(remove_outliers).sample(10000)

In [5]:
len(data)

10000

### RNN Structure

In [6]:
data.isnull().mean()

id                       0.0000
minutes_past             0.0000
radardist_km             0.0000
ref                      0.8894
ref_5x5_10th             0.9163
ref_5x5_50th             0.8923
ref_5x5_90th             0.8510
refcomposite             0.8796
refcomposite_5x5_10th    0.9069
refcomposite_5x5_50th    0.8834
refcomposite_5x5_90th    0.8384
rhohv                    0.9126
rhohv_5x5_10th           0.9365
rhohv_5x5_50th           0.9175
rhohv_5x5_90th           0.8834
zdr                      0.9126
zdr_5x5_10th             0.9365
zdr_5x5_50th             0.9175
zdr_5x5_90th             0.8834
kdp                      0.9282
kdp_5x5_10th             0.9462
kdp_5x5_50th             0.9300
kdp_5x5_90th             0.9084
expected                 0.0000
dtype: float64

In [7]:
data.groupby('id').expected.nunique().describe()

count    9545.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: expected, dtype: float64

In [8]:
id_vals = data.id.unique()
train_index = np.random.choice(id_vals,
                               size=np.ceil(len(id_vals) * (9/10.)).astype(int),
                               replace=False)

In [9]:
train_data = data.loc[data.id.isin(train_index), data.columns != 'expected'].fillna(0)
test_data = data.loc[~data.id.isin(train_index), data.columns != 'expected'].fillna(0)
train_target = data.loc[data.id.isin(train_index)].groupby('id').expected.last().values
test_target = data.loc[~data.id.isin(train_index)].groupby('id').expected.last().values

In [10]:
len(train_data), len(test_data)

(9000, 1000)

In [11]:
train_array = train_data.groupby('id').apply(lambda x: x.values.astype('float').tolist()).tolist()
test_array = test_data.groupby('id').apply(lambda x: x.values.astype('float').tolist()).tolist()

In [12]:
train_lens = np.asarray([len(x) for x in train_array])
test_lens = np.asarray([len(x) for x in test_array])

In [31]:
n_inputs = train_data.shape[1]
n_steps = max([train_lens.max(), test_lens.max()])
n_neurons = [500, 100]
n_outputs = 1
pad_val = np.zeros(n_inputs).tolist()

In [32]:
def pad_array(arr):
    for i, a in enumerate(arr):
        alen = len(a)
        lendiff = n_steps - alen
        if lendiff > 0:
            arr[i] = a + ([pad_val] * lendiff)
    return np.asarray(arr)

In [33]:
test_nd = pad_array(test_array)

In [48]:
tf.reset_default_graph()

In [49]:
X = tf.placeholder(dtype=tf.float32, shape=[None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None])
seq_length = tf.placeholder(dtype=tf.int16, shape=[None])

with tf.variable_scope('rnn', initializer=tf.contrib.layers.variance_scaling_initializer()):
    rnn_cell_1 = tf.contrib.rnn.LSTMCell(num_units=n_neurons[0])
    rnn_cell_2 = tf.contrib.rnn.LSTMCell(num_units=n_neurons[1])
    multi_layer_cell = tf.contrib.rnn.MultiRNNCell([rnn_cell_1, rnn_cell_2])
    outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32,
                                        sequence_length=seq_length)
    drop = tf.layers.dropout(inputs=states[1], rate=0.4)
    y_pred = tf.layers.dense(drop, n_outputs)
    

In [50]:
learning_rate = 0.05

error = (y-y_pred)
loss = tf.reduce_mean(tf.square(error), name="loss")
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

mae = tf.reduce_mean(tf.abs(error))

In [51]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [52]:
n_epochs = 50
batch_size = len(train_array) / 25
print batch_size

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        X_batches = np.array_split(train_array, len(train_array) // batch_size)
        l_batches = np.array_split(train_lens, len(train_lens) // batch_size)
        y_batches = np.array_split(train_target, len(train_target) // batch_size)
        for X_batch, l_batch, y_batch in zip(X_batches, l_batches, y_batches):
            X_pad = pad_array(X_batch.tolist())
            loss_val, _ = sess.run(
                [loss, training_op],
                feed_dict={X: X_pad, seq_length: l_batch, y: y_batch})
        mae_train = mae.eval(feed_dict={X: X_pad, seq_length: l_batch, y: y_batch})
        mae_test = mae.eval(feed_dict={X: test_nd, seq_length: test_lens, y: test_target})
        print("{:4d}  Train loss: {:.4f}, Train MAE: {:.2f}  Test MAE: {:.2f}".format(
            epoch, loss_val, mae_train, mae_test))
        saver.save(sess, "my_reber_classifier")
    final_pred = y_pred.eval({X: test_nd, seq_length: test_lens, y: test_target})

343
   0  Train loss: 3346202.5000, Train MAE: 1259.00  Test MAE: 1282.03
   1  Train loss: 3127096.0000, Train MAE: 1194.15  Test MAE: 1217.68
   2  Train loss: 2935564.0000, Train MAE: 1157.31  Test MAE: 1179.17
   3  Train loss: 2768169.7500, Train MAE: 1130.56  Test MAE: 1151.82
   4  Train loss: 2616956.0000, Train MAE: 1111.56  Test MAE: 1131.57
   5  Train loss: 2488672.2500, Train MAE: 1096.82  Test MAE: 1114.12
   6  Train loss: 2378216.7500, Train MAE: 1088.49  Test MAE: 1103.82
   7  Train loss: 2283448.2500, Train MAE: 1084.39  Test MAE: 1097.82
   8  Train loss: 2202549.7500, Train MAE: 1083.16  Test MAE: 1093.65
   9  Train loss: 2134619.7500, Train MAE: 1083.39  Test MAE: 1092.14
  10  Train loss: 2077451.8750, Train MAE: 1086.52  Test MAE: 1091.06
  11  Train loss: 2030436.1250, Train MAE: 1089.89  Test MAE: 1093.18
  12  Train loss: 1991350.0000, Train MAE: 1093.16  Test MAE: 1093.17
  13  Train loss: 1959500.8750, Train MAE: 1096.62  Test MAE: 1094.72
  14  Train loss

KeyboardInterrupt: 

In [27]:
type(final_pred)

numpy.ndarray

In [36]:
np.abs((final_pred - test_target)).argmin()

186

In [37]:
final_pred[186]

array([ 183.58932495], dtype=float32)

In [38]:
test_target

109.47406000000001