In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import gc
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM, GRU, TimeDistributedDense
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import kagglegym

Using Theano backend.


In [2]:
%matplotlib inline

In [3]:
def get_high_frequency_nan_distributions(data):
    missing_values = {}
    for row, row_id in zip(data.drop('y', 1).values, data['id'].values):
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    return high_frequency_nan_distributions

In [4]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return np.array(high_frequency_nan_distributions_filtered)

In [11]:
def loss_r2(y_true, y_pred):
    u = K.mean(y_true, axis=-1)
    u = K.expand_dims(u, y_true.ndim - 1)
    u = K.repeat_elements(u, y_true.shape[-1], axis=-1)
    r2 = 1 - K.sum(K.square(y_pred - y_true), axis=-1) / K.sum(K.square(y_true - u), axis=-1)
    r = (K.sign(r2)*K.sqrt(K.abs(r2)))
    return K.clip(r, -1., 1.)

def loss_r2_1d(y_true, y_pred):
    u = K.mean(y_true)
    return K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))

def loss_r_score(y_true, y_pred):
    u = K.mean(y_true)
    r2 = 1 - K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))
    return (K.sign(r2)*K.sqrt(K.abs(r2)))
    #return K.clip(r, -1., 1.)

# For Testing

In [7]:
%%time
data = pd.read_hdf('data/train.h5')

CPU times: user 81 ms, sys: 742 ms, total: 823 ms
Wall time: 1.52 s


In [8]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

6
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx', '  xxxxx x x x  x  x    x  x x xxxxx  xx  x  x x      x x  x x  xx  x x        x               x               ']
CPU times: user 36.3 s, sys: 1.77 s, total: 38.1 s
Wall time: 39.2 s


In [14]:
%%time
data = pd.read_hdf('data/train.h5')
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
data = (data - means) / stds
#columns_to_normalize = [column for column in data.columns if not column in ['id', 'timestamp', 'y']]
#data.groupby(columns_to_normalize).apply(lambda x: (x - np.mean(x)) / np.std(x))
#test_cutoff = int(len(data) / batch_size) * batch_size

CPU times: user 23 s, sys: 7.85 s, total: 30.8 s
Wall time: 31.4 s


In [17]:
samples_back_included = 10
num_features = 108 # length of X + 1 extra for y
batch_size = 256

train_bounds = (0, 400)
test_bounds = (400, 700) # 1812
epochs = 5
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / 2) * batch_size
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size

In [19]:
def plot(history):
    loss = np.array(history.history['loss_r_score'])
    val_loss = np.array(history.history['val_loss_r_score'])
    plt.semilogy(np.exp(loss))
    plt.semilogy(np.exp(val_loss))
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [23]:
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

Epoch 1/5
  5888/165888 [>.............................] - ETA: 226s - loss: 1562.8999 - loss_r_score: -38.2813

KeyboardInterrupt: 

In [None]:
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

# For Submission

In [18]:
%%time
env = kagglegym.make()
observation = env.reset()
data = observation.train

CPU times: user 498 ms, sys: 3.2 s, total: 3.7 s
Wall time: 7.17 s


In [6]:
samples_back_included = 10
num_features = 108 # length of X + 1 extra for y
batch_size = 512

epochs = 6
train_samples = int(len(data) / batch_size / 3) * batch_size

low_y_cut = -0.075
high_y_cut = 0.075

In [7]:
%%time
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
data = (data - means) / stds

CPU times: user 10.4 s, sys: 1.64 s, total: 12 s
Wall time: 11.4 s


In [8]:
def data_generator(data_for_id_X, data_for_id_y):
    X = []
    y = []
    while True:
        for j in range(id_range):
            for i in range(samples_back_included, len(data_for_id_X[j])):
                X.append(data_for_id_X[j][max(i - samples_back_included, 0):i])
                y.append(data_for_id_y[j][i-1:i][0][0])
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []

In [9]:
%%time
ids = np.unique(data.id)
id_range = int(max(ids) - min(ids) + 1)
X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
y_columns = ['y']
X_padding = np.zeros((samples_back_included - 1, 108), dtype=np.float)
y_padding = np.zeros((samples_back_included - 1, 1), dtype=np.float)
#TODO (try padding these?)
# X_to_use = np.concatenate((np.array([X_empty_row] * (samples_back_included - len(X_to_use))), X_to_use), axis=0)
data_for_id_X = [[] for i in range(id_range)]
data_for_id_y = [[] for i in range(id_range)]

for item_id in range(id_range):
    data_for_id_X[int(item_id)] = X_padding
    data_for_id_y[int(item_id)] = y_padding

for item_id in ids:
    data_for_id_X[int(item_id)] = np.concatenate((X_padding, data[data.id == item_id][X_columns].values), axis=0)
    data_for_id_y[int(item_id)] = np.concatenate((y_padding, data[data.id == item_id][y_columns].values), axis=0)
    
data = None
gc.collect()

CPU times: user 7.51 s, sys: 643 ms, total: 8.15 s
Wall time: 8.2 s


In [12]:
%%time
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

CPU times: user 290 ms, sys: 9.45 ms, total: 299 ms
Wall time: 303 ms


In [13]:
%%time
train_gen = data_generator(data_for_id_X, data_for_id_y)
model.fit_generator(train_gen, samples_per_epoch=train_samples, nb_epoch=epochs, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
CPU times: user 19min 25s, sys: 3min 12s, total: 22min 38s
Wall time: 18min 11s


In [19]:
#data_for_id_X_backup = list(data_for_id_X)
data_for_id_X = list(data_for_id_X_backup)

In [20]:
#%%time
#3:12
while True:
    target = observation.target
    features = observation.features
    features = features.fillna(features.mean())
    features = (features - means) / stds
    ids = observation.features['id'].values
    for row_id, row in zip(ids, features[X_columns].values):
        if row_id >= len(data_for_id_X):
            difference = row_id - len(data_for_id_X) + 1
            data_for_id_X.extend([X_padding] * difference)
        data_for_id_X[row_id] = np.concatenate((data_for_id_X[row_id], [row]), axis=0)

    X_to_predict = np.array([data_for_id_X[row_id][-samples_back_included:] for row_id in ids])
    
    # add extra rows to fit batch_size
    batches = math.ceil(len(X_to_predict) / batch_size)
    extra_predictions = batch_size * batches - len(X_to_predict)
    X_to_predict = np.concatenate((X_to_predict, np.zeros((extra_predictions, 10, 108), dtype=np.int)), axis=0)
    
    target_size = len(target.y)
    target.y = model.predict(X_to_predict, batch_size=batch_size)[:target_size].clip(low_y_cut, high_y_cut)
    observation, reward, done, info = env.step(target)
    if done:
        print("Finished, reward: ", info["public_score"])
        break
    if observation.features.timestamp[0] % 100 == 0:
        print(reward)

-0.206543678917
-0.270901853357
-0.0475962028599
-0.0970229409771
0.0130554934178
-0.120568030836
-0.123249530374
-0.376818604827
-0.117119038913
Finished, reward:  -0.0855655926856
