In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM, GRU, TimeDistributedDense
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import kagglegym

Using Theano backend.


In [2]:
%matplotlib inline

In [3]:
def get_high_frequency_nan_distributions(data):
    missing_values = {}
    for row, row_id in zip(data.drop('y', 1).values, data['id'].values):
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    return high_frequency_nan_distributions

In [4]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return np.array(high_frequency_nan_distributions_filtered)

# For Testing

In [7]:
%%time
data = pd.read_hdf('data/train.h5')
cutoff = int(len(data) * 0.7)

CPU times: user 81 ms, sys: 742 ms, total: 823 ms
Wall time: 1.52 s


In [8]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

6
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx', '  xxxxx x x x  x  x    x  x x xxxxx  xx  x  x x      x x  x x  xx  x x        x               x               ']
CPU times: user 36.3 s, sys: 1.77 s, total: 38.1 s
Wall time: 39.2 s


In [10]:
def loss_r2(y_true, y_pred):
    u = K.mean(y_true, axis=-1)
    u = K.expand_dims(u, y_true.ndim - 1)
    u = K.repeat_elements(u, y_true.shape[-1], axis=-1)
    r2 = 1 - K.sum(K.square(y_pred - y_true), axis=-1) / K.sum(K.square(y_true - u), axis=-1)
    r = (K.sign(r2)*K.sqrt(K.abs(r2)))
    return K.clip(r, -1., 1.)

def loss_r2_1d(y_true, y_pred):
    u = K.mean(y_true)
    return K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))

def loss_r_score(y_true, y_pred):
    u = K.mean(y_true)
    r2 = 1 - K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))
    return (K.sign(r2)*K.sqrt(K.abs(r2)))
    #return K.clip(r, -1., 1.)

In [14]:
%%time
data = pd.read_hdf('data/train.h5')
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
data = (data - means) / stds
#columns_to_normalize = [column for column in data.columns if not column in ['id', 'timestamp', 'y']]
#data.groupby(columns_to_normalize).apply(lambda x: (x - np.mean(x)) / np.std(x))
#test_cutoff = int(len(data) / batch_size) * batch_size

CPU times: user 23 s, sys: 7.85 s, total: 30.8 s
Wall time: 31.4 s


In [17]:
samples_back_included = 10
num_features = 108 # length of X + 1 extra for y
batch_size = 256

train_bounds = (0, 400)
test_bounds = (400, 700) # 1812
epochs = 5
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / 2) * batch_size
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size

In [19]:
def plot(history):
    loss = np.array(history.history['loss_r_score'])
    val_loss = np.array(history.history['val_loss_r_score'])
    plt.semilogy(np.exp(loss))
    plt.semilogy(np.exp(val_loss))
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [23]:
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

Epoch 1/5
  5888/165888 [>.............................] - ETA: 226s - loss: 1562.8999 - loss_r_score: -38.2813

KeyboardInterrupt: 

In [None]:
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

# For Submission

In [30]:
%%time
env = kagglegym.make()
observation = env.reset()
data = observation.train

CPU times: user 409 ms, sys: 2.64 s, total: 3.05 s
Wall time: 5.04 s


In [31]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data)
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

5
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx']
CPU times: user 25.3 s, sys: 979 ms, total: 26.3 s
Wall time: 26.7 s


In [43]:
samples_back_included = 10
num_features = 108 # length of X + 1 extra for y
batch_size = 1024

epochs = 6
train_samples = int(len(data) / batch_size / 2) * batch_size

low_y_cut = -0.075
high_y_cut = 0.075

In [33]:
%%time
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
data = (data - means) / stds

CPU times: user 10.8 s, sys: 1.83 s, total: 12.7 s
Wall time: 12.6 s


In [44]:
%%time
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

CPU times: user 255 ms, sys: 7.4 ms, total: 263 ms
Wall time: 259 ms


In [45]:
def data_generator(data):
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_empty_row = data[X_columns].mean().values
    y_empty_row = data[y_columns].mean().values
    X = []
    y = []
    while True:
        for item_id in np.unique(data.id):
            data_for_id = data[data.id == item_id]
            data_for_id_X = data_for_id[X_columns].values
            data_for_id_y = data_for_id[y_columns].values
            for i in range(len(data_for_id)):
                X_to_use = data_for_id_X[max(i - samples_back_included, 0):i]
                if len(X_to_use) != 10:
                    X_to_use = np.concatenate((np.array([X_empty_row] * (10 - len(X_to_use))), X_to_use), axis=0)
                X.append(X_to_use)
                y.append(data_for_id_y[i])
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []

In [51]:
np.unique(observation.features.timestamp)

array([906], dtype=int16)

In [49]:
np.unique(data.timestamp)

array([   0.,    1.,    2.,    3.,    4.,    5.,    6.,    7.,    8.,
          9.,   10.,   11.,   12.,   13.,   14.,   15.,   16.,   17.,
         18.,   19.,   20.,   21.,   22.,   23.,   24.,   25.,   26.,
         27.,   28.,   29.,   30.,   31.,   32.,   33.,   34.,   35.,
         36.,   37.,   38.,   39.,   40.,   41.,   42.,   43.,   44.,
         45.,   46.,   47.,   48.,   49.,   50.,   51.,   52.,   53.,
         54.,   55.,   56.,   57.,   58.,   59.,   60.,   61.,   62.,
         63.,   64.,   65.,   66.,   67.,   68.,   69.,   70.,   71.,
         72.,   73.,   74.,   75.,   76.,   77.,   78.,   79.,   80.,
         81.,   82.,   83.,   84.,   85.,   86.,   87.,   88.,   89.,
         90.,   91.,   92.,   93.,   94.,   95.,   96.,   97.,   98.,
         99.,  100.,  101.,  102.,  103.,  104.,  105.,  106.,  107.,
        108.,  109.,  110.,  111.,  112.,  113.,  114.,  115.,  116.,
        117.,  118.,  119.,  120.,  121.,  122.,  123.,  124.,  125.,
        126.,  127.,

In [None]:
%%time
train_gen = data_generator(data)
model.fit_generator(train_gen, samples_per_epoch=train_samples, nb_epoch=epochs, verbose=1)

In [90]:
%%time
while True:
    target = observation.target
    features = observation.features
    features = features.fillna(features.mean())
    features = (features - means) / stds
    y = model.predict()
    
    
    observation, reward, done, info = env.step(target)
    if done:
        print("Finished, reward: ", info["public_score"])
        break
    if observation.features.timestamp[0] % 100 == 0:
        print(reward)

-0.11128681189
0.0825276854025
-0.0548432430466
-0.15183885798
-0.0923139783666
-0.0825648098422
-0.107279092471
-0.0520323144763
-0.360260755881
-0.197959841749
-0.0816455593487
-0.133597131291
-0.138718911415
0.0298962237767
-0.12717418969
-0.114115171981
-0.0943293327787
-0.0396678680466


KeyboardInterrupt: 