In [24]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import gc
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM, GRU, TimeDistributedDense
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import kagglegym
import time

Using Theano backend.


In [25]:
%matplotlib inline

In [26]:
def get_high_frequency_nan_distributions(data):
    missing_values = {}
    for row, row_id in zip(data.drop('y', 1).values, data['id'].values):
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    return high_frequency_nan_distributions

In [27]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return np.array(high_frequency_nan_distributions_filtered)

In [28]:
def loss_r2(y_true, y_pred):
    u = K.mean(y_true, axis=-1)
    u = K.expand_dims(u, y_true.ndim - 1)
    u = K.repeat_elements(u, y_true.shape[-1], axis=-1)
    r2 = 1 - K.sum(K.square(y_pred - y_true), axis=-1) / K.sum(K.square(y_true - u), axis=-1)
    r = (K.sign(r2)*K.sqrt(K.abs(r2)))
    return K.clip(r, -1., 1.)

def loss_r2_1d(y_true, y_pred):
    u = K.mean(y_true)
    return K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))

def loss_r_score(y_true, y_pred):
    u = K.mean(y_true)
    r2 = 1 - K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))
    return (K.sign(r2)*K.sqrt(K.abs(r2)))
    #return K.clip(r, -1., 1.)

# For Testing

In [7]:
%%time
data = pd.read_hdf('data/train.h5')

CPU times: user 81 ms, sys: 742 ms, total: 823 ms
Wall time: 1.52 s


In [8]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

6
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx', '  xxxxx x x x  x  x    x  x x xxxxx  xx  x  x x      x x  x x  xx  x x        x               x               ']
CPU times: user 36.3 s, sys: 1.77 s, total: 38.1 s
Wall time: 39.2 s


In [14]:
%%time
data = pd.read_hdf('data/train.h5')
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
data = (data - means) / stds

CPU times: user 23 s, sys: 7.85 s, total: 30.8 s
Wall time: 31.4 s


In [22]:
samples_back_included = 10
num_features = 108 # length of X + 1 extra for y
batch_size = 256

train_bounds = (0, 400)
test_bounds = (400, 700) # 1812
epochs = 5
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / 2) * batch_size
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size

In [19]:
def plot(history):
    loss = np.array(history.history['loss_r_score'])
    val_loss = np.array(history.history['val_loss_r_score'])
    plt.semilogy(np.exp(loss))
    plt.semilogy(np.exp(val_loss))
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

# For Submission

In [29]:
num_features = 108

In [30]:
def data_generator(data_for_id_X, data_for_id_y, id_range, samples_back_included, batch_size):
    X = []
    y = []
    while True:
        for j in range(id_range):
            for i in range(samples_back_included, len(data_for_id_X[j])):
                X.append(data_for_id_X[j][max(i - samples_back_included, 0):i])
                y.append(data_for_id_y[j][i-1:i][0][0])
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []

def create_and_train_model(env, observation, batch_size, epochs, passes, samples_back_included, dropout, first_layer_size, second_layer_size):
    global num_features
    
    data = observation.train
    data = data.fillna(data.mean())
    means, stds = data.mean(), data.std()
    for column in ['id', 'timestamp', 'y']:
        means[column] = 0
        stds[column] = 1
    data = (data - means) / stds

    train_samples = int(len(data) / batch_size / (epochs / passes)) * batch_size
                        
    ids = np.unique(data.id)
    id_range = int(max(ids) - min(ids) + 1)
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_padding = np.zeros((samples_back_included - 1, num_features), dtype=np.float)
    y_padding = np.zeros((samples_back_included - 1, 1), dtype=np.float)

    data_for_id_X = [[] for i in range(id_range)]
    data_for_id_y = [[] for i in range(id_range)]

    for item_id in range(id_range):
        data_for_id_X[int(item_id)] = X_padding
        data_for_id_y[int(item_id)] = y_padding

    for item_id in ids:
        data_for_id_X[int(item_id)] = np.concatenate((X_padding, data[data.id == item_id][X_columns].values), axis=0)
        data_for_id_y[int(item_id)] = np.concatenate((y_padding, data[data.id == item_id][y_columns].values), axis=0)

    data = None
    gc.collect()
    
    model = Sequential()
    model.add(GRU(first_layer_size, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dropout(dropout))
    model.add(GRU(second_layer_size, return_sequences=False, stateful=True))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss=loss_r2_1d,
                  optimizer='adam',
                  metrics=[loss_r_score])

    train_gen = data_generator(data_for_id_X, data_for_id_y, id_range, samples_back_included, batch_size)
    history = model.fit_generator(train_gen, samples_per_epoch=train_samples, nb_epoch=epochs, verbose=0)

    return means, stds, data_for_id_X, model, history

def test_model(env, observation, means, stds, data_for_id_X, model, batch_size, low_y_cut, high_y_cut, samples_back_included):
    global num_features
    
    X_columns = [item for item in observation.features.columns if item not in ('id', 'timestamp')]
    X_padding = np.zeros((samples_back_included - 1, num_features), dtype=np.float)
    full_reward = 0
    while True:
        target = observation.target
        features = observation.features
        features = features.fillna(0)
        features = (features - means) / stds
        ids = observation.features['id'].values
        for row_id, row in zip(ids, features[X_columns].values):
            if row_id >= len(data_for_id_X):
                difference = row_id - len(data_for_id_X) + 1
                data_for_id_X.extend([X_padding] * difference)
            data_for_id_X[row_id] = np.concatenate((data_for_id_X[row_id], [row]), axis=0)

        X_to_predict = np.array([data_for_id_X[row_id][-samples_back_included:] for row_id in ids])

        # add extra rows to fit batch_size
        batches = math.ceil(len(X_to_predict) / batch_size)
        extra_predictions = batch_size * batches - len(X_to_predict)
        X_to_predict = np.concatenate((X_to_predict, np.zeros((extra_predictions, samples_back_included, 108), dtype=np.int)), axis=0)

        target_size = len(target.y)
        target.y = model.predict(X_to_predict, batch_size=batch_size)[:target_size].clip(low_y_cut, high_y_cut)
        observation, reward, done, info = env.step(target)
        if done:
            print("Finished, reward: ", info["public_score"])
            return info["public_score"]
        full_reward += reward
        if observation.features.timestamp[0] % 200 == 0:
            #print(full_reward / 100)
            full_reward = 0

In [15]:
#results_array = pd.read_csv('data/model_results').values

In [31]:
def test_params(args):
    global results_array
    start_time = time.time()
    results_dict_key = ', '.join('{0}: {1}'.format(key, args[key]) for key in args)
    default_args = {
        'batch_size': 256,
        'epochs': 8,
        'passes': 2,
        'low_y_cut': -0.075,
        'high_y_cut': 0.075,
        'samples_back_included': 8,
        'dropout': 0.5,
        'first_layer_size': 64,
        'second_layer_size': 128
    }
    env = kagglegym.make()
    observation = env.reset()
    train_args = dict((arg, args[arg] if arg in args else default_args[arg]) for arg in default_args if arg in ('batch_size', 'epochs', 'passes', 'samples_back_included', 'dropout', 'first_layer_size', 'second_layer_size'))
    test_args = dict((arg, args[arg] if arg in args else default_args[arg]) for arg in default_args if arg in ('batch_size', 'samples_back_included', 'low_y_cut', 'high_y_cut'))
    means, stds, data_for_id_X, model, history = create_and_train_model(env, observation, **train_args)
    score = test_model(env, observation, means, stds, data_for_id_X, model, **test_args)
    
    results_array = np.append(results_array, [[results_dict_key, score, history, start_time - time.time()]], axis=0)

In [41]:
pd.DataFrame(results_array).to_csv('data/model_results', index=False)

In [38]:
%%time
args = {
    'batch_size': 256,
    'dropout': 0.3,
    'epochs': 4,
    'passes': 2,
    'first_layer_size': 32,
    'second_layer_size': 64,
}
test_params(args)

Finished, reward:  -0.0201369730804
CPU times: user 16min 8s, sys: 4min 24s, total: 20min 33s
Wall time: 18min 37s


In [22]:
test = [(row[0], row[1]) for row in results_array]
sorted(test, key=lambda row: -row[1])

[('epochs: 8, batch_size: 256, dropout: 0.5, passes: 2',
  -0.006045661023625851),
 ('first_layer_size: 64, second_layer_size: 128, batch_size: 256, dropout: 0.5, passes: 2, epochs: 4',
  -0.00914250721655),
 ('first_layer_size: 64, second_layer_size: 64, batch_size: 256, dropout: 0.5, passes: 2, epochs: 4',
  -0.00922768453504),
 ('epochs: 4, batch_size: 512, dropout: 0.6, passes: 1',
  -0.011255786431362215),
 ('epochs: 4, batch_size: 512, dropout: 0.4, passes: 2',
  -0.011561177423752131),
 ('dropout: 0.5, first_layer_size: 32, second_layer_size: 32, passes: 1',
  -0.012636470105368416),
 ('epochs: 4, batch_size: 256, dropout: 0.6, passes: 2', -0.01349307690811253),
 ('dropout: 0.6, first_layer_size: 32, second_layer_size: 32, passes: 1',
  -0.015118321402473916),
 ('epochs: 8, batch_size: 256, dropout: 0.4, passes: 2',
  -0.016239682554597634),
 ('first_layer_size: 32, second_layer_size: 64, batch_size: 256, dropout: 0.5, passes: 1, epochs: 4',
  -0.0163946674037),
 ('first_layer_s

# Benchmark

In [1]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

env = kagglegym.make()
o = env.reset()
o.train = o.train[:1000]
excl = ['id', 'sample', 'y', 'timestamp']
col = [c for c in o.train.columns if c not in excl]

train = pd.read_hdf('data/train.h5')
train = train[col]
d_mean= train.median(axis=0)

train = o.train[col]
n = train.isnull().sum(axis=1)
for c in train.columns:
    train[c + '_nan_'] = pd.isnull(train[c])
    d_mean[c + '_nan_'] = 0
train = train.fillna(d_mean)
train['znull'] = n
n = []

rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train, o.train['y'])
train = []

#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.075
high_y_cut = 0.075
# 0.075?
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
model2 = LinearRegression(n_jobs=-1)
model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])

#https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
ymean_dict = dict(o.train.groupby(["id"])["y"].median())

while True:
    full_reward = 0
    test = o.features[col]
    n = test.isnull().sum(axis=1)
    for c in test.columns:
        test[c + '_nan_'] = pd.isnull(test[c])
    test = test.fillna(d_mean)
    test['znull'] = n
    pred = o.target
    test2 = np.array(o.features[col].fillna(d_mean)['technical_20'].values).reshape(-1,1)
    pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
    pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
    pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
    o, reward, done, info = env.step(pred)
    if done:
        print("el fin ...", info["public_score"])
        break
        
    full_reward += reward
    if o.features.timestamp[0] % 100 == 0:
        print(full_reward / 100)
        full_reward = 0
# started 11:00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


-0.00428660145471
-0.00217862142254
-0.00181563727073
-0.00116765333644
-0.00235118682059
-0.00325396125482
-0.00175334010547
-0.00440014228298
0.0017928211677
el fin ... -0.162135141475
