In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM, GRU, TimeDistributedDense
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import kagglegym

Using Theano backend.


In [2]:
%matplotlib inline

In [3]:
def get_high_frequency_nan_distributions(data, top_n=500):
    """Count how often each present/missing column pattern occurs in ``data``.

    Every row of ``data`` (with the ``'y'`` column removed; all other columns,
    including ``'id'``, are kept) is turned into a tuple of booleans where
    ``True`` means "value present" and ``False`` means "value is NaN/missing".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'y'`` column.
    top_n : int, optional
        Maximum number of patterns to return (default 500, the value that was
        previously hard-coded). A value <= 0 returns an empty list.

    Returns
    -------
    list of (tuple of bool, int)
        ``(pattern, count)`` pairs, most frequent first.
    """
    # Keyword ``axis`` is required: the positional form drop('y', 1) is
    # rejected by current pandas releases.
    present = data.drop('y', axis=1).notnull().values

    pattern_counts = {}
    for row in present:
        # tolist() yields plain Python bools so the keys hash/compare simply.
        key = tuple(row.tolist())
        pattern_counts[key] = pattern_counts.get(key, 0) + 1

    if top_n <= 0:
        return []

    # Ascending stable sort + reverse keeps the original tie ordering.
    ranked = sorted(pattern_counts.items(), key=lambda key_value: key_value[1])[-top_n:]
    ranked.reverse()
    return ranked

In [4]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    """Greedily keep patterns that are frequent and far from those already kept.

    Walks the ``(pattern, count)`` pairs in the given order. For each pattern
    the number of differing positions to the closest already-kept pattern is
    computed (or the pattern length when nothing has been kept yet). The
    pattern is kept when ``count * distance ** 2`` exceeds ``cutoff``.

    Returns the kept patterns stacked into a numpy array.
    """
    kept = []
    for pattern, occurrences in high_frequency_nan_distributions:
        # Start from the pattern length: the distance used when ``kept`` is empty.
        nearest = len(pattern)
        for other in kept:
            mismatches = 0
            for left, right in zip(pattern, other):
                if left != right:
                    mismatches += 1
            if mismatches < nearest:
                nearest = mismatches
        if occurrences * nearest * nearest > cutoff:
            kept.append(pattern)
    return np.array(kept)

In [5]:
def get_split_data(data, distributions=None):
    """Assign every row to its best-matching NaN pattern and split the frame.

    For a row with null mask ``row`` and a pattern ``dist`` (``True`` = column
    included) the score is ``6 * |row AND dist| - |row OR dist|``; the row is
    assigned to the pattern with the lowest score (first one on ties).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. A ``'y'`` column, if present, is ignored for matching
        and appended to every returned split.
    distributions : array-like of bool, shape (n_patterns, n_columns), optional
        Patterns to match against. Defaults to the notebook-level
        ``nan_distributions``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per pattern, holding the rows assigned to it restricted to
        the pattern's included columns (plus ``'y'`` when available).

    Raises
    ------
    ValueError
        If the patterns are not 2-D or their width differs from the number of
        non-``'y'`` columns in ``data``.
    """
    if distributions is None:
        distributions = nan_distributions

    has_y = 'y' in data.columns
    # Keyword ``axis``: the positional form drop('y', 1) is rejected by
    # current pandas releases.
    data_to_use = data.drop('y', axis=1) if has_y else data

    patterns = np.asarray(distributions, dtype=bool)
    is_null = data_to_use.isnull().values.astype(np.int64)
    if patterns.ndim != 2 or patterns.shape[1] != is_null.shape[1]:
        raise ValueError('distributions must have shape (n_patterns, %d)' % is_null.shape[1])

    # Integer matrix form of the per-row/per-pattern score:
    #   overlap[r, p] = |row_r AND dist_p|
    #   union[r, p]   = |row_r| + |dist_p| - overlap[r, p] = |row_r OR dist_p|
    pattern_ints = patterns.astype(np.int64)
    overlap = is_null.dot(pattern_ints.T)
    union = is_null.sum(axis=1)[:, None] + pattern_ints.sum(axis=1)[None, :] - overlap
    best_distribution = np.argmin(6 * overlap - union, axis=1)

    feature_columns = list(data_to_use.columns)
    target_columns = ['y'] if has_y else []
    splits = []
    for index, pattern in enumerate(patterns):
        columns = [column for column, included in zip(feature_columns, pattern) if included] + target_columns
        splits.append(data[best_distribution == index][columns])
    return splits

In [6]:
def train_and_test_model(model_generator, data_split_further):
    """Fit one model per split and report the sample-weighted test score.

    Parameters
    ----------
    model_generator : callable
        Zero-argument factory returning an unfitted estimator exposing
        ``fit(X, y)`` and ``score(X, y)``.
    data_split_further : iterable of (X_train, y_train, X_test, y_test)
        pandas objects; NaNs in the feature frames are replaced with 0.

    Returns
    -------
    (list, float)
        The fitted estimators (one per split, in order) and the average test
        score weighted by test-set size. The average is ``nan`` when no split
        contains any test rows (previously this raised ZeroDivisionError).
    """
    clfs = []
    total_samples = 0
    total_score = 0.
    for X_train, y_train, X_test, y_test in data_split_further:
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        # Splits without test rows still contribute a fitted model, but no score.
        if len(X_test) > 0:
            test_score = clf.score(X_test.values, y_test.values)
            total_samples += len(X_test)
            total_score += test_score * len(X_test)
        clfs.append(clf)

    average_score = total_score / total_samples if total_samples else float('nan')
    return (clfs, average_score)

def train_model(model_generator, data_split):
    """Fit a fresh estimator on every frame in ``data_split``.

    Each frame must contain a ``'y'`` column, which is used as the target; the
    remaining columns (NaNs replaced with 0) are the features. Returns the
    fitted estimators in the same order as the input frames.
    """
    def _fit_one(frame):
        features = frame.drop('y', axis=1).fillna(0)
        targets = frame['y']
        estimator = model_generator()
        estimator.fit(features.values, targets.values)
        return estimator

    return [_fit_one(frame) for frame in data_split]

# For Testing

In [7]:
%%time
# Load the training set from the local HDF5 file.
data = pd.read_hdf('data/train.h5')
# Row position at 70% of the frame; the next cell analyses only data[:cutoff].
cutoff = int(len(data) * 0.7)

CPU times: user 81 ms, sys: 742 ms, total: 823 ms
Wall time: 1.52 s


In [8]:
%%time
# TODO: don't hard-code the 800000 cutoff; select the number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
# Number of retained patterns, then each pattern rendered as a string:
# ' ' = value present, 'x' = value missing.
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

6
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx', '  xxxxx x x x  x  x    x  x x xxxxx  xx  x  x x      x x  x x  xx  x x        x               x               ']
CPU times: user 36.3 s, sys: 1.77 s, total: 38.1 s
Wall time: 39.2 s


In [9]:
def data_generator(data, timestamp_start, timestamp_end, *, samples_back=None, batch=None):
    """Endlessly yield ``(X, y)`` batches of look-back windows, one id at a time.

    For every row of an id whose timestamp lies in
    ``[timestamp_start, timestamp_end)`` a sample is built from:

    * the feature rows of the ``window`` preceding observations of that id, and
    * the ``y`` values of the observations one step further back
      (positions ``i - window - 1 .. i - 2``),

    concatenated along the last axis. Windows that are too short are
    front-padded with the column means of ``data`` (means are taken over the
    whole frame, not just the timestamp range). The target is the row's own
    ``y``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'id'``, ``'timestamp'`` and ``'y'``; every other column
        is treated as a feature.
    timestamp_start, timestamp_end : number
        Half-open timestamp range of rows used as targets.
    samples_back : int, optional (keyword-only)
        Window length. Defaults to the notebook-level ``samples_back_included``.
        The padding previously assumed a length of 10 regardless of that value.
    batch : int, optional (keyword-only)
        Batch size. Defaults to the notebook-level ``batch_size``.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(batch, window, n_features + 1)`` and ``y`` of shape
        ``(batch, 1)``. Samples left over at the end of a pass over the data
        are discarded before the next pass starts.

    Raises
    ------
    ValueError
        If the sizes are invalid, or a full pass over the data cannot fill a
        single batch (the generator would otherwise loop forever).
    """
    window = samples_back_included if samples_back is None else samples_back
    per_batch = batch_size if batch is None else batch
    if window < 0 or per_batch < 1:
        raise ValueError('samples_back must be >= 0 and batch must be >= 1')

    # Loop-invariant pieces, computed once instead of on every pass.
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_empty_row = data[X_columns].mean().values
    y_empty_row = data[y_columns].mean().values

    while True:
        X = []
        y = []
        yielded_any = False
        # groupby visits ids in ascending order (like np.unique) and keeps the
        # row order within each id, without a full boolean scan per id.
        for _, data_for_id in data.groupby('id'):
            data_for_id_X = data_for_id[X_columns].values
            data_for_id_y = data_for_id[y_columns].values
            data_for_id_timestamp = data_for_id['timestamp'].values
            for i in range(len(data_for_id)):
                if data_for_id_timestamp[i] < timestamp_start or data_for_id_timestamp[i] >= timestamp_end:
                    continue
                X_to_use = data_for_id_X[max(i - window, 0):i]
                y_to_use = data_for_id_y[max(i - window - 1, 0):max(i - 1, 0)]
                # Front-pad short histories with the mean row.
                if len(X_to_use) != window:
                    X_to_use = np.concatenate((np.array([X_empty_row] * (window - len(X_to_use))), X_to_use), axis=0)
                if len(y_to_use) != window:
                    y_to_use = np.concatenate((np.array([y_empty_row] * (window - len(y_to_use))), y_to_use), axis=0)
                X.append(np.concatenate((X_to_use, y_to_use), axis=1))
                y.append(data_for_id_y[i])
                if len(X) == per_batch:
                    yielded_any = True
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []
        if not yielded_any:
            raise ValueError('not enough rows in the timestamp range to fill one batch')

In [10]:
def loss_r2(y_true, y_pred):
    """Signed square root of a per-sample R^2, clipped to [-1, 1].

    R^2 is computed along the last axis: one minus the ratio of the squared
    prediction error to the squared deviation of ``y_true`` from its own mean
    along that axis.

    NOTE(review): relies on ``y_true.ndim`` and ``y_true.shape[-1]``, i.e. on
    backend tensor attributes — presumably written for the Theano backend;
    confirm before using with another backend.
    NOTE(review): the returned value grows with fit quality, so despite the
    ``loss_`` prefix it is not something to minimise directly — confirm usage.
    """
    # Mean of y_true over the last axis ...
    u = K.mean(y_true, axis=-1)
    # ... restored as a trailing axis and repeated so it matches y_true's shape.
    u = K.expand_dims(u, y_true.ndim - 1)
    u = K.repeat_elements(u, y_true.shape[-1], axis=-1)
    r2 = 1 - K.sum(K.square(y_pred - y_true), axis=-1) / K.sum(K.square(y_true - u), axis=-1)
    # sign(r2) * sqrt(|r2|) keeps the sign while taking a real square root.
    r = (K.sign(r2)*K.sqrt(K.abs(r2)))
    return K.clip(r, -1., 1.)

def loss_r2_1d(y_true, y_pred):
    """Return ``1 - R^2`` over the whole batch.

    This is the summed squared prediction error divided by the summed squared
    deviation of ``y_true`` from its batch mean; lower is better.
    """
    residual_ss = K.sum(K.square(y_pred - y_true))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return residual_ss / total_ss

def loss_r_score(y_true, y_pred):
    """Return the signed square root of the batch-level R^2.

    R^2 is one minus the ratio of summed squared error to summed squared
    deviation of ``y_true`` from its batch mean. The result is
    ``sign(R^2) * sqrt(|R^2|)`` and is not clipped.
    """
    target_mean = K.mean(y_true)
    residual_ss = K.sum(K.square(y_pred - y_true))
    total_ss = K.sum(K.square(y_true - target_mean))
    r_squared = 1 - residual_ss / total_ss
    return K.sign(r_squared) * K.sqrt(K.abs(r_squared))

In [14]:
%%time
# Reload the raw training data, replacing the frame used by the earlier cells.
data = pd.read_hdf('data/train.h5')
# Fill missing values with the per-column mean.
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
# Leave id, timestamp and y untouched by the standardisation below
# (subtracting 0 and dividing by 1 is a no-op).
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
# Standardise every other column to zero mean / unit standard deviation.
# NOTE(review): a constant column would have std 0 and divide by zero here — confirm none exist.
data = (data - means) / stds
#columns_to_normalize = [column for column in data.columns if not column in ['id', 'timestamp', 'y']]
#data.groupby(columns_to_normalize).apply(lambda x: (x - np.mean(x)) / np.std(x))
#test_cutoff = int(len(data) / batch_size) * batch_size

CPU times: user 23 s, sys: 7.85 s, total: 30.8 s
Wall time: 31.4 s


In [17]:
# Number of past observations per sample (the time axis of the model input).
samples_back_included = 10
num_features = 109 # length of X + 1 extra for y
batch_size = 256

# Half-open timestamp ranges [start, end) passed to data_generator.
train_bounds = (0, 400)
test_bounds = (400, 700) # 1812
epochs = 5
# Samples per epoch: half of the rows inside train_bounds, rounded down to a
# whole number of batches.
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / 2) * batch_size
# Validation samples: all rows inside test_bounds, rounded down to a whole
# number of batches.
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size

In [19]:
def plot(history):
    """Draw the training and validation ``loss_r_score`` curves per epoch.

    Reads ``'loss_r_score'`` and ``'val_loss_r_score'`` from
    ``history.history``, exponentiates both and plots them on a logarithmic
    y axis with a train/test legend.
    """
    recorded = history.history
    curves = [np.exp(np.array(recorded[name])) for name in ('loss_r_score', 'val_loss_r_score')]
    for curve in curves:
        plt.semilogy(curve)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [23]:
# Two stacked GRU layers (64 then 128 units), each followed by batch
# normalisation, a sigmoid activation and 50% dropout, ending in a single
# linear output unit.
# NOTE(review): stateful=True carries recurrent state from one batch to the
# next — confirm that is intended, since nothing here arranges consecutive
# batches as continuations of each other or resets the state.
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

# Optimise 1 - R^2 (loss_r2_1d) and report the signed-root R score as a metric.
model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

# Separate generators for the training and validation timestamp ranges.
train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
# NOTE(review): the name says 128_128 but the layers above are 64 and 128 units.
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

Epoch 1/5
  5888/165888 [>.............................] - ETA: 226s - loss: 1562.8999 - loss_r_score: -38.2813

KeyboardInterrupt: 

In [None]:
# Same stacked-GRU architecture (64 then 128 units, sigmoid + 50% dropout,
# single linear output) but without any BatchNormalization layers.
# NOTE(review): stateful=True carries recurrent state from one batch to the
# next — confirm that is intended for batches produced by data_generator.
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

# Optimise 1 - R^2 (loss_r2_1d) and report the signed-root R score as a metric.
model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

# Separate generators for the training and validation timestamp ranges.
train_gen = data_generator(data, train_bounds[0], train_bounds[1])
test_gen = data_generator(data, test_bounds[0], test_bounds[1])
# NOTE(review): this reuses the history_128_128 name, overwriting any earlier
# result; the layers here are 64 and 128 units.
history_128_128 = model.fit_generator(train_gen, samples_per_epoch=train_samples, validation_data=test_gen, nb_val_samples=test_samples, nb_epoch=epochs, verbose=1,)
plot(history_128_128)

# For Submission

In [30]:
%%time
# Create the kagglegym environment and take the first observation.
env = kagglegym.make()
observation = env.reset()
# From here on `data` is the environment's training frame, replacing the
# frame loaded from the HDF5 file in the testing section.
data = observation.train

CPU times: user 409 ms, sys: 2.64 s, total: 3.05 s
Wall time: 5.04 s


In [31]:
%%time
# TODO: don't hard-code the 800000 cutoff; select the number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data)
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
# Number of retained patterns, then each pattern rendered as a string:
# ' ' = value present, 'x' = value missing.
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

5
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx']
CPU times: user 25.3 s, sys: 979 ms, total: 26.3 s
Wall time: 26.7 s


In [32]:
# Number of past observations per sample (the time axis of the model input).
samples_back_included = 10
num_features = 108 # length of X only; the submission data_generator below does not append y
batch_size = 256

# Half-open timestamp ranges [start, end).
train_bounds = (0, 400)
test_bounds = (400, 700) # 1812
epochs = 5
# Samples per epoch: half of the rows inside train_bounds, rounded down to a
# whole number of batches.
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / 2) * batch_size
# All rows inside test_bounds, rounded down to a whole number of batches.
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size

# Clipping bounds; presumably meant for the predicted y — TODO confirm, they
# are not used by the GRU cells in this section.
low_y_cut = -0.075
high_y_cut = 0.075

In [33]:
%%time
# Fill missing values with the per-column mean.
data = data.fillna(data.mean())
means, stds = data.mean(), data.std()
# Leave id, timestamp and y untouched by the standardisation below
# (subtracting 0 and dividing by 1 is a no-op).
for column in ['id', 'timestamp', 'y']:
    means[column] = 0
    stds[column] = 1
# Standardise every other column; `means` and `stds` are reused by the
# prediction loop further down.
# NOTE(review): not idempotent — re-running this cell standardises already
# standardised data and overwrites means/stds.
data = (data - means) / stds

CPU times: user 10.8 s, sys: 1.83 s, total: 12.7 s
Wall time: 12.6 s


In [35]:
%%time
# Two stacked GRU layers (64 then 128 units), each followed by batch
# normalisation, a sigmoid activation and 50% dropout, ending in a single
# linear output unit.
# NOTE(review): stateful=True carries recurrent state from one batch to the
# next and fixes the batch size via batch_input_shape — confirm both are
# intended for the submission loop.
model = Sequential()
model.add(GRU(64, batch_input_shape=[batch_size, samples_back_included, num_features], return_sequences=True, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(GRU(128, return_sequences=False, stateful=True))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

# Optimise 1 - R^2 (loss_r2_1d) and report the signed-root R score as a metric.
model.compile(loss=loss_r2_1d,
              optimizer='adam',
              metrics=[loss_r_score])

CPU times: user 290 ms, sys: 23.2 ms, total: 313 ms
Wall time: 333 ms


In [36]:
def data_generator(data, *, samples_back=None, batch=None):
    """Endlessly yield ``(X, y)`` batches of feature look-back windows.

    For every row of every id a sample is built from the feature rows of the
    ``window`` preceding observations of that id; shorter histories are
    front-padded with the column means of ``data``. The target is the row's
    own ``y``. Unlike a per-pass reset, samples left over at the end of one
    pass over the data are carried into the first batch of the next pass.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'id'`` and ``'y'``; every column other than ``'id'``,
        ``'timestamp'`` and ``'y'`` is treated as a feature.
    samples_back : int, optional (keyword-only)
        Window length. Defaults to the notebook-level ``samples_back_included``.
        The padding previously assumed a length of 10 regardless of that value.
    batch : int, optional (keyword-only)
        Batch size. Defaults to the notebook-level ``batch_size``.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(batch, window, n_features)`` and ``y`` of shape
        ``(batch, 1)``.

    Raises
    ------
    ValueError
        If the sizes are invalid, or a pass over the data produces no samples
        at all (the generator would otherwise loop forever).
    """
    window = samples_back_included if samples_back is None else samples_back
    per_batch = batch_size if batch is None else batch
    if window < 0 or per_batch < 1:
        raise ValueError('samples_back must be >= 0 and batch must be >= 1')

    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_empty_row = data[X_columns].mean().values
    y_empty_row = data[y_columns].mean().values
    X = []
    y = []
    while True:
        produced_any = False
        # groupby visits ids in ascending order (like np.unique) and keeps the
        # row order within each id, without a full boolean scan per id.
        for _, data_for_id in data.groupby('id'):
            data_for_id_X = data_for_id[X_columns].values
            data_for_id_y = data_for_id[y_columns].values
            for i in range(len(data_for_id)):
                X_to_use = data_for_id_X[max(i - window, 0):i]
                # Front-pad short histories with the mean row.
                if len(X_to_use) != window:
                    X_to_use = np.concatenate((np.array([X_empty_row] * (window - len(X_to_use))), X_to_use), axis=0)
                X.append(X_to_use)
                y.append(data_for_id_y[i])
                produced_any = True
                if len(X) == per_batch:
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []
        if not produced_any:
            raise ValueError('data contains no rows to generate samples from')

In [None]:
%%time
train_gen = data_generator(data, train_bounds[0], train_bounds[1])
model.fit_generator(train_gen, samples_per_epoch=train_samples, nb_epoch=epochs, verbose=1)

In [90]:
%%time
# Step through the kagglegym environment until it reports completion.
while True:
    target = observation.target
    features = observation.features
    # Same preprocessing as for training, except that missing values are
    # filled with the means of the current observation's features.
    features = features.fillna(features.mean())
    features = (features - means) / stds
    # NOTE(review): the normalised `features` are never used and no model
    # prediction is written into `target`; it is submitted exactly as provided.
    
    
    
    observation, reward, done, info = env.step(target)
    if done:
        print("Finished, reward: ", info["public_score"])
        break
    # Print the reward whenever the first row's timestamp is a multiple of 100.
    if observation.features.timestamp[0] % 100 == 0:
        print(reward)

-0.11128681189
0.0825276854025
-0.0548432430466
-0.15183885798
-0.0923139783666
-0.0825648098422
-0.107279092471
-0.0520323144763
-0.360260755881
-0.197959841749
-0.0816455593487
-0.133597131291
-0.138718911415
0.0298962237767
-0.12717418969
-0.114115171981
-0.0943293327787
-0.0396678680466


KeyboardInterrupt: 

In [27]:
%%time
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

env = kagglegym.make()
o = env.reset()
excl = ['id', 'sample', 'y', 'timestamp']
col = [c for c in o.train.columns if c not in excl]

train = pd.read_hdf('data/train.h5')
train = train[col]
d_mean= train.median(axis=0)

train = o.train[col]
n = train.isnull().sum(axis=1)
for c in train.columns:
    train[c + '_nan_'] = pd.isnull(train[c])
    d_mean[c + '_nan_'] = 0
train = train.fillna(d_mean)
train['znull'] = n
n = []

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 15.8 s, sys: 12.5 s, total: 28.3 s
Wall time: 38.2 s


In [28]:
%%time

# Model 1: extra-trees regressor on the filled features plus the missing-value
# indicator columns built in the previous cell.
rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train, o.train['y'])

#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
# Model 2: univariate linear regression on technical_20, fitted only on rows
# whose target lies within [low_y_cut, high_y_cut].
low_y_cut = -0.075
high_y_cut = 0.075
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
model2 = LinearRegression(n_jobs=-1)
model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])
# Drop the reference to the training feature frame.
train = []

#https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
# Median target per id (despite the ymean name), blended into the predictions later.
ymean_dict = dict(o.train.groupby(["id"])["y"].median())

CPU times: user 36min 4s, sys: 21.5 s, total: 36min 25s
Wall time: 11min 14s


In [29]:
%%time
while True:
    test = o.features[col]
    n = test.isnull().sum(axis=1)
    for c in test.columns:
        test[c + '_nan_'] = pd.isnull(test[c])
    test = test.fillna(d_mean)
    test['znull'] = n
    pred = o.target
    test2 = np.array(o.features[col].fillna(d_mean)['technical_20'].values).reshape(-1,1)
    pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
    pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
    pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
    o, reward, done, info = env.step(pred)
    if done:
        print("el fin ...", info["public_score"])
        break
    if o.features.timestamp[0] % 100 == 0:
        print(reward)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


-0.317732370655
-0.182276665545
-0.168791068514
0.0110777780049
-0.124967695052
-0.166028695524
-0.208487325573
-0.304625644472
-0.0215625001216
el fin ... 0.0215442626819
CPU times: user 19min 27s, sys: 14.5 s, total: 19min 41s
Wall time: 21min 14s
