In [1]:
import pandas as pd
import numpy as np
import math
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM, GRU, TimeDistributedDense
from keras import backend as K
import kagglegym
import time
import gc

Using Theano backend.


In [2]:
def get_nan_distributions(features):
    missing_values = {}
    for row in features.values:
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    
    # TODO this sucks, fix it
    cutoff = 1000
    high_frequency_nan_distributions_filtered = []
    while len(high_frequency_nan_distributions_filtered) < 5:
        high_frequency_nan_distributions_filtered = []
        for dist, count in high_frequency_nan_distributions:
            new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
            if ((count**0.5) * (new**2)) > cutoff:
                high_frequency_nan_distributions_filtered.append(dist)
        cutoff -= 20
    
    return np.array(high_frequency_nan_distributions_filtered)

def add_nan_distributions_and_normalize_old(data):
    features = data.drop(['id', 'timestamp', 'y'], axis=1)
    nan_distributions = get_nan_distributions(features, data)
    best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
    nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
    nan_features[np.arange(len(best_distributions)), best_distributions] = 1
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    nan_features = pd.DataFrame(nan_features, columns=nan_columns)
    data = data.fillna(data.mean())
    
    means, stds = data.mean(), data.std()
    non_feature_columns = ['id', 'timestamp', 'y']
    for column in non_feature_columns:
        means[column] = 0
        stds[column] = 1
    data = (data - means) / stds
    new_data = data.join(nan_features)
    
    means = means.drop(non_feature_columns)
    stds = stds.drop(non_feature_columns)
    return new_data, nan_distributions, means, stds


columns_to_use = ['technical_30', 'technical_20', 'fundamental_11', 'technical_19']

def add_nan_distributions_and_normalize(data):
    #low_y_cut = -0.086093
    #high_y_cut = 0.093497
    #y_is_above_cut = (data.y > high_y_cut)
    #y_is_below_cut = (data.y < low_y_cut)
    #y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
    #data = data.loc[y_is_within_cut]
    #data.index = list(range(len(data)))
    
    for c in data.columns:
        data[c + '_nan_'] = pd.isnull(data[c])
    n = data.isnull().sum(axis=1)
    train['znull'] = n
    
    features = data[columns_to_use]
    nan_distributions = get_nan_distributions(features)
    best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
    nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
    nan_features[np.arange(len(best_distributions)), best_distributions] = 1
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    nan_features = pd.DataFrame(nan_features, columns=nan_columns)
    
    non_feature_columns = ['id', 'timestamp', 'y']
    means, stds = features.mean(), features.std()
    features = (features.fillna(means) - means) / stds
    new_data = features.join(nan_features).join(data[non_feature_columns])
    
    means = means[columns_to_use]
    stds = stds[columns_to_use]
    return new_data, nan_distributions, means, stds

In [3]:
def loss_r2(y_true, y_pred):
    u = K.mean(y_true, axis=-1)
    u = K.expand_dims(u, y_true.ndim - 1)
    u = K.repeat_elements(u, y_true.shape[-1], axis=-1)
    r2 = 1 - K.sum(K.square(y_pred - y_true), axis=-1) / K.sum(K.square(y_true - u), axis=-1)
    r = (K.sign(r2)*K.sqrt(K.abs(r2)))
    return K.clip(r, -1., 1.)

def loss_r2_1d(y_true, y_pred):
    u = K.mean(y_true)
    return K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))

def loss_r_score(y_true, y_pred):
    u = K.mean(y_true)
    r2 = 1 - K.sum(K.square(y_pred - y_true)) / K.sum(K.square(y_true - u))
    return (K.sign(r2)*K.sqrt(K.abs(r2)))
    #return K.clip(r, -1., 1.)

# For Submission

In [4]:
def data_generator(data_for_id_X, data_for_id_y, id_range, samples_back_included, batch_size):
    X = []
    y = []
    while True:
        for j in range(id_range):
            for i in range(samples_back_included, len(data_for_id_X[j])):
                X.append(data_for_id_X[j][max(i - samples_back_included, 0):i])
                y.append(data_for_id_y[j][i-1:i][0][0])
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X = []
                    y = []

def create_and_train_model(env, data, batch_size, epochs, passes, samples_back_included, dropout, first_layer_size, second_layer_size, first_layer_type, second_layer_type, optimizer):    
    ids = np.unique(data.id)
    train_samples = int(len(data) / batch_size / (epochs / passes)) * batch_size
    
    id_range = int(max(ids) - min(ids) + 1)
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_padding = np.zeros((samples_back_included - 1, len(X_columns)), dtype=np.float)
    y_padding = np.zeros((samples_back_included - 1, 1), dtype=np.float)

    data_for_id_X = [[] for i in range(id_range)]
    data_for_id_y = [[] for i in range(id_range)]

    for item_id in range(id_range):
        data_for_id_X[int(item_id)] = X_padding
        data_for_id_y[int(item_id)] = y_padding

    for item_id in ids:
        data_for_id_X[int(item_id)] = np.concatenate((X_padding, data[data.id == item_id][X_columns].values), axis=0)
        data_for_id_y[int(item_id)] = np.concatenate((y_padding, data[data.id == item_id][y_columns].values), axis=0)

    data = None
    gc.collect()
    
    model = Sequential()
    model.add(first_layer_type(first_layer_size, batch_input_shape=[batch_size, samples_back_included, len(X_columns)], return_sequences=True, stateful=True))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dropout(dropout))
    model.add(second_layer_type(second_layer_size, return_sequences=False, stateful=True))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dropout(dropout))
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss=loss_r2_1d,
                  optimizer=optimizer,
                  metrics=[loss_r_score])

    train_gen = data_generator(data_for_id_X, data_for_id_y, id_range, samples_back_included, batch_size)
    history = model.fit_generator(train_gen, samples_per_epoch=train_samples, nb_epoch=epochs, verbose=0)

    return data_for_id_X, model, history

targets = []
def test_model(env, observation, nan_distributions, means, stds, data_for_id_X, model, batch_size, low_y_cut, high_y_cut, samples_back_included):
    global targets
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    num_features = len(columns_to_use) + len(nan_distributions)
    X_padding = np.zeros((samples_back_included - 1, num_features), dtype=np.float)
    full_reward = 0
    while True:
        target = observation.target
        ids = observation.features['id'].values
        features = observation.features[columns_to_use]
        
        best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
        nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
        nan_features[np.arange(len(best_distributions)), best_distributions] = 1
        nan_features = pd.DataFrame(nan_features, columns=nan_columns)
        
        features = features.fillna(means)
        features = (features - means) / stds
        features = features.join(nan_features)
        for row_id, row in zip(ids, features.values):
            if row_id >= len(data_for_id_X):
                difference = row_id - len(data_for_id_X) + 1
                data_for_id_X.extend([X_padding] * difference)
            data_for_id_X[row_id] = np.concatenate((data_for_id_X[row_id], [row]), axis=0)

        X_to_predict = np.array([data_for_id_X[row_id][-samples_back_included:] for row_id in ids])

        # add extra rows to fit batch_size
        batches = math.ceil(len(X_to_predict) / batch_size)
        extra_predictions = batch_size * batches - len(X_to_predict)
        X_to_predict = np.concatenate((X_to_predict, np.zeros((extra_predictions, samples_back_included, num_features), dtype=np.int)), axis=0)

        target_size = len(target.y)
        target.y = model.predict(X_to_predict, batch_size=batch_size)[:target_size].clip(low_y_cut, high_y_cut)
        targets.append(target)
        observation, reward, done, info = env.step(target)
        if done:
            print("Finished, reward: ", info["public_score"])
            return info["public_score"]
        full_reward += reward
        if observation.features.timestamp[0] % 100 == 0:
            #print(full_reward / 100)
            full_reward = 0

In [5]:
results_array = np.array([]) #pd.read_csv('data/model_results').values
#pd.DataFrame(results_array).to_csv('data/model_results', index=False)

In [15]:
def test_params(args):
    global results_array
    start_time = time.time()
    results_dict_key = ', '.join('{0}: {1}'.format(key, args[key]) for key in args)
    default_args = {
        'batch_size': 256,
        'epochs': 4,
        'passes': 2,
        'low_y_cut': -0.075,
        'high_y_cut': 0.075,
        'samples_back_included': 8,
        'dropout': 0.35,
        'first_layer_size': 32,
        'second_layer_size': 64,
        'first_layer_type': GRU,
        'second_layer_type': GRU,
        'optimizer': 'adam'
    }
    env = kagglegym.make()
    observation = env.reset()
    data, nan_distributions, means, stds = add_nan_distributions_and_normalize(observation.train)
    
    train_args = dict((arg, args[arg] if arg in args else default_args[arg]) for arg in default_args if arg in ('batch_size', 'epochs', 'passes', 'samples_back_included', 'dropout', 'first_layer_size', 'second_layer_size', 'first_layer_type', 'second_layer_type', 'optimizer'))
    data_for_id_X, model, history = create_and_train_model(env, data, **train_args)

    test_args = dict((arg, args[arg] if arg in args else default_args[arg]) for arg in default_args if arg in ('batch_size', 'samples_back_included', 'low_y_cut', 'high_y_cut'))
    test_model(env, observation, nan_distributions, means, stds, data_for_id_X, model, **test_args)

In [16]:
%%time
test_params({
})
# -0.035

Finished, reward:  -0.0253341587343
CPU times: user 10min 15s, sys: 1min 36s, total: 11min 52s
Wall time: 11min 7s


In [17]:
%%time
test_params({
    'dropout': 0.4
})

KeyboardInterrupt: 

In [None]:
%%time
test_params({
    'first_layer_type': LSTM,
    'second_layer_type': LSTM,
    'dropout': 0.4
})

In [None]:
%%time
test_params({
    'first_layer_type': LSTM,
    'second_layer_type': LSTM,
    'dropout': 0.4,
    'optimizer': 'adagrad'
})

# TODOS
- get nan distributions in a way where total num is more constant and not iterative
- do state correctly (prime a bunch of states?)
- test optimizers/ltsm

# Benchmark

In [None]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

def run_benchmark():
    env = kagglegym.make()
    o = env.reset()
    excl = ['id', 'sample', 'y', 'timestamp']
    col = [c for c in o.train.columns if c not in excl]

    train = pd.read_hdf('data/train.h5')
    train = train[col]
    d_mean= train.median(axis=0)

    train = o.train[col]
    n = train.isnull().sum(axis=1)
    for c in train.columns:
        train[c + '_nan_'] = pd.isnull(train[c])
        d_mean[c + '_nan_'] = 0
    train = train.fillna(d_mean)
    train['znull'] = n
    n = []

    rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
    model1 = rfr.fit(train, o.train['y'])
    #print (model1.feature_importances_)
    train = []

    #https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
    low_y_cut = -0.075
    high_y_cut = 0.075
    # 0.075?
    y_is_above_cut = (o.train.y > high_y_cut)
    y_is_below_cut = (o.train.y < low_y_cut)
    y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
    model2 = LinearRegression(n_jobs=-1)
    model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])

    #https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
    ymean_dict = dict(o.train.groupby(["id"])["y"].median())

    full_reward = 0
    while True:
        test = o.features[col]
        n = test.isnull().sum(axis=1)
        for c in test.columns:
            test[c + '_nan_'] = pd.isnull(test[c])
        test = test.fillna(d_mean)
        test['znull'] = n
        pred = o.target
        test2 = np.array(o.features[col].fillna(d_mean)['technical_20'].values).reshape(-1,1)
        pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
        pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
        pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
        o, reward, done, info = env.step(pred)
        if done:
            print("el fin ...", info["public_score"])
            break

        full_reward += reward
        if o.features.timestamp[0] % 100 == 0:
            print(full_reward / 100)
            full_reward = 0
    
run_benchmark()

In [22]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

def run_benchmark():
    env = kagglegym.make()
    o = env.reset()
    excl = ['id', 'sample', 'y', 'timestamp']
    col = [c for c in o.train.columns if c not in excl]

    train = pd.read_hdf('data/train.h5')
    train = train[col]
    d_mean= train.median(axis=0)

    train = o.train[col]
    
    
    
    features = data[columns_to_use]
    nan_distributions = get_nan_distributions(features)
    best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
    nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
    nan_features[np.arange(len(best_distributions)), best_distributions] = 1
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    nan_features = pd.DataFrame(nan_features, columns=nan_columns)
    print (len(nan_features))
    train = train.join(nan_features)
    
    #n = train.isnull().sum(axis=1)
    #for c in train.columns:
    #    train[c + '_nan_'] = pd.isnull(train[c])
    #    d_mean[c + '_nan_'] = 0
    train = train.fillna(d_mean)
    #train['znull'] = n
    #n = []

    rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
    model1 = rfr.fit(train, o.train['y'])
    print (model1.feature_importances_)
    train = []

    #https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
    low_y_cut = -0.075
    high_y_cut = 0.075
    # 0.075?
    y_is_above_cut = (o.train.y > high_y_cut)
    y_is_below_cut = (o.train.y < low_y_cut)
    y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
    model2 = LinearRegression(n_jobs=-1)
    model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])

    #https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
    ymean_dict = dict(o.train.groupby(["id"])["y"].median())

    full_reward = 0
    while True:
        test = o.features[col]
        
        best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
        nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
        nan_features[np.arange(len(best_distributions)), best_distributions] = 1
        nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
        nan_features = pd.DataFrame(nan_features, columns=nan_columns)
        test = test.join(nan_features)
        #n = test.isnull().sum(axis=1)
        #for c in test.columns:
        #    test[c + '_nan_'] = pd.isnull(test[c])
        test = test.fillna(d_mean)
        #test['znull'] = n
        pred = o.target
        test2 = np.array(o.features[col].fillna(d_mean)['technical_20'].values).reshape(-1,1)
        pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
        pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
        pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
        o, reward, done, info = env.step(pred)
        if done:
            print("el fin ...", info["public_score"])
            break

        full_reward += reward
        if o.features.timestamp[0] % 100 == 0:
            print(full_reward / 100)
            full_reward = 0
    
run_benchmark()

KeyboardInterrupt: 

In [2]:
import kagglegym
import numpy as np
import math
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

def get_nan_distributions(features):
    missing_values = {}
    for row in features.values:
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    
    # TODO this sucks, fix it
    cutoff = 1000
    high_frequency_nan_distributions_filtered = []
    while len(high_frequency_nan_distributions_filtered) < 5:
        high_frequency_nan_distributions_filtered = []
        for dist, count in high_frequency_nan_distributions:
            new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
            if ((count**0.5) * (new**2)) > cutoff:
                high_frequency_nan_distributions_filtered.append(dist)
        cutoff -= 20
    
    return np.array(high_frequency_nan_distributions_filtered)

def add_nan_distributions_and_normalize_old(data):
    features = data.drop(['id', 'timestamp', 'y'], axis=1)
    nan_distributions = get_nan_distributions(features, data)
    best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
    nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
    nan_features[np.arange(len(best_distributions)), best_distributions] = 1
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    nan_features = pd.DataFrame(nan_features, columns=nan_columns)
    data = data.fillna(data.mean())
    
    means, stds = data.mean(), data.std()
    non_feature_columns = ['id', 'timestamp', 'y']
    for column in non_feature_columns:
        means[column] = 0
        stds[column] = 1
    data = (data - means) / stds
    new_data = data.join(nan_features)
    
    means = means.drop(non_feature_columns)
    stds = stds.drop(non_feature_columns)
    return new_data, nan_distributions, means, stds

env = kagglegym.make()
o = env.reset()
excl = ['id', 'sample', 'y', 'timestamp']
col = [c for c in o.train.columns if c not in excl]

train = pd.read_hdf('data/train.h5')
train = train[col]
d_mean= train.median(axis=0)

train = o.train[col]

columns_to_use = ['technical_30', 'technical_20', 'fundamental_11', 'technical_19']

features = train[columns_to_use]
nan_distributions = get_nan_distributions(features)

In [3]:
nan_distributions

array([[ True,  True,  True,  True],
       [ True,  True, False,  True],
       [False, False, False, False],
       [False, False,  True,  True],
       [False, False, False,  True]], dtype=bool)

In [4]:
best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
nan_features[np.arange(len(best_distributions)), best_distributions] = 1
nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
nan_features = pd.DataFrame(nan_features, columns=nan_columns)
print (len(nan_features))
train = train.join(nan_features)

806298


In [None]:

#n = train.isnull().sum(axis=1)
#for c in train.columns:
#    train[c + '_nan_'] = pd.isnull(train[c])
#    d_mean[c + '_nan_'] = 0
train = train.fillna(d_mean)
#train['znull'] = n
#n = []

rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train, o.train['y'])
print (model1.feature_importances_)
train = []

#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.075
high_y_cut = 0.075
# 0.075?
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
model2 = LinearRegression(n_jobs=-1)
model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])

#https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
ymean_dict = dict(o.train.groupby(["id"])["y"].median())

full_reward = 0
while True:
    test = o.features[col]

    best_distributions = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in features.isnull().values], axis=1)
    nan_features = np.zeros((len(best_distributions), len(nan_distributions)))
    nan_features[np.arange(len(best_distributions)), best_distributions] = 1
    nan_columns = ['nan_{}'.format(i) for i in range(len(nan_distributions))]
    nan_features = pd.DataFrame(nan_features, columns=nan_columns)
    test = test.join(nan_features)
    #n = test.isnull().sum(axis=1)
    #for c in test.columns:
    #    test[c + '_nan_'] = pd.isnull(test[c])
    test = test.fillna(d_mean)
    #test['znull'] = n
    pred = o.target
    test2 = np.array(o.features[col].fillna(d_mean)['technical_20'].values).reshape(-1,1)
    pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
    pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
    pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
    o, reward, done, info = env.step(pred)
    if done:
        print("el fin ...", info["public_score"])
        break

    full_reward += reward
    if o.features.timestamp[0] % 100 == 0:
        print(full_reward / 100)
        full_reward = 0

In [20]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

env = kagglegym.make()
o = env.reset()
excl = ['id', 'y', 'timestamp']
col = ['technical_30', 'technical_20', 'fundamental_11', 'technical_19']
#[c for c in o.train.columns if c not in excl]

train = pd.read_hdf('data/train.h5')
train = train[col][:2000]
d_mean= train.median(axis=0)

train = o.train[col]
n = train.isnull().sum(axis=1)
for c in col:
    train[c + '_nan_'] = pd.isnull(train[c])
    
train = train.fillna(d_mean)

new_data = np.zeros((0, 3*len(col)))
y = []
last_for_id = {}
for data_id in np.unique(o.train.id):
    data_filter = (o.train.id == data_id)
    data_filtered = train[col][data_filter].values
    nan_filtered = train[[c + '_nan_' for c in col]][data_filter].values
    timestamps_filtered = o.train[data_filter].timestamp.values
    y.extend(o.train[data_filter].y.values)
    data_prime = np.subtract(data_filtered, np.concatenate(([list(d_mean)], data_filtered), axis=0)[:-1])
    last_for_id[data_id] = data_filtered[-1]
    new_data_for_id = np.concatenate((data_filtered, nan_filtered, data_prime), axis=1)
    new_data = np.concatenate((new_data, new_data_for_id), axis=0)
train = pd.DataFrame(new_data, columns=col + [c + '_nan_' for c in col] + [c + '_prime_' for c in col])

for c in col:
    d_mean[c + '_nan_'] = 0
    d_mean[c + '_prime_'] = 0

train['znull'] = n
n = []

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
np.subtract([0, 0, 0, 0], last_for_id[0])

array([ 0.        ,  0.        , -0.16955942,  0.32682818])

In [55]:
test = o.features[col]
n = test.isnull().sum(axis=1)
for c in test.columns:
    test[c + '_nan_'] = pd.isnull(test[c])
test = test.fillna(d_mean)

new_data = np.zeros((0, 4))
for row, row_id in zip(test[col].values, o.features.id.values):
    new_data_for_id = np.subtract(row, last_for_id[row_id])
    last_for_id[row_id] = row
    new_data = np.concatenate((new_data, [new_data_for_id]), axis=0)
new_data = pd.DataFrame(new_data, columns=[c + '_prime_' for c in col])
new_data.index = test.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [56]:
test.merge(new_data, left_index=True, right_index=True)

Unnamed: 0,technical_30,technical_20,fundamental_11,technical_19,technical_30_nan_,technical_20_nan_,fundamental_11_nan_,technical_19_nan_,technical_30_prime_,technical_20_prime_,fundamental_11_prime_,technical_19_prime_
0,0.000000,0.001180,0.167982,-0.313446,False,False,False,False,0.0,0.0,0.0,0.0
1,0.000000,0.000000,-0.618785,0.252598,False,False,False,False,0.0,0.0,0.0,0.0
2,0.000000,0.000000,-0.138245,-0.096262,False,False,False,False,0.0,0.0,0.0,0.0
3,0.004063,0.000000,0.039430,0.247675,False,False,False,False,0.0,0.0,0.0,0.0
4,0.000000,0.001400,-0.061658,-0.017085,False,False,True,False,0.0,0.0,0.0,0.0
5,0.000000,0.000000,-0.061658,0.244208,False,False,True,False,0.0,0.0,0.0,0.0
6,0.000000,0.005155,-0.699610,-1.622134,False,False,False,False,0.0,0.0,0.0,0.0
7,0.000000,0.001837,0.073283,0.018510,False,False,False,False,0.0,0.0,0.0,0.0
8,0.001300,0.000000,-0.018406,0.340568,False,False,False,False,0.0,0.0,0.0,0.0
9,0.000000,0.000000,-0.192703,0.026843,False,False,False,False,0.0,0.0,0.0,0.0


In [41]:
last_for_id[row_id]

[0.0, 0.00117996, 0.16798198, -0.31344575]