In [1]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import kagglegym

Using Theano backend.


In [2]:
def get_high_frequency_nan_distributions(data):
    missing_values = {}
    for row, row_id in zip(data.drop('y', 1).values, data['id'].values):
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    return high_frequency_nan_distributions

In [3]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return np.array(high_frequency_nan_distributions_filtered)

In [4]:
def get_split_data(data):
    has_y = 'y' in data.columns
    data_to_use = data.drop('y', 1) if has_y else data
    best_distribution = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in data_to_use.isnull().values], axis=1)
    nan_structures_split = [[best == i for best in best_distribution] for i in range(len(nan_distributions))]
    column_splits = [[column for column, included in zip(list(data_to_use.columns), distribution) if included] + (['y'] if has_y else []) for distribution in nan_distributions]
    return [data[split][columns] for split, distribution, columns in zip(nan_structures_split, nan_distributions, column_splits)]

In [5]:
def train_and_test_model(model_generator, data_split_further):
    clfs = []
    total_samples = 0
    total_score = 0.
    for X_train, y_train, X_test, y_test in data_split_further:
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        #print ('Trained on {0}, Tested on {1}'.format(len(X_train), len(X_test)))
        if len(X_test.values) > 0: 
            test_score = clf.score(X_test.values, y_test.values)
            train_score = clf.score(X_train.values, y_train.values)
            #print ('Test Score {0}, Training Score {1}'.format(test_score, train_score))
            total_samples += len(X_test)
            total_score += test_score * len(X_test)
        clfs.append(clf)
    #print ('Average Score ', total_score / total_samples)
    return (clfs, total_score / total_samples)

def train_model(model_generator, data_split):
    clfs = []
    for train in data_split:
        X_train = train.drop('y', axis=1).fillna(0)
        y_train = train['y']
        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        clfs.append(clf)
    return clfs

# For Testing

In [6]:
%%time
data = pd.read_hdf('data/train.h5')
cutoff = int(len(data) * 0.7)

CPU times: user 88.2 ms, sys: 726 ms, total: 815 ms
Wall time: 1.5 s


In [7]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
print ([(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions])

6
['                                                                                                              ', '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ', '        x   x                   x           x                  x   x                                          ', '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxxxxxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ', '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx', '  xxxxx x x x  x  x    x  x x xxxxx  xx  x  x x      x x  x x  xx  x x        x               x               ']
CPU times: user 37.3 s, sys: 860 ms, total: 38.1 s
Wall time: 41.7 s


In [8]:
def get_x_history(data):
    X = []
    y = []
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    for item_id in np.unique(data.id):
        data_for_id = data[data.id == item_id]
        data_for_id_X = data_for_id[X_columns].values
        data_for_id_y = data_for_id[y_columns].values
        for i in range(len(data_for_id)):
            X_to_use = data_for_id_X[max(i - samples_back_included, 0):i]
            y_to_use = data_for_id_y[max(i - samples_back_included - 1, 0):max(i - 1, 0)]
            if len(y_to_use) != 10 or len(X_to_use) != 10:
                X_to_use = np.concatenate((np.full((10 - len(X_to_use), len(X_columns)), np.nan), X_to_use), axis=0)
                y_to_use = np.concatenate((np.full((10 - len(y_to_use), len(y_columns)), np.nan), y_to_use), axis=0)
            X.append(np.concatenate((X_to_use, y_to_use), axis=1))
        y.extend(data_for_id.y.values)
    return X, np.array(y)

In [None]:
def get_x_history(data):
    X = []
    y = []
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    for item_id in np.unique(data.id):
        data_for_id = data[data.id == item_id]
        data_for_id_X = data_for_id[X_columns].values
        data_for_id_y = data_for_id[y_columns].values
        for i in range(len(data_for_id)):
            X_to_use = data_for_id_X[max(i - samples_back_included, 0):i]
            y_to_use = data_for_id_y[max(i - samples_back_included - 1, 0):max(i - 1, 0)]
            if len(y_to_use) != 10 or len(X_to_use) != 10:
                X_to_use = np.concatenate((np.full((10 - len(X_to_use), len(X_columns)), np.nan), X_to_use), axis=0)
                y_to_use = np.concatenate((np.full((10 - len(y_to_use), len(y_columns)), np.nan), y_to_use), axis=0)
            X.append(np.concatenate((X_to_use, y_to_use), axis=1))
        y.extend(data_for_id.y.values)
    return X, np.array(y)

In [34]:
samples_back_included = 10
num_features = 109 # length of X + 1 extra for y
batch_size = 32

In [93]:
def data_generator(data, timestamp_start, timestamp_end):
    X = []
    y = []
    X_columns = [item for item in data.columns if item not in ('id', 'timestamp', 'y')]
    y_columns = ['y']
    X_empty_row = data[X_columns].mean().values
    y_empty_row = data[y_columns].mean().values
    for item_id in np.unique(data.id):
        data_for_id = data[data.id == item_id]
        data_for_id_X = data_for_id[X_columns].values
        data_for_id_y = data_for_id[y_columns].values
        data_for_id_timestamp = data_for_id['timestamp'].values
        for i in range(len(data_for_id)):
            if data_for_id_timestamp[i] < timestamp_start or data_for_id_timestamp[i] >= timestamp_end:
                continue
            X_to_use = data_for_id_X[max(i - samples_back_included, 0):i]
            y_to_use = data_for_id_y[max(i - samples_back_included - 1, 0):max(i - 1, 0)]
            if len(X_to_use) != 10:
                X_to_use = np.concatenate((np.array([X_empty_row] * (10 - len(X_to_use))), X_to_use), axis=0)
            if len(y_to_use) != 10:
                y_to_use = np.concatenate((np.array([y_empty_row] * (10 - len(y_to_use))), y_to_use), axis=0)
            X.append(np.concatenate((X_to_use, y_to_use), axis=1))
            y.append(data_for_id_y[i])
            if len(X) == batch_size:
                yield (np.array(X), np.array(y))
                X = []
                y = []

In [144]:
from keras import backend as K
from sklearn.metrics import r2_score

u=np.mean(real_y.y)
dem=np.sum((real_y.y.values-u)**2)
r2=1-tot/dem
if r2<0:
    print(-np.sqrt(-r2))
else:
    print(np.sqrt(r2))


def r_score_old(y_true, y_pred, sample_weight=None, multioutput=None):
    r2 = r2_score(y_true, y_pred, sample_weight=sample_weight,
                  multioutput=multioutput)
    r = (np.sign(r2)*np.sqrt(np.abs(r2)))
    if r <= -1:
        return -1
    else:
        return r
    
def r_score(y_true, y_pred):
    

In [103]:
model = Sequential()
model.add(LSTM(128, batch_input_shape=[batch_size, samples_back_included, num_features], stateful=True))
model.add(BatchNormalization())
model.add(Activation('softsign'))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(loss=r_score,
              optimizer='adam',
              metrics=['accuracy'])

In [65]:
%%time
data = data.fillna(data.mean())
#test_cutoff = int(len(data) / batch_size) * batch_size

CPU times: user 523 ms, sys: 421 ms, total: 944 ms
Wall time: 979 ms


In [136]:
train_bounds = (0, 1400)
test_bounds = (1400, 1812) # 1812
epochs = 8
train_samples = int(len(data[(data.timestamp >= train_bounds[0]) & (data.timestamp < train_bounds[1])]) / batch_size / epochs) * batch_size - batch_size
test_samples = int(len(data[(data.timestamp >= test_bounds[0]) & (data.timestamp < test_bounds[1])]) / batch_size) * batch_size - batch_size

#model.fit_generator(data_generator(data, train_bounds[0], train_bounds[1]), samples_per_epoch=train_samples, nb_epoch=epochs, verbose=1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x15158a898>

In [143]:
model.evaluate_generator(data_generator(data, test_bounds[0], test_bounds[1]), val_samples=13374)

[0.00076241167835395781, 0.0]

In [142]:
train_samples

160160

In [34]:
%%time
data_split_train = get_split_data(data[:cutoff])
data_split_test = get_split_data(data[cutoff:])
data_split_further = [[data_train.drop('y', axis=1), data_train[['id', 'timestamp', 'y']], data_test.drop('y', axis=1), data_test[]] for data_train, data_test in zip(data_split_train, data_split_test)]

CPU times: user 2min 4s, sys: 4.3 s, total: 2min 8s
Wall time: 2min 11s


In [None]:
%%time
et_clfs, score = train_and_test_model(lambda: ExtraTreesRegressor(n_estimators=25, max_depth=4, random_state=17, verbose=0), data_split_further)

In [71]:
X_train, y_train, X_test, y_test = data_split_further[4]
#X_train = X_train.fillna(0)
#scaler = MinMaxScaler(feature_range=(0, 1))
#X_train = scaler.fit_transform(X_train)
#joblib.dump(scaler, 'data/models/scaler_{}.pkl'.format(0))

#X_test = X_test.fillna(0)
#X_test = scaler.transform(X_test)

In [72]:
samples_back_included = 10
num_features = 111 # length of X + 1 extra for y
batch_size = 32

In [125]:
id_wanted = 10
timestamp = 5
X_to_use = X_train[(X_train.id == 10) & (X_train.timestamp < timestamp) & (X_train.timestamp > timestamp - samples_back_included)]
y_to_use = y_train[(y_train.id == 10) & (y_train.timestamp < timestamp - 1) & (y_train.timestamp > timestamp - samples_back_included - 1)]

#y_train[(y_train.id == 10) & (y_train.timestamp < 25) & (y_train.timestamp > 20)]

In [155]:
X_merged = pd.DataFrame(columns=columns)
y_merged = pd.DataFrame(columns=['y', 'timestamp'])
for X_train, y_train, X_test, y_test in data_split_further:
    X_for_id = X_train[(X_train.id == 10) & (X_train.timestamp < timestamp) & (X_train.timestamp > timestamp - samples_back_included)]
    y_for_id = y_train[(y_train.id == 10) & (y_train.timestamp < timestamp - 1) & (y_train.timestamp > timestamp - samples_back_included - 1)]
    X_merged = X_merged.append(X_for_id)
    y_merged = y_merged.append(y_for_id)

X_merged = X_merged.sort_values(['timestamp'], ascending=[True])
y_merged = y_merged.sort_values(['timestamp'], ascending=[True])

X_merged = pd.DataFrame(np.full((10 - len(X_merged), len(columns)), np.nan)).append(X_merged)
y_merged = pd.DataFrame(np.full((10 - len(y_merged), 2), np.nan)).append(y_merged)
X_merged.y = y_merged['y'].values
X_merged

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,technical_41,technical_42,technical_43,technical_44,technical_5,technical_6,technical_7,technical_9,timestamp,y
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,0.0,
750,,,,,,,,,,,...,,,,,,,,,1.0,-0.011753
1500,,,,,,,,,,,...,,,,,,,,,2.0,0.00585
2250,,,,,,,,,,,...,,,,,,,,,3.0,-0.000476
3000,,,,,,,,,,,...,,,,,,,,,4.0,0.005212


# For Submission

In [41]:
%%time
env = kagglegym.make()
observation = env.reset()
data = observation.train

CPU times: user 426 ms, sys: 2.84 s, total: 3.26 s
Wall time: 6.12 s


In [70]:
%%time
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data)
nan_distributions = filter_nan_distributions(500000, high_frequency_nan_distributions)
data_split = get_split_data(data)
clfs = train_model(lambda: ExtraTreesRegressor(n_estimators=25, max_depth=4, random_state=17, verbose=0), data_split)

CPU times: user 1min 14s, sys: 1.1 s, total: 1min 15s
Wall time: 1min 16s


In [90]:
%%time
n = 0
rewards = []
while True:
    target = observation.target
    features_split = get_split_data(observation.features)
    for X, clf in zip(features_split, clfs):
        X = X.fillna(0)
        y = clf.predict(X)
        for result_id, result in zip(X.id.values, y):
            target.loc[observation.target.id == result_id, 'y'] = result
    observation, reward, done, info = env.step(target)
    if done:
        break
    print (reward)
    rewards.append(reward)
    n = n + 1

print(info)
print(n)

-0.11128681189
0.0825276854025
-0.0548432430466
-0.15183885798
-0.0923139783666
-0.0825648098422
-0.107279092471
-0.0520323144763
-0.360260755881
-0.197959841749
-0.0816455593487
-0.133597131291
-0.138718911415
0.0298962237767
-0.12717418969
-0.114115171981
-0.0943293327787
-0.0396678680466


KeyboardInterrupt: 