In [4]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import kagglegym



In [41]:
%%time
env = kagglegym.make()
observation = env.reset()
data = observation.train
data_length = 110

CPU times: user 426 ms, sys: 2.84 s, total: 3.26 s
Wall time: 6.12 s


In [53]:
def get_high_frequency_nan_distributions():
    missing_values = {}
    for row, row_id in zip(data.drop('y', 1).values, data['id'].values):
        key = tuple([not math.isnan(val) for val in row])
        if key in missing_values:
            missing_values[key] += 1
        else:
            missing_values[key] = 1

    high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
    high_frequency_nan_distributions.reverse()
    return high_frequency_nan_distributions

In [54]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return np.array(high_frequency_nan_distributions_filtered)

In [44]:
%%time
# TODO dont use 40000, select number based on wanting 5 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions()
nan_distributions = filter_nan_distributions(500000, high_frequency_nan_distributions)
# [(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions]

CPU times: user 28.6 s, sys: 905 ms, total: 29.5 s
Wall time: 31.5 s


In [69]:
def get_split_data(data):
    has_y = 'y' in data.columns
    data_to_use = data.drop('y', 1) if has_y else data
    best_distribution = np.argmin([[6 * np.sum(np.logical_and(row, dist)) - np.sum(np.logical_or(row, dist)) for dist in nan_distributions] for row in data_to_use.isnull().values], axis=1)
    nan_structures_split = [[best == i for best in best_distribution] for i in range(len(nan_distributions))]
    column_splits = [[column for column, included in zip(list(data_to_use.columns), distribution) if included] + (['y'] if has_y else []) for distribution in nan_distributions]
    return [data[split][columns] for split, distribution, columns in zip(nan_structures_split, nan_distributions, column_splits)]

In [70]:
%%time
data_split = get_split_data(data)

CPU times: user 1min 14s, sys: 1.1 s, total: 1min 15s
Wall time: 1min 16s


In [80]:
def train_and_test_model(model_generator):
    clfs = []
    total_samples = 0
    total_score = 0.
    for X_train, y_train, X_test, y_test in data_split_further:
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        print ('Trained on {0}, Tested on {1}'.format(len(X_train), len(X_test)))
        score = clf.score(X_test.values, y_test.values)
        print ('Score ', score)
        total_samples += len(X_test)
        total_score += score * len(X_test)
        clfs.append(clf)
    print ('Average Score ', total_score / total_samples)
    return clfs

def train_model(model_generator, data_split):
    clfs = []
    for train in data_split:
        X_train = train.drop('y', axis=1).fillna(0)
        y_train = train['y']
        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        clfs.append(clf)
    return clfs

In [81]:
%%time
et_clfs = train_model(lambda: ExtraTreesRegressor(n_estimators=25, max_depth=4, random_state=17, verbose=0), data_split)

CPU times: user 59.8 s, sys: 804 ms, total: 1min
Wall time: 1min 2s


In [None]:
n = 0
rewards = []
while True:
    target = observation.target
    features_split = get_split_data(observation.features)
    for X, clf in zip(features_split, et_clfs):
        X = X.fillna(0)
        y = clf.predict(X)
        for result_id, result in zip(X.id.values, y):
            target.loc[observation.target.id == result_id, 'y'] = result
    observation, reward, done, info = env.step(target)
    if done:
        break
    rewards.append(reward)
    n = n + 1

print(info)
print(n)
print(rewards[0:15])

In [13]:
%%time
xgb_clfs = train_and_test_model(lambda: XGBRegressor())

234871
66464
0.0230805670891
465897
106243
0.00475901717853
346164
83394
0.00891305878331
234260
72350
-0.191136060595
41236
5302
-0.0571993853776
42587
11988
0.0341132880557
CPU times: user 19min 3s, sys: 9.3 s, total: 19min 13s
Wall time: 5min 46s


In [None]:
%%time
data_split_further = []
for data in data_split:
    train = data[[row_id in train_ids for row_id in data.id]]
    test = data[[row_id in test_ids for row_id in data.id]]
    X_train = train.drop('y', axis=1)
    X_test = test.drop('y', axis=1)
    y_train = train['y']
    y_test = test['y']
    data_split_further.append([X_train, y_train, X_test, y_test])