In [1]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import kagglegym



In [2]:
def get_high_frequency_nan_distributions(data, max_patterns=500):
    """Count the per-row missing-value patterns and return the most frequent ones.

    A pattern is a tuple with one bool per column of ``data`` (the ``'y'``
    column excluded, column order preserved): ``True`` where the row has a
    value, ``False`` where it is NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``'y'`` column; every other column takes part
        in the pattern.
    max_patterns : int, optional
        Maximum number of patterns returned (default 500).

    Returns
    -------
    list of (tuple of bool, int)
        ``(pattern, row_count)`` pairs ordered from most to least frequent.
    """
    # ``axis`` must be a keyword: the positional form ``drop('y', 1)`` was
    # removed in pandas 2.0 and raises TypeError there.
    features = data.drop('y', axis=1)
    # True = value present, False = NaN; computed once for the whole frame
    # instead of calling math.isnan on every single cell.
    present = features.notnull().values

    pattern_counts = {}
    for row in present:
        # tolist() turns numpy bools into plain Python bools for the key.
        key = tuple(row.tolist())
        pattern_counts[key] = pattern_counts.get(key, 0) + 1

    # Ascending stable sort followed by reverse() keeps the ordering of
    # equally frequent patterns identical to the previous implementation.
    ranked = sorted(pattern_counts.items(), key=lambda key_value: key_value[1])
    ranked.reverse()
    return ranked[:max_patterns]

In [3]:
def filter_nan_distributions(cutoff, high_frequency_nan_distributions):
    """Greedily keep patterns that are frequent and different enough.

    Patterns are visited in the given order.  For each one, ``novelty`` is
    the smallest number of differing positions against any pattern already
    kept (or the pattern length when nothing has been kept yet).  The
    pattern is kept when ``count * novelty * novelty`` exceeds ``cutoff``.

    Parameters
    ----------
    cutoff : number
        Threshold the frequency/novelty product has to exceed.
    high_frequency_nan_distributions : iterable of (sequence of bool, int)
        ``(pattern, count)`` pairs.

    Returns
    -------
    numpy.ndarray
        The kept patterns, one per row, in the order they were accepted.
    """
    kept = []
    for pattern, frequency in high_frequency_nan_distributions:
        # Start from the worst case: every position differs.
        novelty = len(pattern)
        for chosen in kept:
            differing = sum(1 for left, right in zip(pattern, chosen) if left != right)
            if differing < novelty:
                novelty = differing
        if frequency * novelty * novelty > cutoff:
            kept.append(pattern)
    return np.array(kept)

In [4]:
def get_split_data(data, distributions=None):
    """Partition the rows of ``data`` by their closest missing-value pattern.

    Each pattern holds one bool per feature column (``True`` = the column is
    expected to be present).  For every row and pattern the score is

        6 * (#columns missing in the row AND expected by the pattern)
          - (#columns missing in the row OR expected by the pattern)

    and the row is assigned to the pattern with the lowest score (the first
    pattern wins ties).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.  A ``'y'`` column, when present, is ignored for the
        assignment and carried along into every returned frame.
    distributions : array-like of bool, shape (n_patterns, n_features), optional
        Patterns to split by.  Defaults to the notebook-level
        ``nan_distributions`` so existing one-argument calls keep working.

    Returns
    -------
    list of pandas.DataFrame
        One frame per pattern, holding the rows assigned to it and only the
        columns the pattern marks ``True`` (plus ``'y'`` when ``data`` has it).
        Empty when there are no patterns.
    """
    if distributions is None:
        distributions = nan_distributions
    distributions = np.asarray(distributions, dtype=bool)

    has_y = 'y' in data.columns
    # ``axis`` must be a keyword: the positional form ``drop('y', 1)`` was
    # removed in pandas 2.0.
    data_to_use = data.drop('y', axis=1) if has_y else data
    if len(distributions) == 0:
        return []

    missing = data_to_use.isnull().values
    missing_per_row = missing.sum(axis=1)

    # Same integer score as the former row-by-row loop, computed one whole
    # column of the (rows x patterns) score table at a time.
    scores = np.empty((len(missing), len(distributions)), dtype=np.int64)
    for index, distribution in enumerate(distributions):
        overlap = missing[:, distribution].sum(axis=1)
        # |A or B| = |A| + |B| - |A and B|
        union = missing_per_row + distribution.sum() - overlap
        scores[:, index] = 6 * overlap - union
    best_distribution = scores.argmin(axis=1)

    feature_names = list(data_to_use.columns)
    target_column = ['y'] if has_y else []
    splits = []
    for index, distribution in enumerate(distributions):
        columns = [name for name, included in zip(feature_names, distribution) if included]
        splits.append(data[best_distribution == index][columns + target_column])
    return splits

In [5]:
def train_and_test_model(model_generator, data_split_further, verbose=False):
    """Fit one model per split and score each on its held-out part.

    Parameters
    ----------
    model_generator : callable
        Zero-argument factory returning an unfitted estimator exposing
        ``fit(X, y)`` and ``score(X, y)``.
    data_split_further : iterable of (X_train, y_train, X_test, y_test)
        pandas objects for each split; NaNs in the feature frames are
        replaced by 0 before fitting and scoring.
    verbose : bool, optional
        Print per-split sizes and scores plus the final average.

    Returns
    -------
    (list, float)
        The fitted models (one per split, in input order) and the average
        test score weighted by the number of test rows.  The average is NaN
        when no split has any test rows.
    """
    clfs = []
    total_samples = 0
    total_score = 0.
    for X_train, y_train, X_test, y_test in data_split_further:
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        clf = model_generator()
        clf.fit(X_train.values, y_train.values)
        clfs.append(clf)
        if verbose:
            print('Trained on {0}, Tested on {1}'.format(len(X_train), len(X_test)))

        # Splits without held-out rows still contribute a model, but cannot
        # be scored.
        if len(X_test) == 0:
            continue
        test_score = clf.score(X_test.values, y_test.values)
        if verbose:
            print('Test Score {0}'.format(test_score))
        total_samples += len(X_test)
        total_score += test_score * len(X_test)

    # Avoid ZeroDivisionError when nothing could be scored.
    average_score = total_score / total_samples if total_samples else float('nan')
    if verbose:
        print('Average Score ', average_score)
    return (clfs, average_score)

def train_model(model_generator, data_split):
    """Fit one fresh model on every frame of ``data_split``.

    Each frame must contain a ``'y'`` column, which is used as the target;
    all remaining columns are the features, with NaNs replaced by 0.

    Parameters
    ----------
    model_generator : callable
        Zero-argument factory returning an unfitted estimator with ``fit``.
    data_split : iterable of pandas.DataFrame
        Training frames, one per model.

    Returns
    -------
    list
        The fitted models, in the order of ``data_split``.
    """
    def fit_one(frame):
        features = frame.drop('y', axis=1).fillna(0)
        model = model_generator()
        model.fit(features.values, frame['y'].values)
        return model

    return [fit_one(frame) for frame in data_split]

# For Testing

In [8]:
%%time
# Load the training table from the local HDF5 file.
data = pd.read_hdf('data/train.h5')
# Row index separating the first 70% of rows (used for fitting below) from
# the remaining 30% (used for evaluation below).
cutoff = int(len(data) * 0.7)

CPU times: user 8.68 ms, sys: 559 ms, total: 568 ms
Wall time: 566 ms


In [12]:
%%time
# TODO dont use 40000, select number based on wanting 6 buckets
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data[:cutoff])
nan_distributions = filter_nan_distributions(800000, high_frequency_nan_distributions)
print (len(nan_distributions))
[(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions]

6
CPU times: user 35.2 s, sys: 663 ms, total: 35.9 s
Wall time: 36.2 s


In [19]:
%%time
data_split_train = get_split_data(data[:cutoff])
data_split_test = get_split_data(data[cutoff:])
data_split_further = [[data_train.drop('y', axis=1), data_train['y'], data_test.drop('y', axis=1), data_test['y']] for data_train, data_test in zip(data_split_train, data_split_test)]

CPU times: user 2min 13s, sys: 2.59 s, total: 2min 16s
Wall time: 2min 17s


In [10]:
%%time
# Fit one ExtraTreesRegressor per pattern split and collect the fitted models
# together with the test score averaged over the splits (weighted by test rows).
# NOTE(review): this cell's execution count (10) is lower than that of the
# cell that builds `data_split_further` (19) — re-run the notebook top to
# bottom before trusting the output shown below.
et_clfs, score = train_and_test_model(lambda: ExtraTreesRegressor(n_estimators=25, max_depth=4, random_state=17, verbose=0), data_split_further)

Trained on 406104, Tested on 103212
Score  -0.000125896235152
Trained on 236201, Tested on 58994
Score  0.0038218333494
Trained on 293195, Tested on 83195
Score  0.00105752958045
Trained on 126163, Tested on 23253
Score  0.00154363319532
Trained on 27017, Tested on 5919
Score  -0.000888224471894
Trained on 41121, Tested on 10116
Score  0.000695710439493
Trained on 31691, Tested on 8800
Score  0.00163656553817
Trained on 153871, Tested on 31760
Score  0.000227875330331
Trained on 56250, Tested on 13894
Score  0.00156787962962
Average Score  0.0011250420427
CPU times: user 1min 40s, sys: 894 ms, total: 1min 41s
Wall time: 1min 41s


# For Submission

In [41]:
%%time
# Create the kagglegym environment and keep the observation returned by reset().
env = kagglegym.make()
observation = env.reset()
# NOTE: this rebinds `data`, which the testing section above assigned from
# 'data/train.h5', to the training frame carried by the observation.
data = observation.train

CPU times: user 426 ms, sys: 2.84 s, total: 3.26 s
Wall time: 6.12 s


In [70]:
%%time
# Derive the missing-value patterns from all rows of `data` (no hold-out here).
high_frequency_nan_distributions = get_high_frequency_nan_distributions(data)
# NOTE(review): the filter threshold is 500000 here but 800000 in the testing
# section — confirm the difference is intentional.
nan_distributions = filter_nan_distributions(500000, high_frequency_nan_distributions)
# get_split_data picks up the notebook-level `nan_distributions` assigned on
# the previous line.
data_split = get_split_data(data)
# One ExtraTreesRegressor per pattern split, same hyper-parameters as in the
# testing section.
clfs = train_model(lambda: ExtraTreesRegressor(n_estimators=25, max_depth=4, random_state=17, verbose=0), data_split)

CPU times: user 1min 14s, sys: 1.1 s, total: 1min 15s
Wall time: 1min 16s


In [90]:
%%time
n = 0
rewards = []
while True:
    target = observation.target
    features_split = get_split_data(observation.features)
    for X, clf in zip(features_split, clfs):
        X = X.fillna(0)
        y = clf.predict(X)
        for result_id, result in zip(X.id.values, y):
            target.loc[observation.target.id == result_id, 'y'] = result
    observation, reward, done, info = env.step(target)
    if done:
        break
    print (reward)
    rewards.append(reward)
    n = n + 1

print(info)
print(n)

-0.11128681189
0.0825276854025
-0.0548432430466
-0.15183885798
-0.0923139783666
-0.0825648098422
-0.107279092471
-0.0520323144763
-0.360260755881
-0.197959841749
-0.0816455593487
-0.133597131291
-0.138718911415
0.0298962237767
-0.12717418969
-0.114115171981
-0.0943293327787
-0.0396678680466


KeyboardInterrupt: 