In [22]:
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from xgboost import XGBRegressor

In [2]:
# Load the full training table from the local HDF5 file.
data = pd.read_hdf('data/train.h5')

In [3]:
# Hold out 20% of the distinct ids (not rows), so that every row of a given id
# ends up on the same side of the train/test split.
SPLIT_SEED = 17
ids = np.unique(data.id.values)
# A seeded, local generator makes the split reproducible after
# "Restart Kernel -> Run All" and leaves NumPy's global random state untouched.
np.random.RandomState(SPLIT_SEED).shuffle(ids)
cutoff = int(len(ids) * 0.8)
train_ids = ids[:cutoff]
test_ids = ids[cutoff:]

In [4]:
# Select rows by id membership. Series.isin performs one vectorised lookup per
# frame instead of scanning the id array once for every row in Python.
train = data[data.id.isin(train_ids)]
test = data[data.id.isin(test_ids)]

In [5]:
# Feature frames: only the target column 'y' is removed, so every other
# column of `data` (including `id`) is kept as a model input.
X_train = train.drop('y', axis=1)
X_test = test.drop('y', axis=1)

In [6]:
# Target series, row-aligned with X_train / X_test.
y_train = train['y']
y_test = test['y']

In [173]:
# Count how often each presence pattern occurs among the rows of X_test.
# A pattern is a tuple with one bool per column: True where the value is not NaN.
missing_values = {}
for sample in X_test.values:
    pattern = tuple(not math.isnan(cell) for cell in sample)
    missing_values[pattern] = missing_values.get(pattern, 0) + 1

# Sort (pattern, count) pairs by ascending count, keep the last 500 at most,
# then flip them so the most frequent pattern comes first.
patterns_by_count = sorted(missing_values.items(), key=lambda item: item[1])
high_frequency_nan_distributions = patterns_by_count[-500:][::-1]

In [254]:
def filter_nan_distributions(cutoff, distributions=None):
    """Greedily keep presence patterns that are both frequent and distinct.

    The candidates are visited in the order given. For each one, the Hamming
    distance to the closest pattern kept so far is computed; a candidate is
    kept when ``count * distance ** 2`` exceeds ``cutoff``. While nothing has
    been kept yet, the pattern length is used as the distance.

    Parameters
    ----------
    cutoff : number
        Threshold that ``count * distance ** 2`` must strictly exceed.
    distributions : iterable of (pattern, count) pairs, optional
        Candidate patterns (sequences of bools) with their row counts.
        Defaults to the module-level ``high_frequency_nan_distributions``.

    Returns
    -------
    list
        The kept patterns, in the order they were accepted.
    """
    if distributions is None:
        distributions = high_frequency_nan_distributions

    kept = []
    for dist, count in distributions:
        # Number of differing positions against every pattern already kept.
        distances = [sum(a != b for a, b in zip(dist, other)) for other in kept]
        # len(dist) is the largest possible distance and covers the empty case.
        nearest = min(distances + [len(dist)])
        if count * nearest * nearest > cutoff:
            kept.append(dist)
    return kept

In [255]:
# TODO: do not hard-code the cutoff (300000 here); choose it so that about 5 buckets remain.
nan_distributions = filter_nan_distributions(300000)

In [256]:
# Render each retained pattern as one string: ' ' = value present, 'x' = value missing.
[''.join(' ' if present else 'x' for present in pattern) for pattern in nan_distributions]

['                                                                                                              ',
 '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ',
 '        x  xx                   x           x                  x   x                                          ',
 '    x    xxx     x          x x   x  x   x             x x   xx                                               ',
 '  xxxxx xxxxxxxx xxxxxxxxxx xxxxxxxxxxx xxxxxxx  xxxxxxxxxx xxxxxxxx x                                        ',
 '        x x x  x  x       x   xxxxx  xx xx  x x      x   xx    xx  x x                                        ',
 '  xxxxx x x x  x              xxxxx  x   x  x        x      x  x   x x        x                               ',
 '        x x x     x    x  x x   xxx  xx xx  x x        x xx    xx  x xx x  x xxxx xxxx        x  x    xxx  xx ',
 '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [None]:
# Debug view for two test rows: per retained pattern, a weighted match score and
# the number of columns that are NaN in the row but marked present in the pattern,
# followed by the row's own NaN layout ('x' = NaN).
for sample in X_test.values[60000:60002]:
    sample_is_nan = [math.isnan(value) for value in sample]
    weighted_scores = []
    nan_where_present = []
    for pattern in nan_distributions:
        pairs = list(zip(sample_is_nan, pattern))
        weighted_scores.append(
            sum(int(is_nan == present) + 5 * int(is_nan and present) for is_nan, present in pairs)
        )
        nan_where_present.append(sum(int(is_nan and present) for is_nan, present in pairs))
    print(weighted_scores)
    print(nan_where_present)
    print('\'' + ''.join('x' if is_nan else ' ' for is_nan in sample_is_nan) + '\'')

In [260]:
# Assign every training row to the retained presence pattern it fits best.
# Patterns hold True where a value is present (not NaN).
train_is_nan = np.isnan(X_train.values)
train_nan_patterns = np.array(nan_distributions, dtype=bool)
# cost[i, j] =     (# columns present in row i that pattern j marks missing)
#            + 6 * (# columns NaN in row i that pattern j marks present)
# Two integer matrix products replace a Python-level loop over every cell.
train_cost = (
    np.dot((~train_is_nan).astype(np.int32), (~train_nan_patterns).T.astype(np.int32))
    + 6 * np.dot(train_is_nan.astype(np.int32), train_nan_patterns.T.astype(np.int32))
)
# argmin returns the first minimum, so ties go to the earlier pattern.
best_distribution = np.argmin(train_cost, axis=1)
# One boolean row mask per pattern, then the matching feature/target subsets.
nan_structures_split = [best_distribution == i for i in range(len(nan_distributions))]
X_train_split = [X_train[split] for split in nan_structures_split]
y_train_split = [y_train[split] for split in nan_structures_split]

[6, 40, 13, 20, 59, 23, 22, 40, 105]
[1, 1, 1, 1, 0, 0, 0, 0, 0]
'                                                                     x                                        '
[246, 224, 211, 211, 145, 130, 178, 0, 65]
[41, 33, 35, 34, 18, 21, 28, 0, 0]
'        x x x     x    x  x x   xxx  xx xx  x x        x xx    xx  x xx x  x xxxx xxxx        x  x    xxx  xx '
[0, 34, 7, 14, 60, 24, 23, 41, 106]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
'                                                                                                              '
[0, 34, 7, 14, 60, 24, 23, 41, 106]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
'                                                                                                              '
[0, 34, 7, 14, 60, 24, 23, 41, 106]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
'                                                                                                              '


In [None]:
# Assign every held-out row to the retained presence pattern it fits best,
# using the same cost as for the training rows.
# Patterns hold True where a value is present (not NaN).
test_is_nan = np.isnan(X_test.values)
test_nan_patterns = np.array(nan_distributions, dtype=bool)
# cost[i, j] =     (# columns present in row i that pattern j marks missing)
#            + 6 * (# columns NaN in row i that pattern j marks present)
# Two integer matrix products replace a Python-level loop over every cell.
test_cost = (
    np.dot((~test_is_nan).astype(np.int32), (~test_nan_patterns).T.astype(np.int32))
    + 6 * np.dot(test_is_nan.astype(np.int32), test_nan_patterns.T.astype(np.int32))
)
# argmin returns the first minimum, so ties go to the earlier pattern.
best_distribution_test = np.argmin(test_cost, axis=1)
# One boolean row mask per pattern, then the matching feature/target subsets.
nan_structures_split_test = [best_distribution_test == i for i in range(len(nan_distributions))]
X_test_split = [X_test[split] for split in nan_structures_split_test]
y_test_split = [y_test[split] for split in nan_structures_split_test]

In [None]:
# Baseline: random forest with default hyper-parameters, fitted on all training
# rows and scored on the rows of the held-out ids.
# NOTE(review): no random_state is passed, so the score can differ between runs.
clf = RandomForestRegressor()
clf.fit(X_train.values, y_train.values)
clf.score(X_test.values, y_test.values)

In [None]:
# `y` is treated as a continuous target by every other model in this notebook
# (all of them are regressors), so a gradient-boosted regressor is fitted here
# rather than a classifier. The variable name is kept for later cells.
# Only the first 20000 rows of each split are used to keep the cell fast.
xgb_clf = XGBRegressor()
xgb_clf.fit(X_train.values[:20000], y_train.values[:20000])
xgb_clf.score(X_test.values[:20000], y_test.values[:20000])

In [None]:
# Extra-trees ensemble: 100 trees limited to depth 4, fixed seed (17), all cores.
# Fitted on all training rows and scored on the rows of the held-out ids.
et_clf = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
et_clf.fit(X_train.values, y_train.values)
et_clf.score(X_test.values, y_test.values)

In [None]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

# Create the kagglegym environment and fetch the first observation.
env = kagglegym.make()
o = env.reset()
# Column names the environment exposes for id, sample, target and time;
# these are excluded from the model inputs.
excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
# Feature columns: every column of the observation's training frame that is not excluded.
col = [c for c in o.train.columns if c not in excl]

# Per-column fill values computed over the feature columns of the full training file.
# NOTE: despite its name, d_mean holds column medians, not means.
train = pd.read_hdf('../input/train.h5')
train = train[col]
d_mean= train.median(axis=0)

# Build the training matrix for model1: the raw features, one boolean
# NaN-indicator column per feature, and the per-row NaN count.
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of o.train (which raises SettingWithCopyWarning).
train = o.train[col].copy()
# Count NaNs per row before anything is filled.
n = train.isnull().sum(axis=1)
# Iterate the fixed feature list rather than train.columns, which grows in the loop.
for c in col:
    train[c + '_nan_'] = pd.isnull(train[c])
    # Register a fill value for the indicator column as well.
    d_mean[c + '_nan_'] = 0
train = train.fillna(d_mean)
train['znull'] = n
# Drop the reference to the per-row counts.
n = []

# model1: extra-trees regressor (the variable is called `rfr`, but the estimator
# is ExtraTreesRegressor) fitted on the augmented feature matrix against o.train['y'].
rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train, o.train['y'])

#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
# model2: univariate linear regression of y on 'technical_20', fitted only on
# rows whose target lies inside [low_y_cut, high_y_cut].
low_y_cut = -0.075
high_y_cut = 0.075
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
model2 = LinearRegression(n_jobs=-1)
# Missing feature values are filled from d_mean before the single column is
# extracted and reshaped to the (n_samples, 1) layout that fit() expects.
model2.fit(np.array(o.train[col].fillna(d_mean).loc[y_is_within_cut, 'technical_20'].values).reshape(-1,1), o.train.loc[y_is_within_cut, 'y'])
# Drop the reference to the training matrix.
train = []

#https://www.kaggle.com/ymcdull/two-sigma-financial-modeling/ridge-lb-0-0100659
# Per-id median of the training target (a median, despite the "ymean" name),
# keyed by id; blended into the predictions in the loop below.
ymean_dict = dict(o.train.groupby(["id"])["y"].median())

# Step through the environment: build features for the current observation,
# predict, submit, and stop when the environment reports completion.
while True:
    # .copy() yields an independent frame, so the indicator columns added below
    # do not write into a slice of o.features (SettingWithCopyWarning).
    test = o.features[col].copy()
    # Count NaNs per row before anything is filled.
    n = test.isnull().sum(axis=1)
    # Same column layout as the training matrix: features, indicators, znull.
    for c in col:
        test[c + '_nan_'] = pd.isnull(test[c])
    test = test.fillna(d_mean)
    test['znull'] = n
    pred = o.target
    # Only 'technical_20' feeds model2, so fill that single column instead of
    # filling the whole feature frame on every step.
    test2 = o.features['technical_20'].fillna(d_mean['technical_20']).values.reshape(-1, 1)
    # Blend the two clipped model outputs with weights 0.65 / 0.35.
    pred['y'] = (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.65) + (model2.predict(test2).clip(low_y_cut, high_y_cut) * 0.35)
    # Mix in 5% of the id's training median of y when that id is known.
    pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
    # Round to six decimals via string formatting.
    pred['y'] = [float(format(x, '.6f')) for x in pred['y']]
    o, reward, done, info = env.step(pred)
    if done:
        print("el fin ...", info["public_score"])
        break
    # NOTE(review): timestamp[0] is a label lookup; this assumes each
    # observation's feature frame is indexed from 0. TODO confirm.
    if o.features.timestamp[0] % 100 == 0:
        print(reward)