In [1]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
data = pd.read_hdf('data/train.h5')

In [3]:
ids = np.unique(data.id.values)
np.random.shuffle(ids)
cutoff = int(len(ids) * 0.8)
train_ids = ids[:cutoff]
test_ids = ids[cutoff:]

In [4]:
%%time
train = data[[row_id in train_ids for row_id in data.id]]
test = data[[row_id in test_ids for row_id in data.id]]

In [5]:
%%time
X_train = train.drop('y', axis=1)
X_test = test.drop('y', axis=1)

In [6]:
%%time
y_train = train['y']
y_test = test['y']

In [8]:
%%time
missing_values = {}
for row in X_test.values:
    key = tuple([not math.isnan(val) for val in row])
    if key in missing_values:
        missing_values[key] += 1
    else:
        missing_values[key] = 1
        
high_frequency_nan_distributions = sorted([(key, missing_values[key]) for key in missing_values], key=lambda key_value: key_value[1])[-500:]
high_frequency_nan_distributions.reverse()

CPU times: user 10.2 s, sys: 118 ms, total: 10.3 s
Wall time: 10.4 s


In [9]:
def filter_nan_distributions(cutoff):
    high_frequency_nan_distributions_filtered = []
    for dist, count in high_frequency_nan_distributions:
        new = min([sum(int(a != b) for a, b in zip(dist, dist2)) for dist2 in high_frequency_nan_distributions_filtered] + [len(dist)])
        if new == -1 or (count * new * new) > cutoff:
            high_frequency_nan_distributions_filtered.append(dist)
    return high_frequency_nan_distributions_filtered

In [10]:
# TODO dont use 40000, select number based on wanting 5 buckets
nan_distributions = filter_nan_distributions(300000)

In [11]:
[(''.join([' ' if val else 'x' for val in row])) for row in nan_distributions]

['                                                                                                              ',
 '    x x  xxx  xx x xxxx     xxx   xxxx  xx x x   xx xx xxx  xxx   x                                           ',
 '        x  xx                   x           x                  x   x                                          ',
 '        x   x     x       x     x x   x x   x x          xx    xx  x                                          ',
 '  xx xx x x x  x       x      xxxxx  x   x  x        x         x   x x        x               x               ',
 '  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxx xxxxxxxxxx']

In [12]:
for row in X_test.values[60000:60002]:
    print ([sum(int(math.isnan(val) == b) + 5 * int(math.isnan(val) and b) for val, b in zip(row, dist)) for dist in nan_distributions])  
    print ([sum(int(math.isnan(val) and b) for val, b in zip(row, dist)) for dist in nan_distributions])  
    print ('\'' + ''.join(['x' if math.isnan(val) else ' ' for val in row]) + '\'') 

[0, 34, 7, 15, 23, 106]
[0, 0, 0, 0, 0, 0]
'                                                                                                              '
[252, 48, 210, 197, 170, 64]
[42, 8, 35, 32, 27, 0]
'    x x xxxxx xx x xxxx     xxx x xxxx  xx xxxx  xx xx xxx  xxxx  xx x                                        '


In [38]:
%%time
best_distribution = [np.argmin([sum(int(not math.isnan(val) and not b) + 6 * int(math.isnan(val) and b) for val, b in zip(row, dist)) for dist in nan_distributions]) for row in X_train.values]
nan_structures_split = [[best == i for best in best_distribution] for i in range(len(nan_distributions))]
X_train_split = [X_train[split][[column for column, included in zip(list(X_train.columns), distribution) if included]] for split, distribution in zip(nan_structures_split, nan_distributions)]
y_train_split = [y_train[split] for split in nan_structures_split]

CPU times: user 12min 53s, sys: 14.6 s, total: 13min 8s
Wall time: 14min 13s


In [41]:
%%time
best_distribution_test = [np.argmin([sum(int(not math.isnan(val) and not b) + 6 * int(math.isnan(val) and b) for val, b in zip(row, dist)) for dist in nan_distributions]) for row in X_test.values]
nan_structures_split_test = [[best == i for best in best_distribution_test] for i in range(len(nan_distributions))]
X_test_split = [X_test[split][[column for column, included in zip(list(X_test.columns), distribution) if included]] for split, distribution in zip(nan_structures_split_test, nan_distributions)]
y_test_split = [y_test[split] for split in nan_structures_split_test]

CPU times: user 3min 1s, sys: 3.09 s, total: 3min 4s
Wall time: 3min 7s


In [None]:
et_clfs = []
for X_train_section, y_train_section, X_test_section, y_test_section in zip(X_train_split, y_train_split, X_test_split, y_test_split):
    X_train_section = X_train_section.fillna(0)
    y_train_section = y_train_section.fillna(0)
    X_test_section = X_test_section.fillna(0)
    y_test_section = y_test_section.fillna(0)
    
    et_clf = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
    et_clf.fit(X_train_section.values, y_train_section.values)
    print (et_clf.score(X_test_section.values, y_test_section.values))
    et_clfs.append(et_clf)

0.000642840945508
0.00269348490823
0.00121844591305
0.000747947190825


In [283]:
et_clfs = []
for X_train_section, y_train_section, X_test_section, y_test_section in zip(X_train_split, y_train_split, X_test_split, y_test_split):
    X_train_section = X_train_section.fillna(0)
    y_train_section = y_train_section.fillna(0)
    X_test_section = X_test_section.fillna(0)
    y_test_section = y_test_section.fillna(0)
    
    et_clf = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
    et_clf.fit(X_train_section.values, y_train_section.values)
    print (et_clf.score(X_test_section.values, y_test_section.values)
    et_clfs.append(et_clf)

0.000457960105077
0.00400984788687
0.00126654608904
-0.000874518146675
-0.00102005127667
0.0012806125815
-0.00015954796115
-0.0255859738641
-0.000187927050856


In [None]:
rf_clfs = []
for X_train_section, y_train_section, X_test_section, y_test_section in zip(X_train_split, y_train_split, X_test_split, y_test_split):
    X_train_section = X_train_section.fillna(0)
    y_train_section = y_train_section.fillna(0)
    X_test_section = X_test_section.fillna(0)
    y_test_section = y_test_section.fillna(0)
    
    clf = RandomForestRegressor()
    clf.fit(X_train_section.values, y_train_section.values)
    print (len(X_train_section))
    print (len(X_test_section))
    print (clf.score(X_test_section.values, y_test_section.values))
    rf_clfs.append(clf)

In [None]:
xgb_clfs = []
for X_train_section, y_train_section, X_test_section, y_test_section in zip(X_train_split, y_train_split, X_test_split, y_test_split):
    X_train_section = X_train_section.fillna(0)
    y_train_section = y_train_section.fillna(0)
    X_test_section = X_test_section.fillna(0)
    y_test_section = y_test_section.fillna(0)
    
    clf = XGBClassifier()
    clf.fit(X_train_section.values, y_train_section.values)
    print (len(X_train_section))
    print (len(X_test_section))
    print (clf.score(X_test_section.values, y_test_section.values))
    xgb_clfs.append(clf)

In [None]:
et_clf = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
et_clf.fit(X_train.values, y_train.values)
et_clf.score(X_test.values, y_test.values)