In [None]:
from copy import deepcopy
from os.path import join as pj

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

from models import make_voting_ensemble
from predict import preprocess_data

# --- Configuration and data loading ------------------------------------------
# `target` is passed to preprocess_data here and reused further down when the
# predictions are post-processed and written to disk.
target = 'charles'
# NOTE(review): absolute, machine-specific path; consider making it configurable.
datadir = '/Volumes/4TB/numerai/119/'
train_df = pd.read_csv(pj(datadir, 'numerai_training_data.csv'))
tourn_df = pd.read_csv(pj(datadir, 'numerai_tournament_data.csv'))

# preprocess_data is a project helper. Presumably it returns the training
# features, labels and per-row era labels, plus the tournament features
# -- TODO confirm against predict.py.
(X, y, eras), X_test = preprocess_data(train_df, tourn_df, target=target)

# Distinct era labels and their count. np.unique is spelled out explicitly so
# the cell does not depend on names injected by %pylab.
ueras = np.unique(eras)
K = len(ueras)

# One boolean row mask per era, used both to pick the training rows for an era
# and to locate that era's rows in the full data set.
era_masks = {era: eras == era for era in ueras}

In [None]:
# Regularisation strengths to sweep. With num=2 the geometric grid consists of
# just its two endpoints, 1e-3 and 1e3.
Cs = np.geomspace(1e-3, 1e3, 2)
# Reusable joblib executor: n_jobs=-1 requests all available cores,
# verbose=0 keeps it silent.
parallel = joblib.Parallel(n_jobs=-1, verbose=0)

def logistic_whitened(C=1):
    """Build an unfitted scaler + logistic-regression pipeline.

    Parameters
    ----------
    C : float, default 1
        Value forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'scaler'`` (a ``StandardScaler``) followed by
        ``'logistic'`` (a ``LogisticRegression`` created with
        ``warm_start=True``).
    """
    classifier = LogisticRegression(
        C=C, tol=1e-4, max_iter=1000, warm_start=True)
    steps = [
        ('scaler', StandardScaler()),
        ('logistic', classifier),
    ]
    return Pipeline(steps)

def train_and_predict_logits(train_era):
    """Fit one pipeline per value in ``Cs`` on a single era and score all rows.

    Reads the module-level ``X``, ``y``, ``Cs`` and ``era_masks``.

    Parameters
    ----------
    train_era
        Key into ``era_masks`` selecting the rows used for fitting.

    Returns
    -------
    logits : numpy.ndarray, shape (len(X), len(Cs))
        ``decision_function`` output for every row of ``X``; column ``j``
        belongs to ``Cs[j]``. Rows that belong to ``train_era`` are set to 0.
    models : list
        Deep copies of the fitted pipeline, one per entry of ``Cs``.
    """
    train_mask = era_masks[train_era]
    X_train, y_train = X[train_mask], y[train_mask]

    # A single pipeline object is refit for each C (its LogisticRegression was
    # created with warm_start=True), so a snapshot is stored after every fit.
    model = logistic_whitened()
    logits = np.empty((len(X), len(Cs)))
    models = []
    for j, C in enumerate(Cs):
        model.set_params(logistic__C=C)
        model.fit(X_train, y_train)
        models.append(deepcopy(model))
        logits[:, j] = model.decision_function(X)

    # Blank out the in-sample rows once for all columns, so this era's own
    # scores contribute nothing when the per-era logits are summed later.
    logits[train_mask] = 0
    return logits, models

# Fit every era in parallel; each task returns (logits, models) for one era.
era_results = parallel(
    joblib.delayed(train_and_predict_logits)(train_era)
    for train_era in tqdm_notebook(ueras))

# Stack the per-era score matrices into shape (K, len(X), len(Cs)).
logits = np.array([era_logits for era_logits, era_models in era_results])

# Build the (K, len(Cs)) object array of fitted pipelines explicitly.
# np.array() on nested lists of estimators may recurse into objects that
# support len()/indexing (as Pipeline does in newer scikit-learn), which would
# yield an array of pipeline *steps* and break `models[i, j]` lookups.
models = np.empty((len(era_results), len(Cs)), dtype=object)
for era_idx, (era_logits, era_models) in enumerate(era_results):
    for c_idx, fitted_model in enumerate(era_models):
        models[era_idx, c_idx] = fitted_model

In [None]:
import tensorflow as tf
from scipy.special import logit as logit_fn

# Start from an empty default graph so re-running this cell does not pile new
# nodes on top of the ones created by a previous run.
tf.reset_default_graph()

# The data enters the graph once, at variable-initialisation time: each
# placeholder only seeds a Variable. These holders are marked non-trainable so
# that only `w` can ever be picked up by an optimizer.
l_init = tf.placeholder(shape=logits.shape, dtype=tf.float32)
l = tf.Variable(l_init, trainable=False)

t_init = tf.placeholder(shape=y.shape, dtype=tf.bool)
t = tf.Variable(t_init, trainable=False)

# One weight per (era model, C). expand_dims gives `w` the shape
# (K, 1, len(Cs)) so it broadcasts along axis 1 of `l`.
w_init = tf.placeholder(shape=(K, len(Cs)), dtype=tf.float32)
w = tf.Variable(tf.expand_dims(w_init, 1))

# Weighted sum over the era-model axis divided by K - 1, then a plain mean over
# the C axis. The result is clipped to the logit range of probabilities
# 0.3 .. 0.7.
weighted_logits = l * w
mean_over_eras = tf.reduce_sum(weighted_logits, axis=0) / (K - 1)
mean_over_Cs = tf.reduce_mean(mean_over_eras, axis=1)
output = tf.clip_by_value(mean_over_Cs, logit_fn(.3), logit_fn(.7))

# Mean sigmoid cross-entropy computed separately for the rows of each era.
era_inds = [np.where(era_masks[era])[0] for era in ueras]
era_xents = tf.convert_to_tensor([tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
    labels=tf.cast(tf.gather(t, i), tf.float32), logits=tf.gather(output, i))) for i in era_inds])

# Objective: mean of the per-era losses plus `var_weight` times their variance.
mean_xent, var_xent = tf.nn.moments(era_xents, axes=[0])
var_weight = tf.placeholder(shape=[], dtype=tf.float32)
loss = mean_xent + tf.Variable(var_weight, trainable=False) * var_xent

# Adam step that updates only `w`; the learning rate is likewise fed once at
# initialisation through its placeholder.
learning_rate = tf.placeholder(shape=[], dtype=tf.float32)
opt = tf.train.AdamOptimizer(
    tf.Variable(learning_rate, trainable=False)).minimize(loss, var_list=[w])

In [None]:
def verify_numpy(w_sol):
    """Recompute the ensemble output in NumPy and score it with ``log_loss``.

    Applies the same steps as the TensorFlow graph (weight the module-level
    ``logits``, sum over axis 0 divided by ``K - 1``, mean over the C axis,
    clip to the logit range of 0.3 .. 0.7) and converts the result to
    probabilities with a sigmoid.

    Parameters
    ----------
    w_sol : array-like
        Weights that broadcast against ``logits``.

    Returns
    -------
    float
        ``log_loss(y, probabilities)`` over all rows pooled together. Note
        that this differs from the training objective, which averages the
        per-era losses and adds a variance term.
    """
    weighted_logits = logits * w_sol
    mean_over_eras = np.sum(weighted_logits, axis=0) / (K - 1)
    mean_over_Cs = np.mean(mean_over_eras, axis=1)
    output = np.clip(mean_over_Cs, logit_fn(.3), logit_fn(.7))
    probabilities = 1 / (1 + np.exp(-output))
    return log_loss(y, probabilities)


# Optimise the ensemble weights. `locked` is a run-once guard: it is set on
# first execution and never cleared here, so re-running the cell is a no-op
# until `locked` is reset by hand. globals().get() lets the cell run on a
# fresh kernel where `locked` has never been defined.
if not globals().get('locked', False):
    locked = True
    with tf.Session() as sess:
        # All data and hyper-parameters are fed once, here; the graph copies
        # them into Variables.
        sess.run(tf.global_variables_initializer(), feed_dict={
            l_init: logits,
            t_init: y,
            w_init: np.ones((K, len(Cs))),
            learning_rate: 0.3,
            var_weight: 1,
        })
        ws, losses, mus, sigs = [], [], [], []
        amin = 0  # index of the smallest loss seen so far (first one on ties)
        for iteration in tqdm_notebook(range(10000)):
            # Evaluate first, update afterwards. Fetching `w` in the same
            # run() call as the update op leaves it unspecified whether the
            # returned weights are pre- or post-update, so the weights could
            # not be paired reliably with the loss they produced.
            loss_val, w_hat, mu, sig = sess.run([loss, w, mean_xent, var_xent])
            sess.run(opt)

            # ws[k] are exactly the weights that produced losses[k].
            ws.append(w_hat)
            losses.append(loss_val)
            mus.append(mu)
            sigs.append(sig)

            # Track the best iteration incrementally instead of re-scanning
            # the whole history on every step.
            if loss_val < losses[amin]:
                amin = iteration
            w_sol = ws[amin]

            if not iteration % 10:
                # NOTE(review): the second number is the variance of the
                # per-era losses (var_xent), not a standard deviation.
                print('{:.6f} +/- {:.6f}'.format(mus[amin], sigs[amin]))

In [None]:
# Optimisation trace: the `loss` value recorded at each iteration of the
# training loop.
plot(losses)

In [None]:
# Heat-map of the selected ensemble weights. `w_sol` has a singleton axis 1
# (it is a snapshot of the TF variable `w`), so `[:, 0]` yields one row per era
# model and one column per C. The original referenced `w_show`, which is not
# defined anywhere in this notebook.
figure(figsize=(7, 14))
imshow(w_sol[:, 0], aspect='auto', cmap='RdBu_r')
colorbar()

In [None]:
# Cross-check outside TensorFlow: the value returned by verify_numpy for the
# selected weights is shown as the cell output.
verify_numpy(w_sol)

In [None]:
# Score the tournament rows with every fitted pipeline. The result has the
# layout (K, len(X_test), len(Cs)): axis 0 indexes the era model, axis 2 the C.
logits_test = np.empty((K, len(X_test), len(Cs)))
for i in tqdm_notebook(range(K)):
    for j in range(len(Cs)):
        logits_test[i, :, j] = models[i, j].decision_function(X_test)

In [None]:
# Combine the tournament logits with the selected weights using the same steps
# as in training, then map the clipped logits to probabilities.
# NOTE(review): in training each row has one era model zeroed out, so K - 1
# models contribute; here all K models contribute but the divisor is still
# K - 1. Confirm that this is intended.
# NOTE(review): `mean_over_eras` / `mean_over_Cs` rebind the names of the
# TensorFlow tensors created when the graph was built.
logits_test_weighted = logits_test * w_sol
mean_over_eras = np.sum(logits_test_weighted, axis=0) / (K - 1)
mean_over_Cs = np.mean(mean_over_eras, axis=1)
test_output = 1 / (1 + np.exp(-np.clip(mean_over_Cs, logit_fn(.3), logit_fn(.7))))

In [None]:
from predict import postprocess_results

# postprocess_results is a project helper; presumably it pairs each id with its
# prediction in the submission layout -- TODO confirm against predict.py.
results = postprocess_results(tourn_df['id'], test_output, target)

# Write next to the input data. The directory comes from `datadir` instead of
# repeating the absolute path; the resulting file path is unchanged.
np.savetxt(pj(datadir, 'predictions_{}_voting_gradientDescent.csv'.format(target)),
           results, delimiter=',', fmt='%s')