# RNN Model Production and Exploration Notebook

This notebook is an attempt to build a deep recurrent neural network (DRNN) to predict market movements on the Two Sigma Financial Modeling dataset.
### This is a work in progress!

In [None]:
# import modules
import gc
gc.collect()
import kagglegym
import numpy as np
np.random.seed(42)
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.regularizers import l2
from keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing as pp
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

%matplotlib inline

In [None]:
# Start environment
# Create the kagglegym environment and fetch the initial observation;
# its `train` attribute is the frame used for training in the cells below.
env = kagglegym.make()
observation = env.reset()
train = observation.train

Let's print the head of our dataset to see what it looks like:

In [None]:
# Preview the first rows of the raw training frame (`train` is `observation.train`)
train.head()

Let's preprocess the data by filling the NaN values with the median of each column, as seen many times in the community's work. Also, in an attempt to deal with outliers, let's keep only the rows whose target value lies within a clipped range, and standardize the features to help the neural network learn.

In [None]:
# Data preprocessing

# Target value range to keep, taken from:
# https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189/code
low_y_cut = -0.086093
high_y_cut = 0.093497

# Row mask: True wherever the target is neither above nor below the cut range
y_is_within_cut = ~((train.y > high_y_cut) | (train.y < low_y_cut))

# Input columns and the column to predict
features_to_use = [
    'technical_20', 'technical_30', 'fundamental_11', 'fundamental_37',
    'technical_35', 'technical_36', 'fundamental_36',
]
target_var = ['y']

# Targets of the retained rows (left unscaled)
y_train = train.loc[y_is_within_cut, target_var].values

# Features of the retained rows: median-impute the gaps, then standardize
im = pp.Imputer(strategy='median')
X_scaler = pp.StandardScaler()
X_train = X_scaler.fit_transform(
    im.fit_transform(train.loc[y_is_within_cut, features_to_use].values)
)

# The target scaler is instantiated but never fitted in this cell
y_scaler = pp.StandardScaler()

# Wrap the arrays back into labelled frames and keep the fitted transformers
X_train = pd.DataFrame(X_train, columns=features_to_use)
y_train = pd.DataFrame(y_train, columns=target_var)
preprocess_dict = {'fillna': im, 'X_scaler': X_scaler, 'y_scaler': y_scaler}

We have selected the most important features, as found in some of the community's work, specifically in [this notebook](https://www.kaggle.com/fernandocanteruccio/two-sigma-financial-modeling/xgboost-feature-importance-analysis). Let's take a peek at the head of our dataset again.

In [None]:
# Preview the first five rows of the imputed and standardized feature frame
X_train.head(n=5)

Right! Now we have scaled values with no NaNs. Better this way!
Now we can start to build our models. This time we are going to try some deep neural network architectures and see how they perform. Let's get started!

In [None]:
# Model Definition
def dnn(shape, timesteps, l2_coef, drop_coef):
    """Build and compile a two-layer LSTM network followed by two Dense layers.

    Parameters
    ----------
    shape : sequence of 5 ints
        ``shape[0]`` is the number of input features per timestep,
        ``shape[1]`` and ``shape[2]`` are the unit counts of the first and
        second LSTM layers, ``shape[3]`` is the size of the hidden Dense
        layer (ELU activation) and ``shape[4]`` the size of the linear
        output layer.
    timesteps : int
        Length of each input sequence; the model input shape is
        ``(timesteps, shape[0])``.
    l2_coef : float
        Coefficient of the L2 weight regularizer applied to every layer.
    drop_coef : float
        Currently unused: no Dropout layer is added to the model. The
        parameter is kept so existing callers keep working.

    Returns
    -------
    keras.models.Sequential
        The model compiled with mean-squared-error loss and an RMSprop
        optimizer.
    """
    model = Sequential()
    # First LSTM returns the full sequence so that it can feed the second LSTM
    model.add(LSTM(shape[1], init='glorot_normal', W_regularizer=l2(l2_coef),
                   return_sequences=True, input_shape=(timesteps, shape[0])))
    # Second LSTM only emits its last output, which feeds the Dense head
    model.add(LSTM(shape[2], init='glorot_normal', W_regularizer=l2(l2_coef),
                   return_sequences=False))
    model.add(Dense(shape[3], init='glorot_normal', activation='elu', W_regularizer=l2(l2_coef)))
    model.add(Dense(shape[4], init='glorot_normal', activation='linear', W_regularizer=l2(l2_coef)))

    optm = RMSprop(lr=0.007, rho=0.9, epsilon=1e-08, decay=0.0)
    # `verbose` is not a compile() argument (verbosity is set on fit/predict),
    # so it is no longer forwarded to the backend through compile's **kwargs.
    model.compile(loss='mean_squared_error',
                  optimizer=optm,
                  metrics=['mean_squared_error'])
    return model

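We start with two stacked layers of LSTM neurons and two fully-connected layers on the output.
Here we apply L2 regularization in an attempt to deal with overfitting. During training we also use early stopping and learning-rate reduction on plateau. Dropout is not applied yet: the `drop_coef` argument is currently unused.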

In [None]:
# Training Routine
timesteps = 8
batch_size = 2**14
training_epochs = 15

print("Padding Sequences")
t0 = time()
# Build the (samples, timesteps, features) input tensor in one vectorized step.
# Each sample is a length-1 sequence left-padded with zeros up to `timesteps`,
# i.e. the feature values sit in the last timestep. This is what calling
# pad_sequences (default 'pre' padding, value 0.) column by column produces,
# without looping over every row in Python.
X_train_ts = np.zeros((X_train.shape[0], timesteps, X_train.shape[1]), dtype='float32')
X_train_ts[:, -1, :] = X_train.values
print("Done! Padding time:", time() - t0)
print("X_train shape:",X_train_ts.shape)
print("Training model")
t0 = time()
# shape = [n_features, LSTM-1 units, LSTM-2 units, hidden Dense units, outputs]
model1 = dnn(shape=[X_train_ts.shape[2],8,8,128,1],timesteps=timesteps,l2_coef=1e-9,drop_coef=1.0)
model1.fit(X_train_ts, y_train.values,
          nb_epoch=training_epochs,
          batch_size=batch_size,
          # Both callbacks monitor the training loss (no validation split is used)
          callbacks=[ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=1e-7,
                                       epsilon=0.0001, verbose=1),
                     EarlyStopping(monitor='loss', min_delta=0.00001, patience=12, verbose=1, mode='auto')],
          shuffle=False,
          verbose=2
          );

print("Done! Training time:", time() - t0)
# The 2-D feature frame is no longer needed once the 3-D tensor exists
del X_train, t0

### Analysing Training Results

In [None]:
# In-sample check: predict on the training tensor and score against the training targets
print("Predicting target on training dataset")
t0 = time()
m1_preds = model1.predict(X_train_ts, batch_size=batch_size, verbose=0)
score = r2_score(y_train, m1_preds)
elapsed = time() - t0
print("Done! Prediction time:", elapsed)
print("R2 score for train dataset", score)

# Drop the large training arrays and the temporaries of this cell
del X_train_ts, y_train, score, t0, elapsed

In [None]:
# Per-timestamp mean and standard deviation of the true target
market_df = observation.train[['timestamp', 'y']].groupby('timestamp').agg([np.mean, np.std]).reset_index()
y_mean = np.array(market_df['y']['mean'])
y_std = np.array(market_df['y']['std'])
t = market_df['timestamp']

cum_ret = np.log(1+y_mean).cumsum()

# Same aggregation for the predictions, which only exist for the rows kept by y_is_within_cut
pred_ret = pd.DataFrame(np.vstack((observation.train.timestamp.loc[y_is_within_cut], m1_preds[:,0])).T,
                        columns=['timestamp','y']).groupby('timestamp').agg([np.mean, np.std]).reset_index()
pred_std = np.array(pred_ret['y']['std'])
cum_pred = np.log(1+pred_ret['y']['mean']).cumsum()
# Predictions get their own time axis: it is derived from the filtered rows, so it
# is not guaranteed to have the same timestamps as the unfiltered target frame.
t_pred = pred_ret['timestamp']

fig, ax = plt.subplots(figsize=(12,7))
ax.set_xlabel("Timestamp");
ax.set_title("Cumulative target signal and predictions over time");
sns.tsplot(cum_ret,t,ax=ax,color='b');
sns.tsplot(y_std,t,ax=ax,color='g');
sns.tsplot(cum_pred,t_pred,ax=ax,color='r');
sns.tsplot(pred_std,t_pred,ax=ax,color='black');
ax.set_ylabel('Target / Prediction');

fig, ax = plt.subplots(figsize=(12,7))
ax.set_title("Target Variable Distribution. (True vs Prediction)");
ax.set_ylim([0, 50000])
# Both histograms are drawn as raw counts (kde=False). Leaving the KDE on for the
# predictions would normalize that histogram to a density, which is not comparable
# with the count histogram of the target on a shared 0-50000 axis.
sns.distplot(observation.train.y ,ax=ax, color='b', kde=False, bins=100);
sns.distplot(m1_preds[:,0] ,ax=ax, color='r', kde=False, bins=100);
ax.set_ylabel('Target / Prediction');

# Distribution of the first four weight arrays returned by the model
weights = model1.get_weights()
fig, ax = plt.subplots(figsize=(12,7))
ax.set_title("Weights Variable Distribution.");
sns.distplot(weights[0].flatten() ,ax=ax, color='b');
sns.distplot(weights[1].flatten() ,ax=ax, color='r');
sns.distplot(weights[2].flatten() ,ax=ax, color='g');
sns.distplot(weights[3].flatten() ,ax=ax, color='black');
ax.set_ylabel('Occurences');
del market_df, y_mean, y_std, t, t_pred, cum_ret, pred_ret, cum_pred

### Using the model for predictions

In [None]:
# Predict-step-predict routine
def gen_preds(model, preprocess_dict, features_to_use, print_info=True):
    """Run the kagglegym predict/step loop with `model` and log the outcome.

    A fresh environment is created and stepped until it reports `done`.
    At every step the observed features are imputed, scaled, turned into a
    (samples, timesteps, features) tensor, fed to the model, and the clipped
    predictions are submitted through `env.step`.

    Relies on the module-level names `timesteps`, `low_y_cut` and
    `high_y_cut` set by earlier cells.

    Parameters
    ----------
    model : object with a Keras-style `predict(x, batch_size, verbose)` method
    preprocess_dict : dict
        Must provide the fitted transformers under 'fillna' (imputer) and
        'X_scaler' (feature scaler). Other keys are ignored.
    features_to_use : list of str
        Feature columns, in the order the transformers were fitted on.
    print_info : bool, default True
        Print a progress report whenever the timestamp is a multiple of 100.

    Returns
    -------
    tuple of numpy.ndarray
        (predictions per step, reward per step, timestamp per step).
    """
    env = kagglegym.make()
    # We get our initial observation by calling "reset"
    observation = env.reset()

    im = preprocess_dict['fillna']
    X_scaler = preprocess_dict['X_scaler']

    reward = 0.0
    reward_log = []
    timestamp_log = []
    pred_log = []

    # Counters for the current reporting window (reset every 100 timestamps)
    pos_count = 0
    neg_count = 0
    # Counters for the whole run. These are plain running totals: appending the
    # window counter at every step and summing the list afterwards counts each
    # reward many times over.
    total_pos = 0
    total_neg = 0

    print("Predicting")
    t0 = time()
    while True:
        # Impute and scale with the transformers fitted on the training data
        scaled = X_scaler.transform(im.transform(observation.features.loc[:, features_to_use].values))

        # (samples, timesteps, features) tensor: zeros everywhere except the last
        # timestep, which holds the feature values. This equals 'pre' zero-padding
        # of length-1 sequences with pad_sequences.
        features_dnn_ts = np.zeros((scaled.shape[0], timesteps, scaled.shape[1]), dtype='float32')
        features_dnn_ts[:, -1, :] = scaled

        # Predict with model, one batch per observation, clipped to the target range
        y_dnn = model.predict(features_dnn_ts, batch_size=scaled.shape[0],
                              verbose=0).clip(low_y_cut, high_y_cut)

        # Fill target df with predictions
        observation.target.y = y_dnn
        observation.target.fillna(0, inplace=True)
        target = observation.target
        timestamp = observation.features["timestamp"][0]

        observation, reward, done, info = env.step(target)

        timestamp_log.append(timestamp)
        reward_log.append(reward)
        pred_log.append(y_dnn)

        if reward < 0:
            neg_count += 1
            total_neg += 1
        else:
            pos_count += 1
            total_pos += 1

        if timestamp % 100 == 0:
            if print_info:
                print("Timestamp #{}".format(timestamp))
                print("Step reward:", reward)
                # NOTE(review): the slice keeps the last `timestamp` rewards, which is
                # the whole log whenever the log is shorter than the timestamp value
                # - confirm whether a 100-step window was intended.
                print("Mean reward:", np.mean(reward_log[-timestamp:]))
                print("Positive rewards count: {0}, Negative rewards count: {1}".format(pos_count, neg_count))
                print("Positive reward %:", pos_count / (pos_count + neg_count) * 100)

            # Start a new reporting window
            pos_count = 0
            neg_count = 0

        if done:
            break
    print("Done: %.1fs" % (time() - t0))
    print("Total reward sum:", np.sum(reward_log))
    print("Final reward mean:", np.mean(reward_log))
    print("Total positive rewards count: {0}, Total negative rewards count: {1}".format(total_pos, total_neg))
    print("Final positive reward %:", total_pos / (total_pos + total_neg) * 100)
    print(info)
    return np.array(pred_log), np.array(reward_log), np.array(timestamp_log)

pred_log, reward_log, timestamp_log = gen_preds(model1, preprocess_dict, features_to_use)