# Jane Street: Deep Reinforcement Learning Approach

In this notebook I try to implement a Deep Q-Network (DQN) to solve the prediction problem.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from torch.distributions import Categorical

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

import os, gc, random
# `device` is a torch.device object, which never compares equal to the plain
# string 'cuda'; inspect its `.type` instead so this branch can actually run.
if device.type == 'cuda':
    try:
        import cudf
        import cupy as cp
    except ImportError:
        # The GPU dataframe libraries are optional: only the commented-out
        # loading path visible in this notebook refers to them.
        cudf = cp = None
import pandas as pd
import numpy as np
import janestreet
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load

cpu


In [2]:
def seed_everything(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the
    CUDA generators and switches cuDNN to deterministic, non-benchmark mode.

    Args:
        seed_value: seed handed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Nothing GPU-specific to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)

# Preprocessing

In [3]:
# print('Loading...')
# if device == 'cuda':
#     train = cudf.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
# else:
#     train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
# Read only the first three rows: in this cell `train` is used just to
# discover which columns are feature columns.
train = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/train.csv',
    nrows = 3,
)
# Every column whose name contains 'feature', in file order.
features = [column for column in train.columns if 'feature' in column]

# print('Filling...')
# f_mean = train[features[1:]].mean()
# f_std = train[features[1:]].std()
# train = train.query('weight > 0').reset_index(drop = True)
# train[features[1:]] = train[features[1:]].fillna(f_mean)
# train[features[1:]] = (train[features[1:]] - f_mean) / f_std
# train['action'] = (train['resp'] > 0).astype('int')

# print('Converting...')
# if device == 'cuda':
#     train = train.to_pandas()
#     f_mean = f_mean.values.get()
#     f_std = f_std.values.get()
# else:
#     f_mean = f_mean.values
#     f_std = f_std.values
# np.save('f_mean.npy', f_mean)
# np.save('f_std.npy', f_std)

# print('Finish.')

In [4]:
# def utility_score(date, weight, resp, action):
#     count_i = len(np.unique(date))
#     Pi = np.bincount(date, weight * resp * action)
#     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
#     u = np.clip(t, 0, 6) * np.sum(Pi)
#     return u

# def utility_score_pd(date, weight, resp, action):
#     count_i = len(pd.unique(date))
#     Pi = np.bincount(date, weight * resp * action)
#     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
#     u = np.clip(t, 0, 6) * np.sum(Pi)
#     return u

# def utility_score_max(date, weight, resp, action):
#     count_i = date.max() + 1
#     Pi = np.bincount(date, weight * resp * action)
#     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
#     u = np.clip(t, 0, 6) * np.sum(Pi)
#     return u

# def utility_score_last(date, weight, resp, action):
#     count_i = date[-1] + 1
#     Pi = np.bincount(date, weight * resp * action)
#     t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
#     u = np.clip(t, 0, 6) * np.sum(Pi)
#     return u

In [5]:
# %timeit utility_score(train['date'].values, train['weight'].values, train['resp'].values, train['action'].values)
# %timeit utility_score_pd(train['date'].values, train['weight'].values, train['resp'].values, train['action'].values)
# %timeit utility_score_max(train['date'].values, train['weight'].values, train['resp'].values, train['action'].values)
# %timeit utility_score_last(train['date'].values, train['weight'].values, train['resp'].values, train['action'].values)

# DQN Model Functions

Modified from https://github.com/MoMe36/DuelingDDQN

In [6]:
class JSEnv:
    """Sequential, gym-like environment over the rows of a DataFrame.

    Each row is one step. The observation is the row's feature vector and the
    reward for an action is ``weight * resp * action`` of the row the agent
    acted on.

    Args:
        df: frame providing ``'weight'``, ``'resp'`` and the feature columns.
        feats: names of the columns used as the observation.
    """

    def __init__(self, df, feats):

        self.n_samples = df.shape[0]
        self.weight = torch.FloatTensor(df['weight'].values)
        self.resp = torch.FloatTensor(df['resp'].values)

        feature_frame = df[feats]
        self.states = torch.FloatTensor(feature_frame.values)
        self.observation_space = feature_frame.shape[1]
        self.action_space = 2
        self.idx = 0

    def _observation(self):
        """Return the feature row at the cursor as a ``(1, n_features)`` tensor."""
        return self.states[self.idx].view(1, -1)

    def reset(self):
        """Rewind the cursor to the first row and return its observation."""
        self.idx = 0
        return self._observation()

    def step(self, action):
        """Score ``action`` on the current row and advance to the next one.

        Returns:
            ``(next_state, reward, done, info)``. ``done`` is True once the
            last row has been consumed; the cursor then wraps to row 0, so
            ``next_state`` is the first row again. ``info`` is always 0.
        """
        row = self.idx
        reward = self.weight[row] * self.resp[row] * action

        self.idx = row + 1
        done = self.idx >= self.n_samples
        if done:
            self.idx = 0

        return self._observation(), reward, done, 0

In [7]:
import numpy as np 
import gym 
import torch 
import random
from argparse import ArgumentParser 
import os 
import pandas as pd 

import matplotlib.pyplot as plt
plt.style.use('ggplot')
from scipy.ndimage.filters import gaussian_filter1d
from IPython.display import display, clear_output

def save(agent, rewards, path = 'Net.pt'):
    """Write the online Q-network's weights to disk.

    Args:
        agent: object exposing the online network as ``agent.q``.
        rewards: reward history; not used by the active code (only the
            disabled plotting / CSV export that follows this function
            refers to it).
        path: destination of the checkpoint. Defaults to ``'Net.pt'``, the
            file name that was previously hard-coded.
    """
    torch.save(agent.q.state_dict(), path)

#     plt.gca().cla()
#     plt.plot(rewards, c = 'r', alpha = 0.3)
#     plt.plot(gaussian_filter1d(rewards, sigma = 5), c = 'r', label = 'Rewards')
#     plt.xlabel('Frames x 1000')
#     plt.ylabel('Cumulative reward')
#     plt.title('Dueling DDQN: JS')
#     plt.legend()
#     plt.savefig('reward.png')
#     plt.show()
    
#     clear_output(wait = True)
#     plt.pause(0.5)

#     pd.DataFrame(rewards, columns = ['Reward']).to_csv('rewards.csv', index = False)

class AgentConfig:
    """Plain container for the agent's hyper-parameters.

    Every constructor argument is stored unchanged as an attribute of the
    same name. ``epsilon_by_frame(i)`` gives the exploration rate at frame
    ``i``: an exponential decay from ``epsilon_start`` towards
    ``epsilon_final`` with time constant ``epsilon_decay``.
    """

    def __init__(self,
                 epsilon_start = 1.,
                 epsilon_final = 0.01,
                 epsilon_decay = 8000,
                 gamma = 0.99,
                 lr = 1e-4,
                 target_net_update_freq = 1000,
                 memory_size = 100000,
                 batch_size = 128,
                 learning_starts = 5000,
                 max_frames = 10000000):

        # Exploration schedule.
        self.epsilon_start = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay

        # Optimisation.
        self.gamma = gamma
        self.lr = lr

        # Target network and replay buffer.
        self.target_net_update_freq = target_net_update_freq
        self.memory_size = memory_size
        self.batch_size = batch_size

        # Training length.
        self.learning_starts = learning_starts
        self.max_frames = max_frames

    def epsilon_by_frame(self, i):
        """Return the exploration rate for frame ``i``.

        The schedule reads the epsilon attributes at call time, so changing
        them after construction changes the schedule.
        """
        return self.epsilon_final + (self.epsilon_start - self.epsilon_final) * np.exp(-1. * i / self.epsilon_decay)

class ExperienceReplayMemory:
    """Fixed-capacity store of transitions with uniform random sampling.

    A transition is an indexable record laid out as
    ``(state, action, reward, next_state, done_mask)``. Once ``capacity``
    transitions are held, each new one overwrites the oldest.

    Args:
        capacity: maximum number of transitions kept.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Slot holding the oldest transition once the buffer is full.
        self._oldest = 0

    def push(self, transition):
        """Store ``transition``, evicting the oldest entry when full."""
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        elif self.memory:
            # Overwrite in place instead of `del self.memory[0]`, which
            # shifts every remaining element on each push.
            self.memory[self._oldest] = transition
            self._oldest = (self._oldest + 1) % len(self.memory)
        # A non-positive capacity keeps the buffer permanently empty.

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of lists ``(states, actions, rewards, next_states,
            dones)``, each of length ``batch_size`` and aligned by position.

        Raises:
            ValueError: from ``random.sample`` if ``batch_size`` exceeds the
                number of stored transitions.
        """
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = (
            [transition[field] for transition in batch] for field in range(5)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)

In [8]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from torch.distributions import Categorical 

class DuelingNetwork(nn.Module):
    """Dueling Q-network: a shared trunk feeding a value and an advantage head.

    Args:
        obs: size of the input feature vector.
        ac: number of actions (width of the advantage head).
    """

    def __init__(self, obs, ac):

        super().__init__()

        wn = nn.utils.weight_norm

        # Shared trunk: obs -> 512 -> 256, ReLU after each linear layer.
        trunk_layers = [
            wn(nn.Linear(obs, 512)),
            nn.ReLU(),
            wn(nn.Linear(512, 256)),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*trunk_layers)

        # One scalar state value and one advantage per action.
        self.value_head = wn(nn.Linear(256, 1))
        self.adv_head = wn(nn.Linear(256, ac))

    def forward(self, x):
        """Return Q-values, one column per action, for a 2-D batch ``x``."""
        hidden = self.model(x)

        state_value = self.value_head(hidden)
        advantage = self.adv_head(hidden)

        # Centre the advantages per row before adding the state value.
        mean_advantage = advantage.mean(1).reshape(-1, 1)
        return state_value + advantage - mean_advantage

In [9]:
from tqdm.notebook import tqdm
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from torch.distributions import Categorical 

import numpy as np 
import gym 
import random 

class DuelingDDQN(nn.Module):
    """Q-learning agent holding an online and a target ``DuelingNetwork``.

    NOTE(review): despite the class name, the bootstrap target in
    ``update_policy`` takes the maximum of the *target* network's own
    Q-values; it does not select the action with the online network as
    Double DQN would. That behaviour is kept as is.

    Args:
        obs: size of the observation vector.
        ac: number of actions.
        config: object providing ``target_net_update_freq``.
    """

    def __init__(self, obs, ac, config):

        super().__init__()

        self.q = DuelingNetwork(obs, ac).to(device)
        self.target = DuelingNetwork(obs, ac).to(device)

        # Start with the target network as an exact copy of the online one.
        self.target.load_state_dict(self.q.state_dict())

        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):
        """Return the greedy action (as a Python int) for a single state ``x``."""
        x = torch.FloatTensor(x).to(device)
        with torch.no_grad():
            a = self.q(x).max(1)[1]

        return a.item()

    def update_policy(self, adam, memory, params):
        """Run one optimisation step on a batch sampled from ``memory``.

        Args:
            adam: optimiser over ``self.q``'s parameters.
            memory: replay buffer whose ``sample(batch_size)`` returns
                ``(states, actions, rewards, next_states, masks)``.
            params: configuration providing ``batch_size`` and, optionally,
                ``gamma`` (the discount factor; 0.99 when absent).
        """
        b_states, b_actions, b_rewards, b_next_states, b_masks = memory.sample(params.batch_size)

        states = torch.FloatTensor(b_states).to(device)
        actions = torch.LongTensor(b_actions).reshape(-1,1).to(device)
        rewards = torch.FloatTensor(b_rewards).reshape(-1,1).to(device)
        next_states = torch.FloatTensor(b_next_states).to(device)
        # Masks multiply the bootstrap term, so a 0 entry removes it
        # (presumably 0. marks a terminal transition -- confirm with caller).
        masks = torch.FloatTensor(b_masks).reshape(-1,1).to(device)

        # Q-value of the action that was actually taken in each transition.
        current_q_values = self.q(states).gather(1, actions)

        # Use the configured discount instead of a hard-coded 0.99; the
        # fallback keeps callers whose config has no `gamma` working.
        gamma = getattr(params, 'gamma', 0.99)

        with torch.no_grad():
            max_next_q_vals = self.target(next_states).max(1)[0].reshape(-1,1)
            expected_q_vals = rewards + max_next_q_vals * gamma * masks

        loss = F.mse_loss(current_q_values, expected_q_vals)

        adam.zero_grad()
        loss.backward()

        # Clamp every gradient element to [-1, 1]; unlike a manual loop over
        # `p.grad`, this skips parameters that received no gradient.
        nn.utils.clip_grad_value_(self.q.parameters(), 1.)
        adam.step()

        # Periodically copy the online weights into the target network.
        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())

# Train Agent

In [10]:
# env = JSEnv(train, features)       
# config = AgentConfig(epsilon_start = 1.,
#                      epsilon_final = 0.01,
#                      epsilon_decay = 8000,
#                      gamma = 0.99, 
#                      lr = 1e-4, 
#                      target_net_update_freq = 1000, 
#                      memory_size = env.n_samples // 100, 
#                      batch_size = 128, 
#                      learning_starts = 5000,
#                      max_frames = env.n_samples)
# memory = ExperienceReplayMemory(config.memory_size)
# agent = DuelingDDQN(env.observation_space, env.action_space, config)
# adam = optim.Adam(agent.q.parameters(), lr = config.lr) 

# s = env.reset()
# ep_reward = 0. 
# recap = []
# cum_rewards = []

# p_bar = tqdm(total = config.max_frames)
# for frame in range(config.max_frames):

#     epsilon = config.epsilon_by_frame(frame)

#     if np.random.random() > epsilon: 
#         action = agent.get_action(s)
#     else: 
#         action = np.random.randint(0, env.action_space)

#     ns, r, done, infos = env.step(action)
#     ep_reward += r 
#     if done:
#         ns = env.reset()
#         recap.append(ep_reward)
#         p_bar.set_description('Rew: {:.3f}'.format(ep_reward))
#         ep_reward = 0.

#     memory.push((s.reshape(-1).numpy().tolist(), action, r, ns.reshape(-1).numpy().tolist(), 0. if done else 1.))
#     s = ns  

#     p_bar.update(1)

#     if frame > config.learning_starts:
#         agent.update_policy(adam, memory, config)

#     if frame % 1000 == 0:
#         print(f'{frame + 1}/{config.max_frames}:', ep_reward.item(), end = '\r')
# #         cum_rewards.append(ep_reward.item())
#         save(agent, cum_rewards)

# p_bar.close()

# Load Model

In [11]:
# checkpoint_path = './Net.pt'
checkpoint_path = '../input/js-dqn/Net.pt'

# Rebuild the two-action network, restore the saved weights, move it to the
# active device and switch it to evaluation mode.
model = DuelingNetwork(len(features), 2)
model.load_state_dict(torch.load(checkpoint_path, map_location = device))
model.to(device)
model.eval()

DuelingNetwork(
  (model): Sequential(
    (0): Linear(in_features=130, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
  )
  (value_head): Linear(in_features=256, out_features=1, bias=True)
  (adv_head): Linear(in_features=256, out_features=2, bias=True)
)

In [12]:
# f_mean = np.load('./f_mean.npy')
# f_std = np.load('./f_std.npy')

# Normalisation statistics used to fill and scale the features at inference
# (presumably the arrays written by the disabled preprocessing cell).
f_mean, f_std = (
    np.load('../input/js-dqn/f_mean.npy'),
    np.load('../input/js-dqn/f_std.npy'),
)

In [13]:
test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')

# The first feature column is left as is; all the others are filled with the
# stored means and then standardised.
scaled_cols = features[1:]
test[scaled_cols] = test[scaled_cols].fillna(dict(zip(scaled_cols, f_mean)))
test[scaled_cols] = (test[scaled_cols] - f_mean) / f_std

# Sanity check: print the raw Q-values for the example rows.
inputs = torch.FloatTensor(test[features].values).to(device)
pred = model(inputs).detach().cpu().numpy()
print(pred)

[[ 0.70579064 -0.04924998]
 [ 0.70596105  0.48139542]
 [ 0.68286705  0.2586545 ]
 ...
 [ 0.7388593   0.5991102 ]
 [ 0.70749444  0.57120484]
 [ 0.55012345  0.16774449]]


# Submitting

In [14]:
# Create the competition environment and the iterator that the submission
# loop consumes; each item it yields is unpacked there as (test_df, pred_df).
env = janestreet.make_env()
env_iter = env.iter_test()

In [15]:
# Submission loop: one prediction per (test_df, pred_df) pair.
for (test_df, pred_df) in tqdm(env_iter):
    if test_df['weight'].item() > 0:
        # Take an explicit writable copy: the array is edited in place below
        # and `.values` may hand back a view of (or a read-only window on)
        # the frame's own data.
        x_tt = test_df.loc[:, features].to_numpy(copy=True)
        if np.isnan(x_tt[:, 1:].sum()):
            # Replace missing entries with the stored per-feature means.
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
        # Standardise everything except the first feature column.
        x_tt[:, 1:] = (x_tt[:, 1:] - f_mean) / f_std
        x_tt = torch.FloatTensor(x_tt).to(device)
        # Pure inference: skip autograd bookkeeping, as `get_action` does.
        with torch.no_grad():
            pred_df.action = model(x_tt).max(1)[1].cpu().numpy().item()
    else:
        # Rows with non-positive weight get action 0 without running the model.
        pred_df.action = 0
    env.predict(pred_df)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


