In [340]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
import os
import re
import itertools
import argparse
import pickle

from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.autograd import Variable

In [341]:
def get_data():
  df = pd.read_csv("/content/aapl_msi_sbux.csv")
  return df.values

In [342]:
MEAN_FIT = 0
STD_FIT = 0

In [343]:
def get_scaler(env):
  states = []
  for i in range(env.n_steps):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break
  states_arr = [state.numpy() for state in states]
  states_arr = np.array(states_arr)
  MEAN_FIT = states_arr.mean()
  STD_FIT = states_arr.std()

In [344]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [345]:
class Model(nn.Module):
  def __init__(self, input_dim, n_action):
    super(Model, self).__init__()
    self.input_dim = input_dim
    self.n_action = n_action
    self.n_hidden = 2 * input_dim // 3

    self.model = nn.Sequential(
      nn.Linear(self.input_dim, self.n_hidden),
      nn.ReLU(),
      nn.Linear(self.n_hidden, self.n_action)
    )

  def forward(self, X):
    out = self.model(X)
    return out

In [346]:
class LinearModel:
  "Linear Regression Model"
  def __init__(self, input_dim, n_action):
    self.W = np.random.randn(input_dim, n_action) / np.sqrt(input_dim)
    self.b = np.zeros(n_action)

    self.momentumW = 0
    self.momentumb = 0

    self.losses = []

  def predict(self, data):
    assert(len(data.shape) == 2)
    return data.dot(self.W) + self.b

  def sgd(self, X, Y, lr=0.01, momentum=0.9):
    assert(len(X.shape) == 2)
    num_values = np.prod(X.shape)

    Yhat = self.predict(X)
    gradW = 2 * X.T.dot(Yhat - Y) / num_values
    gradb = 2 * (Yhat - Y).sum(axis=0) / num_values

    self.momentumW = momentum * self.momentumW - lr * gradW
    self.momentumb = momentum * self.momentumb - lr * gradb

    self.W += self.momentumW
    self.b += self.momentumb

    mse = np.mean((Yhat - Y) ** 2)
    self.losses.append(mse)

  def load_weights(self, filepath):
    npz = np.load(filepath)
    self.W = npz['W']
    self.b = npz['b']

  def save_weights(self, filepath):
    np.savez(filepath, W=self.W, b=self.b)

In [347]:
class MultiStockEnv:
  """
  A 3-stock trading environment.
  State: vector of size 7 (n_stock * 2 + 1)
    - number of shares of stock 1 owned
    - number of shares of stock 2 owned
    - number of shares of stock 3 owned
    - price of stock 1 (using daily close price)
    - price of stock 2
    - price of stock 3
    - cash owned (can be used to purchase more stocks)
  Action: categorical variable with 27 (3^3) possibilities
    - for each stock, you can:
    - 0 = sell
    - 1 = hold
    - 2 = buy
  """
  def __init__(self, data, initial_investment=20000):
    # data
    self.stock_price_history = data
    self.n_steps, self.n_stocks = self.stock_price_history.shape

    self.initial_investment = initial_investment

    self.action_space = np.arange(3 ** self.n_stocks)
    # action permutations
    # returns a nested list with elements like:
    # [0,0,0]
    # [0,0,1]
    # [0,0,2]
    # [0,1,0]
    # [0,1,1]
    # etc.
    # 0 = sell
    # 1 = hold
    # 2 = buy
    self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stocks)))
    self.state_dim = 2 * self.n_stocks + 1
    self.reset()

  def reset(self):
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stocks)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment
    return self.get_observant()

  def step(self, action):
    assert action in self.action_space
    prev_value = self.get_value()

    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]

    self.trade(action)
    current_value = self.get_value()
    reward = current_value - prev_value

    done = self.cur_step == self.n_steps - 1
    info = {"current_value": current_value}

    return self.get_observant(), reward, done, info

  def get_observant(self):
    observant = np.empty(self.state_dim)
    observant[:self.n_stocks] = self.stock_owned
    observant[self.n_stocks:2 * self.n_stocks] = self.stock_price
    observant[-1] = self.cash_in_hand
    return torch.Tensor(observant)

  def get_value(self):
    return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

  # index the action we want to perform
  # 0 = sell
  # 1 = hold
  # 2 = buy
  # e.g. [2,1,0] means:
  # buy first stock
  # hold second stock
  # sell third stock
  def trade(self, action):
    action_vec = self.action_list[action]
    sell_index = [idx for idx, act in enumerate(action_vec) if act == 0]
    buy_index = [idx for idx, act in enumerate(action_vec) if act == 2]

    # sell before buy
    if sell_index:
      for idx in sell_index:
        self.cash_in_hand += self.stock_price[idx] * self.stock_owned[idx]
        self.stock_owned[idx] = 0

    if buy_index:
      can_buy = True
      while can_buy:
        for idx in buy_index:
          if self.cash_in_hand > self.stock_price[idx]:
            self.cash_in_hand -= self.stock_price[idx]
            self.stock_owned[idx] += 1
          else:
            can_buy = False

In [348]:
class DQNAgent:
    def __init__(self, state_size, action_size):
      self.model = Model(state_size, action_size)
      self.state_size = state_size
      self.action_size = action_size
      self.discount_rate = 0.95
      self.epsilon = 1.0
      self.epsilon_min = 0.01
      self.epsilon_decay = 0.995
      self.criterion = nn.MSELoss()
      self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=0.01)
      self.losses = []

    def act_(self, state):
      if np.random.random() < self.epsilon:
        return np.random.choice(self.action_size)
      act_values = self.model(state)
      return torch.argmax(act_values).item()

    def train(self, state, action, reward, next_state, done):
      if done:
        target = reward
      else:
        target = reward + self.discount_rate * torch.max(self.model(next_state)).item()

      target_full = self.model(state)
      target_full[action] = target

      # train
      self.optimizer.zero_grad()
      outputs = self.model(state)
      loss = self.criterion(outputs, target_full)

      # Backward and Optimize
      loss.backward()
      self.optimizer.step()

      self.losses.append(loss.item())

      if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

    def load(self, name):
      torch.load(name, map_location=torch.device('cpu'))

    def save(self, name):
      torch.save(self.model, name)

In [349]:
def play_one_episode(agent, env, is_train):
  state = env.reset()
  # PyTorch impl
  state -= MEAN_FIT
  state /= STD_FIT + 1e-7
  done = False
  while not done:
    action = agent.act_(state)
    next_state, reward, done, info = env.step(action)
    next_state -= MEAN_FIT
    next_state /= STD_FIT + 1e-7
    if is_train == 'train':
      agent.train(state, action, reward, next_state, done)
    state = next_state

  return info['current_value']

In [352]:
# config
MODELS_FOLDER = '/content/rl_trader_models'
REWARDS_FOLDER = '/content/rl_trader_rewards'
NUM_EPISODES = 2000
BATCH_SIZE = 32
INITIAL_INVESTMENT = 2000
MODE = "test"

In [353]:
if __name__ == "__main__":

  if not os.path.exists(MODELS_FOLDER):
    os.makedirs(MODELS_FOLDER)
  if not os.path.exists(REWARDS_FOLDER):
    os.makedirs(REWARDS_FOLDER)

  data = get_data()
  n_timesteps, n_stocks = data.shape
  n_train = n_timesteps // 2

  train_data = data[:n_train]
  test_data = data[n_train:]

  env = MultiStockEnv(train_data, INITIAL_INVESTMENT)
  state_size = env.state_dim
  action_size = len(env.action_space)
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)

  portfolio_value = []

  # to really test the algorithm choose stocks that go up and down
  if MODE == "test":
    with open(f'{MODELS_FOLDER}/scaler.pkl', 'rb') as f:
      scaler = pickle.load(f)

    env = MultiStockEnv(test_data, INITIAL_INVESTMENT)

    # make sure epsilon is not 1!
    # no need to run multiple episodes if epsilon = 0, it's deterministic
    agent.epsilon = 0.01
    agent.load(f'{MODELS_FOLDER}/rl_trader_weight.pt')

  # play the game num_episodes times
  for episode in range(NUM_EPISODES):
    t0 = datetime.now()
    value = play_one_episode(agent, env, MODE)
    dt = datetime.now() - t0
    print(f"episode: {episode + 1}/{NUM_EPISODES}, episode end value: {value:.2f}, duration: {dt}")
    portfolio_value.append(value)

  # save the weights when we are done
  if MODE == 'train':
    # save the DQN
    agent.save(f'{MODELS_FOLDER}/rl_trader_weight.pt')

    # save the scaler
    with open(f'{MODELS_FOLDER}/scaler.pkl', 'wb') as f:
      pickle.dump(scaler, f)

    # plot losses
    plt.plot(agent.losses)
    plt.show()


  # save portfolio value for each episode
  np.save(f'{REWARDS_FOLDER}/{MODE}.npy', portfolio_value)


episode: 1/2000, episode end value: 4136.45, duration: 0:00:00.117562
episode: 2/2000, episode end value: 3148.48, duration: 0:00:00.136021
episode: 3/2000, episode end value: 4242.41, duration: 0:00:00.129025
episode: 4/2000, episode end value: 3234.11, duration: 0:00:00.121943
episode: 5/2000, episode end value: 3016.93, duration: 0:00:00.143030
episode: 6/2000, episode end value: 5047.30, duration: 0:00:00.118399
episode: 7/2000, episode end value: 3232.87, duration: 0:00:00.129664
episode: 8/2000, episode end value: 3245.83, duration: 0:00:00.112087
episode: 9/2000, episode end value: 2922.13, duration: 0:00:00.123033
episode: 10/2000, episode end value: 3209.54, duration: 0:00:00.129416
episode: 11/2000, episode end value: 3161.76, duration: 0:00:00.125345
episode: 12/2000, episode end value: 3212.16, duration: 0:00:00.113212
episode: 13/2000, episode end value: 2978.98, duration: 0:00:00.116435
episode: 14/2000, episode end value: 3221.08, duration: 0:00:00.111567
episode: 15/200