<a href="https://colab.research.google.com/github/leopedroso1/StockMarket_Bot/blob/main/Stock_Market_Trading_Bot_v_1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

from datetime import datetime

import itertools
import argparse
import re
import os
import pickle

from sklearn.preprocessing import StandardScaler

In [3]:
# Gathering Data

def get_data(path="aapl_msi_sbux.csv"):
  """Load the historical price table used by the trading environment.

  Args:
    path: Location of the CSV file to read. Defaults to
      "aapl_msi_sbux.csv" in the current working directory, which is
      the file the original notebook used.

  Returns:
    A 2-D NumPy array (``DataFrame.values``) with one row per CSV row
    (time step) and one column per CSV column (stock).
    NOTE(review): per the original notes the default file has three
    columns ordered 0 - AAPL, 1 - MSI, 2 - SBUX; confirm against the
    CSV header.
  """
  df = pd.read_csv(path)

  return df.values

In [5]:
# Replay Buffer
# In-memory experience replay: stores past transitions so the agent can train on random samples of them
class ReplayBuffer:
  """Fixed-capacity circular buffer of (s, a, r, s', done) transitions.

  Once `size` transitions have been stored, each new transition
  overwrites the oldest one.
  """

  def __init__(self, obs_dim, act_dim, size):
    """Pre-allocate storage for `size` transitions.

    Args:
      obs_dim: Length of one observation vector.
      act_dim: Not used by the buffer (each action is stored as a single
        integer index); kept so the constructor signature is unchanged.
      size: Maximum number of transitions held at once.
    """
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    # uint8 can only hold values 0..255, so action indices must fit in that range.
    self.acts_buf = np.zeros(size, dtype=np.uint8)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    # Done flags are stored as 0/1 integers, not booleans.
    self.done_buf = np.zeros(size, dtype=np.uint8)
    # ptr: next slot to write; size: number of valid entries; max_size: capacity.
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    """Write one transition at the current pointer and advance it (wrapping around)."""
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    self.ptr = (self.ptr + 1) % self.max_size
    self.size = min(self.size + 1, self.max_size)

  def sample_batch(self, batch_size=32):
    """Return `batch_size` stored transitions drawn uniformly with replacement.

    Returns:
      dict with keys 's' (states), 's2' (next states), 'a' (actions),
      'r' (rewards) and 'd' (done flags as 0/1 uint8).

    Raises:
      ValueError: if the buffer is empty (np.random.randint(0, 0)).
    """
    idxs = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idxs],
                s2=self.obs2_buf[idxs],
                a=self.acts_buf[idxs],
                r=self.rews_buf[idxs],
                d=self.done_buf[idxs])


# The helpers below are module-level functions: the rest of the file calls
# them as get_scaler(...), maybe_make_dir(...) and mlp(...).

def get_scaler(env):
  """Fit a scikit-learn StandardScaler on states visited by random actions.

  Takes up to `env.n_step` random steps in `env` (stopping early when the
  environment reports `done`) and fits the scaler on the observed states.
  The environment is stepped from whatever position it is currently in and
  is not reset afterwards.

  Note: the replay buffer could also be populated here, and running several
  episodes would give better statistics.

  Args:
    env: Environment exposing `n_step`, `action_space` and a
      `step(action)` returning (state, reward, done, info).

  Returns:
    The fitted StandardScaler.
  """
  states = []

  for _ in range(env.n_step):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)

    if done:
      break

  scaler = StandardScaler()
  scaler.fit(states)

  return scaler


def maybe_make_dir(directory):
  """Create `directory` (including parents) if it does not already exist."""
  # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
  os.makedirs(directory, exist_ok=True)


def mlp(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
  """Build and compile a multi-layer perceptron that outputs one value per action.

  Args:
    input_dim: Size of the input (state) vector.
    n_action: Number of output units, one per action.
    n_hidden_layers: Number of ReLU hidden layers.
    hidden_dim: Units in each hidden layer.

  Returns:
    A Keras Model compiled with MSE loss and the Adam optimizer.
  """
  # Input Layer
  i = Input(shape=(input_dim,))
  x = i

  # Hidden Layers
  for _ in range(n_hidden_layers):
    x = Dense(hidden_dim, activation='relu')(x)

  # Output Layer (linear activation)
  x = Dense(n_action)(x)

  # Build the model
  model = Model(i, x)
  model.compile(loss='mse', optimizer='adam')
  # summary() prints the table itself and returns None, so it is not wrapped in print().
  model.summary()

  return model


In [None]:
class MultiStockEnv:

  """
  A multi-stock trading environment with a Gym-like reset/step API.

  State: vector of size n_stock * 2 + 1 (7 for three stocks)
    - # shares of stock 1 owned
    - # shares of stock 2 owned
    - # shares of stock 3 owned
    - Price of stock 1 (using only close price)
    - Price of stock 2 (using only close price)
    - Price of stock 3 (using only close price)
    - cash owned (can be used to purchase more stocks)

  Action: categorical variable with 3 ** n_stock possibilities (27 for
  three stocks). Each action index maps to one choice per stock:
    - 0 = sell
    - 1 = hold
    - 2 = buy

  """

  def __init__(self, data, initial_investment=20000):
    """Set up the environment and reset it to the first time step.

    Args:
      data: 2-D array of prices, shape (n_step, n_stock); one row per
        time step and one column per stock.
      initial_investment: Cash available at the start of every episode.
    """
    # Loading data
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    # Instance attributes (populated by reset())
    self.initial_investment = initial_investment
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    self.action_space = np.arange(3 ** self.n_stock)

    # Action permutations: action_list[k] is the per-stock decision
    # vector for action index k, e.g.
    # [0,0,0] >> sell all stocks
    # [0,0,1] >> sell 1st and 2nd stock but hold 3rd
    # [0,0,2]
    # [0,1,0]
    # [0,1,1]...
    # where 0 = sell, 1 = hold, 2 = buy
    self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

    # Size of the state vector: shares owned + prices + cash
    self.state_dim = self.n_stock * 2 + 1

    self.reset()

  def reset(self):
    """Return to the first time step with no shares and the initial cash.

    Returns:
      The initial observation vector.
    """
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment

    return self._get_obs()

  def step(self, action):
    """Advance one time step and execute `action` at the new prices.

    Args:
      action: Integer index into `action_space`.

    Returns:
      (observation, reward, done, info) where reward is the change in
      portfolio value, done is True on the last row of the price
      history, and info is {'cur_val': current portfolio value}.

    Raises:
      AssertionError: if `action` is not in `action_space`.
      IndexError: if called again after the last time step was reached.
    """
    assert action in self.action_space

    # Get current value before performing the action
    prev_val = self._get_val()

    # Update price, i.e. go to the next day
    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]

    # Perform trade
    self._trade(action)

    # Get the new value after taking the action
    cur_val = self._get_val()

    # Reward is the increase in portfolio value
    reward = cur_val - prev_val

    # Done if we have run out of data
    done = self.cur_step == self.n_step - 1

    # Store the current value of the portfolio here
    info = {'cur_val': cur_val}

    # Return the values similarly to the Gym API
    return self._get_obs(), reward, done, info

  def _get_obs(self):
    """Build the state vector: [shares owned..., prices..., cash]."""
    obs = np.empty(self.state_dim)
    obs[:self.n_stock] = self.stock_owned
    obs[self.n_stock: 2 * self.n_stock] = self.stock_price
    obs[-1] = self.cash_in_hand

    return obs

  def _get_val(self):
    """Return the portfolio value: shares valued at current prices plus cash."""
    return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

  def _trade(self, action):
    """Apply the per-stock decisions encoded by `action` at current prices.

    All sells are executed before any buys.
    """
    # Recall: index the action to be performed
    # 0 --> Sell
    # 1 --> Hold
    # 2 --> Buy
    # e.g. [0, 2, 1]: stock 1 >> sell / stock 2 >> buy / stock 3 >> hold
    action_vec = self.action_list[action]

    # Determine which stocks to buy or sell
    sell_index = [i for i, a in enumerate(action_vec) if a == 0]
    buy_index = [i for i, a in enumerate(action_vec) if a == 2]

    # NOTE: To simplify the problem, when we sell, we sell ALL shares of that stock
    for i in sell_index:
      self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
      self.stock_owned[i] = 0

    # The guard is required: with an empty buy_index the while loop below
    # would never clear can_buy.
    if buy_index:

      # NOTE: When buying, cycle through the stocks to buy, purchasing one
      # share of each per round, until a round hits a stock we cannot afford.
      can_buy = True
      while can_buy:
        for i in buy_index:

          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1
            self.cash_in_hand -= self.stock_price[i]

          else:
            can_buy = False

In [None]:
class DQNAgent(object):
  """Epsilon-greedy Deep Q-Network agent backed by a replay buffer."""

  def __init__(self, state_size, action_size):
    """Create the replay memory, exploration schedule and Q-network.

    Args:
      state_size: Length of the state vector fed to the network.
      action_size: Number of discrete actions (network outputs).
    """
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    self.gamma = 0.95  # Discount rate
    self.epsilon = 1.0  # Exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = mlp(state_size, action_size)

  def update_replay_memory(self, state, action, reward, next_state, done):
    """Store one transition in the replay buffer."""
    self.memory.store(state, action, reward, next_state, done)

  def act(self, state):
    """Choose an action for `state` with an epsilon-greedy policy.

    Args:
      state: Batch containing a single state (the first row of the
        prediction is used).

    Returns:
      A random action index with probability epsilon, otherwise the
      index with the highest predicted value.
    """
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)

    # verbose=0 keeps Keras from printing a progress bar on every step.
    act_values = self.model.predict(state, verbose=0)

    return np.argmax(act_values[0])  # Returns the action

  def replay(self, batch_size=32):
    """Run one training step on a random minibatch, then decay epsilon.

    Does nothing while the replay buffer holds fewer than `batch_size`
    transitions.
    """
    # First check if replay buffer contains enough data
    if self.memory.size < batch_size:
      return

    # Sample a batch of data from the replay memory
    minibatch = self.memory.sample_batch(batch_size)
    states = minibatch['s']
    actions = minibatch['a']
    rewards = minibatch['r']
    next_states = minibatch['s2']
    # The buffer stores done flags as 0/1 uint8. Indexing with that array
    # directly would select positions 0 and 1 instead of masking, so
    # convert it to a boolean mask first.
    done = minibatch['d'].astype(bool)

    # Calculate the tentative target: r + gamma * max_a' Q(s', a')
    target = rewards + self.gamma * np.amax(self.model.predict(next_states, verbose=0), axis=1)

    # The value of terminal states is zero, so set the target to be the reward only
    target[done] = rewards[done]

    # With the Keras API, the target (usually) must have the same shape as the predictions.
    # However, we only need to update the network for the actions which were actually taken.
    # We accomplish this by setting the target equal to the prediction for all values,
    # then changing only the targets for the actions taken, Q(s, a).
    target_full = self.model.predict(states, verbose=0)
    target_full[np.arange(batch_size), actions] = target

    # Run one training step
    self.model.train_on_batch(states, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    """Load network weights from the file `name`."""
    self.model.load_weights(name)

  def save(self, name):
    """Save network weights to the file `name`."""
    self.model.save_weights(name)

In [None]:
def play_one_episode(agent, env, is_train):
  """Run a single episode from reset until the environment reports done.

  Relies on the script-level globals `scaler` (used to transform every
  state) and `batch_size` (passed to `agent.replay`).

  Args:
    agent: Object providing act(), update_replay_memory() and replay().
    env: Environment providing reset() and step(action).
    is_train: Mode string; transitions are stored and the agent is
      trained only when it equals 'train'.

  Returns:
    The 'cur_val' entry of the info dict from the final step, i.e. the
    portfolio value at the end of the episode.
  """
  training = is_train == 'train'

  # NOTE: transform() receives a one-element list, so states are already 1 x D
  current = scaler.transform([env.reset()])

  while True:
    chosen = agent.act(current)
    raw_next, reward, finished, info = env.step(chosen)
    upcoming = scaler.transform([raw_next])

    if training:
      agent.update_replay_memory(current, chosen, reward, upcoming, finished)
      agent.replay(batch_size)

    current = upcoming

    if finished:
      break

  return info['cur_val']  # Current value of our portfolio

if __name__ == '__main__':

  # Config setup
  models_folder = 'rl_trader_models'
  rewards_folder = 'rl_trader_rewards'
  num_episodes = 2000
  batch_size = 32  # also read as a global by play_one_episode
  initial_investment = 20000

  # Allows us to run this code with command line arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('-m', '--mode', type=str, required=True, help='either "train" or "test"')

  args = parser.parse_args()

  maybe_make_dir(models_folder)
  maybe_make_dir(rewards_folder)

  data = get_data()
  n_timesteps, n_stocks = data.shape

  # First half of the history is used for training, second half for testing
  n_train = n_timesteps // 2

  train_data = data[:n_train]
  test_data = data[n_train:]

  env = MultiStockEnv(train_data, initial_investment)
  state_size = env.state_dim
  action_size = len(env.action_space)
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)  # also read as a global by play_one_episode

  # Store the final value of the portfolio (end of each episode)
  portifolio_value = []

  if args.mode == "test":

    # Then load the scaler saved by a previous training run.
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # scaler.pkl that was produced by this script.
    with open(f'{models_folder}/scaler.pkl', 'rb') as f:
      scaler = pickle.load(f)

    # Remake the env with test data
    env = MultiStockEnv(test_data, initial_investment)

    # Make sure epsilon is not 1
    # No need to run multiple episodes if epsilon= 0, it is deterministic
    agent.epsilon = 0.01

    # Load trained weights
    agent.load(f'{models_folder}/dqn.h5')

  for e in range(num_episodes):
    t0 = datetime.now()
    val = play_one_episode(agent, env, args.mode)
    dt = datetime.now() - t0
    print(f"episode: {e + 1} / {num_episodes}, episode end value: {val:.2f}, duration: {dt}")
    portifolio_value.append(val)  # Append episode end portfolio value

  # Save the weights when we are done
  if args.mode == "train":

    # Save DQN
    agent.save(f'{models_folder}/dqn.h5')

    # Save Scaler
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
      pickle.dump(scaler, f)

  # Save the per-episode final portfolio values for later plotting
  np.save(f'{rewards_folder}/{args.mode}.npy', portifolio_value)



# TO EXECUTE: python rl_trader.py -m train && python plot_rl_rewards.py -m train
# TO EXECUTE: python rl_trader.py -m test && python plot_rl_rewards.py -m test
# TODO: Test with other stocks whose value decreases over time, and explore new strategies

In [None]:
# Plotting the rewards that we saved

import matplotlib.pyplot as plt
import numpy as np
import argparse

# Command-line interface: -m/--mode selects which saved run ("train" or "test") to plot
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mode', type=str, required=True, help='either "train" or "test" ')

args = parser.parse_args()

# Load the per-episode final portfolio values from the folder the trading
# script writes them to ('rl_trader_rewards').
a = np.load(f'rl_trader_rewards/{args.mode}.npy')

print(f"average reward: {a.mean():.2f}, min: {a.min():.2f}, max:{a.max():.2f}")

# Histogram of end-of-episode portfolio values
plt.hist(a, bins=20)
plt.title(args.mode)
plt.show()