In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from datetime import datetime
import itertools
import argparse
import re
import os
import pickle
from sklearn.preprocessing import StandardScaler
import time
import datetime

In [2]:
import alpaca_trade_api as alpaca
import matplotlib.pyplot as plt
import json
import requests
import yfinance as yf

# Config

In [3]:

# Alpaca connection settings.
# SECURITY: credentials must never be committed inside a notebook. They are
# read from the environment instead (variable names follow the Alpaca SDK
# convention). Any key pair that was previously hardcoded in this notebook
# should be considered leaked and revoked in the Alpaca dashboard.
end_point = os.environ.get('APCA_API_BASE_URL', 'https://paper-api.alpaca.markets')
api_key = os.environ.get('APCA_API_KEY_ID')
api_secret = os.environ.get('APCA_API_SECRET_KEY')

if not api_key or not api_secret:
    raise RuntimeError(
        "Alpaca credentials not found: set the APCA_API_KEY_ID and "
        "APCA_API_SECRET_KEY environment variables before running this notebook."
    )

# REST client shared by every helper in this notebook.
api = alpaca.REST(api_key, api_secret, end_point, api_version='v2')


# Helper functions

In [4]:
def calculate_order_size(cash_to_spend, latest_price):
    """Return how many units can be afforded, rounded down to 4 decimal places.

    Args:
        cash_to_spend: Amount of cash available for the order.
        latest_price: Latest price of one unit.

    Returns:
        float: ``cash_to_spend / latest_price`` truncated to a multiple of
        ``1 / precision_factor`` (0.0001).

    Raises:
        ZeroDivisionError: If ``latest_price`` is 0.
    """
    precision_factor = 10000
    # Scale up, drop the fractional part, scale back down. The previous
    # version skipped the truncation, so the scaling cancelled out and the
    # result was never rounded. int() truncates toward zero, which is
    # "round down" for the non-negative amounts this is meant for.
    whole_steps = int(cash_to_spend * precision_factor / latest_price)
    return whole_steps / precision_factor

In [5]:
import time

# Upper bound on attempts made by the retry loops in the Alpaca helper
# functions of this cell (buy, sell, get_cash, get_buying_power).
MAX_RETRIES = 10
# Delay, in seconds, passed to time.sleep() between two attempts.
sleep_time = 3

def _submit_market_order(symbol, qty, side):
    """Submit a GTC market order, retrying up to MAX_RETRIES times.

    Args:
        symbol: Symbol passed through to ``api.submit_order``.
        qty: Quantity passed through to ``api.submit_order``.
        side: ``'buy'`` or ``'sell'``.

    Returns:
        bool: True if an attempt went through without raising, False if
        every attempt raised.
    """
    # NOTE(review): a request that reached the broker but failed on the way
    # back would be submitted again by the retry -- confirm this is acceptable.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            api.submit_order(symbol=symbol, qty=qty, side=side, type='market', time_in_force='gtc')
        except Exception as e:
            print(f"Error occurred while placing the order: {e}")
            # Only wait when another attempt will follow.
            if attempt < MAX_RETRIES:
                time.sleep(sleep_time)
        else:
            print("Order placed successfully!")
            return True
    print("Failed to place order after multiple attempts.")
    return False


def buy(symbol, qty):
    """Place a market buy order for ``qty`` units of ``symbol``.

    Returns:
        bool: True when the order was submitted, False after MAX_RETRIES
        failed attempts (the previous version returned None in both cases).
    """
    return _submit_market_order(symbol, qty, 'buy')


def sell(symbol, qty):
    """Place a market sell order for ``qty`` units of ``symbol``.

    Returns:
        bool: True when the order was submitted, False after MAX_RETRIES
        failed attempts (the previous version returned None in both cases).
    """
    return _submit_market_order(symbol, qty, 'sell')

def get_cash():
    """Return the account's cash balance as a float.

    The account is fetched through ``api.get_account()``. Any exception is
    printed and followed by a ``sleep_time`` second pause, for at most
    MAX_RETRIES attempts. When every attempt fails, 0.0 is returned.
    """
    for _attempt in range(MAX_RETRIES):
        try:
            return float(api.get_account().cash)
        except Exception as e:
            print("An error occurred while getting the account balance:", e)
        # Reached only when the attempt above raised.
        time.sleep(sleep_time)

    print("Failed to get account balance after multiple attempts.")
    return 0.0

def get_buying_power():
    """Return the account's buying power as a float.

    Retries ``api.get_account()`` up to MAX_RETRIES times, printing the error
    and sleeping ``sleep_time`` seconds after each failure. Falls back to 0.0
    when no attempt succeeds.
    """
    attempts_left = MAX_RETRIES
    while attempts_left > 0:
        try:
            snapshot = api.get_account()
            return float(snapshot.buying_power)
        except Exception as e:
            print("An error occurred while getting the buying power:", e)
            attempts_left -= 1
            time.sleep(sleep_time)

    print("Failed to get buying power after multiple attempts.")
    return 0.0


def get_qty_avaiable(symbol):
    """Return ``qty_available`` of the position held in ``symbol`` as a float.

    No retry or error handling is done here: whatever ``api.get_position``
    raises propagates to the caller.
    """
    return float(api.get_position(symbol).qty_available)

def get_portfolio_value():
    """Return the account's ``portfolio_value`` as a float.

    A single ``api.get_account()`` call is made; exceptions propagate.
    """
    return float(api.get_account().portfolio_value)


In [6]:
def get_latest_price(symbol):
    """Return the most recent daily close for ``symbol`` from yfinance.

    Args:
        symbol: Ticker understood by ``yf.Ticker`` (e.g. ``'BTC-USD'``).

    Returns:
        float: The last non-missing ``Close`` value of the 1-day history.

    Raises:
        IndexError: If no usable close price was returned. This is the same
            exception type the unguarded ``.iloc[-1]`` used to raise, but it
            now names the symbol.
    """
    history = yf.Ticker(symbol).history(period="1d")  # Fetch daily data
    if not history.empty:
        # Keep the last row for any duplicated timestamp, then drop rows
        # without a close price.
        history = history[~history.index.duplicated(keep='last')]
        history = history.dropna(subset=['Close'])
    if history.empty:
        raise IndexError(f"No closing price available for symbol {symbol!r}")
    return float(history['Close'].iloc[-1])


In [7]:
def maybe_make_dir(directory):
  """Create ``directory`` (and missing parents) if it does not exist yet.

  ``exist_ok=True`` replaces the previous exists-then-create sequence, which
  could fail if the directory appeared between the check and the creation.

  Raises:
      FileExistsError: If ``directory`` exists but is not a directory.
  """
  os.makedirs(directory, exist_ok=True)

In [8]:
def get_scaler(env):
  """Fit and return a StandardScaler on states visited by random actions.

  At most ``env.n_step`` random actions drawn from ``env.action_space`` are
  played through ``env.step``; collection stops early once the environment
  reports ``done``. The environment is not reset here.
  """
  # Note: the replay buffer could also be populated from this loop.
  visited = []
  remaining = env.n_step
  while remaining > 0:
    remaining -= 1
    chosen = np.random.choice(env.action_space)
    observation, _reward, finished, _info = env.step(chosen)
    visited.append(observation)
    if finished:
      break

  fitted = StandardScaler()
  fitted.fit(visited)
  return fitted

In [9]:
def play_one_episode(agent, env, is_train):
  """Run one full episode and return the final ``info['cur_val']``.

  States are passed through the module-level ``scaler`` before reaching the
  agent, so they are already 1xD. When ``is_train`` equals ``'train'`` every
  transition is stored and followed by one ``agent.replay(batch_size)`` call
  (``batch_size`` is a module-level name as well).
  """
  current = scaler.transform([env.reset()])

  while True:
    chosen = agent.act(current)
    raw_next, gain, finished, details = env.step(chosen)
    upcoming = scaler.transform([raw_next])
    if is_train == 'train':
      agent.update_replay_memory(current, chosen, gain, upcoming, finished)
      agent.replay(batch_size)
    current = upcoming
    if finished:
      return details['cur_val']

In [10]:
def mlp(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
  """Build and compile a multi-layer perceptron.

  Args:
      input_dim: Size of the input vector.
      n_action: Number of units in the (linear) output layer.
      n_hidden_layers: How many ReLU hidden layers to stack.
      hidden_dim: Number of units in each hidden layer.

  Returns:
      A Keras ``Model`` compiled with MSE loss and the Adam optimizer.
  """
  # input layer
  i = Input(shape=(input_dim,))
  x = i

  # hidden layers
  for _ in range(n_hidden_layers):
    x = Dense(hidden_dim, activation='relu')(x)

  # final layer (no activation)
  x = Dense(n_action)(x)

  # make the model
  model = Model(i, x)

  model.compile(loss='mse', optimizer='adam')
  # summary() prints by itself and returns None; wrapping it in print()
  # used to emit a stray "None" line after the table.
  model.summary()
  return model

In [11]:
### The experience replay memory ###
class ReplayBuffer:
  """Fixed-size ring buffer of (obs, action, reward, next_obs, done) tuples.

  Args:
      obs_dim: Length of one observation vector.
      act_dim: Accepted for interface compatibility; not used for storage
          because actions are stored as a single integer index.
      size: Maximum number of transitions kept; older ones are overwritten.
  """

  def __init__(self, obs_dim, act_dim, size):
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    # int64 instead of uint8: a uint8 buffer cannot hold action indices
    # >= 256, which a 3**n_stock action space reaches from n_stock = 6.
    self.acts_buf = np.zeros(size, dtype=np.int64)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.uint8)
    # ptr: next slot to write, size: number of valid entries
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    """Write one transition at the current pointer and advance it."""
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    # wrap around once the buffer is full
    self.ptr = (self.ptr+1) % self.max_size
    self.size = min(self.size+1, self.max_size)

  def sample_batch(self, batch_size=32):
    """Return ``batch_size`` transitions sampled uniformly with replacement.

    The result is a dict with keys ``s``, ``s2``, ``a``, ``r`` and ``d``.
    The buffer must hold at least one transition.
    """
    idxs = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idxs],
                s2=self.obs2_buf[idxs],
                a=self.acts_buf[idxs],
                r=self.rews_buf[idxs],
                d=self.done_buf[idxs])

## Environment for agent trading on Alpaca

In [12]:
class EvnAlpaca:
    """
    A live multi-asset trading environment backed by the Alpaca account.

    State: vector of size n_stock * 2 + 1 (7 for three symbols)
        - # shares of each asset owned (n_stock values)
        - latest price of each asset (n_stock values, daily close from yfinance)
        - cash owned (can be used to purchase more assets)
    Action: categorical variable with 3^n_stock possibilities (27 for three)
        - for each asset, you can:
        - 0 = sell
        - 1 = hold
        - 2 = buy

    Args:
        symbols: Symbols used when placing orders (e.g. 'BTCUSD').
        initial_investment: Stored on the instance; not used by the
            environment logic itself.
        price_symbols: Optional yfinance tickers, one per entry of
            ``symbols``. When omitted they are derived from ``symbols``
            ('BTCUSD' -> 'BTC-USD').
    """

    def __init__(self, symbols, initial_investment, price_symbols=None):
        # data
        self.symbols = symbols
        self.n_stock = len(symbols)
        self.n_step = 25

        # Price tickers follow the trading symbols instead of a hardcoded
        # list, so the environment works for any set of symbols.
        if price_symbols is None:
            price_symbols = [self._to_yahoo_symbol(s) for s in symbols]
        price_symbols = list(price_symbols)
        if len(price_symbols) != self.n_stock:
            raise ValueError("price_symbols must have one entry per symbol")
        self.price_symbols = price_symbols

        # instance attributes (cash and prices are filled in by reset())
        self.initial_investment = initial_investment
        self.cur_step = None
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = np.zeros(self.n_stock)
        self.cash_in_hand = 0.0

        self.action_space = np.arange(3**self.n_stock)

        # action permutations
        # returns a nested list with elements like:
        # [0,0,0]
        # [0,0,1]
        # [0,0,2]
        # [0,1,0]
        # [0,1,1]
        # etc.
        # 0 = sell
        # 1 = hold
        # 2 = buy
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

        # calculate size of state
        self.state_dim = self.n_stock * 2 + 1

        self.reset()

    @staticmethod
    def _to_yahoo_symbol(symbol):
        """Map an order symbol such as 'BTCUSD' or 'BTC/USD' to 'BTC-USD'."""
        if '-' in symbol:
            return symbol
        if '/' in symbol:
            return symbol.replace('/', '-')
        if symbol.endswith('USD') and len(symbol) > 3:
            return f"{symbol[:-3]}-USD"
        return symbol

    def reset(self):
        """Start a new episode and return the initial observation.

        Only the local bookkeeping is reset; positions actually held in the
        brokerage account are not touched or read back.
        """
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self._get_stock_prices()
        self.cash_in_hand = get_cash()
        return self._get_obs()

    def step(self, action):
        """Wait for the next full hour, trade, and return (obs, reward, done, info)."""
        assert action in self.action_space

        # get current value before performing the action
        prev_val = self._get_val()

        # update price, i.e. go to the next hour
        current_time = datetime.datetime.now()
        next_hour = current_time.replace(minute=0, second=0, microsecond=0) + datetime.timedelta(hours=1)
        time_to_wait = (next_hour - current_time).total_seconds()
        print("Waiting for next hour to start. Time to wait: ", time_to_wait)
        time.sleep(time_to_wait)

        self.cur_step += 1
        self.stock_price = self._get_stock_prices()

        # perform the trade
        self._trade(action)

        # get the new value after taking the action
        cur_val = self._get_val()

        # reward is the increase in portfolio value
        reward = cur_val - prev_val

        # done once the step budget is used up; >= keeps reporting done if
        # step() is called again without a reset
        done = self.cur_step >= self.n_step - 1

        # store the current value of the portfolio here
        info = {'cur_val': cur_val}

        # conform to the Gym API
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        """Return [owned..., prices..., cash] as a float array of length state_dim."""
        obs = np.empty(self.state_dim)
        obs[:self.n_stock] = self.stock_owned
        obs[self.n_stock:2*self.n_stock] = self.stock_price
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        """Return the portfolio value reported by the brokerage account."""
        return get_portfolio_value()

    def _trade(self, action):
        """Execute the sell and buy orders encoded by ``action``."""
        action_vec = self.action_list[action]
        sell_index = [i for i, a in enumerate(action_vec) if a == 0]
        buy_index = [i for i, a in enumerate(action_vec) if a == 2]

        # sell everything we own of each asset flagged "sell"
        for i in sell_index:
            qty = int(self.stock_owned[i])
            if qty > 0:
                symbol = self.symbols[i]
                print("self.cash_in_hand: ", get_cash(), "self.stock_price[i]: ", self.stock_price[i])
                sell(symbol, qty)
                # NOTE(review): assumes the sell order went through; a failed
                # order would leave the local count out of sync -- confirm.
                self.stock_owned[i] = 0

        # buy one unit at a time, round-robin, until cash runs out
        can_buy = bool(buy_index)
        while can_buy:
            for i in buy_index:
                symbol = self.symbols[i]
                qty = 1
                self.cash_in_hand = get_cash()
                if self.cash_in_hand > self.stock_price[i]:
                    cash_before = self.cash_in_hand
                    print("self.cash_in_hand: ", self.cash_in_hand, "self.stock_price[i]: ", self.stock_price[i])
                    buy(symbol, qty)
                    self.stock_owned[i] = get_qty_avaiable(symbol)
                    self.cash_in_hand = get_cash()
                    if self.cash_in_hand >= cash_before:
                        # Cash did not go down: the order failed or has not
                        # filled. Stop instead of looping forever.
                        can_buy = False
                else:
                    can_buy = False

        # keep the observed cash in sync after sell-only actions as well
        self.cash_in_hand = get_cash()

    def _get_stock_prices(self):
        """Return the latest price of every asset as a numpy array."""
        return np.array([get_latest_price(ticker) for ticker in self.price_symbols])

## DQNAgent

In [13]:
class DQNAgent(object):
  """Epsilon-greedy Q-learning agent with a small replay memory.

  The Q-function is the network built by ``mlp(state_size, action_size)``
  and transitions are kept in a ``ReplayBuffer`` of 500 entries.
  """

  def __init__(self, state_size, action_size):
    self.state_size, self.action_size = state_size, action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    # discount rate
    self.gamma = 0.95
    # exploration rate, multiplied by epsilon_decay after each training
    # step until it is no longer above epsilon_min
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = mlp(state_size, action_size)

  def update_replay_memory(self, state, action, reward, next_state, done):
    """Forward one transition to the replay buffer."""
    self.memory.store(state, action, reward, next_state, done)

  def act(self, state):
    """Pick a random action with probability epsilon, else the greedy one."""
    explore = np.random.rand() <= self.epsilon
    if explore:
      return np.random.choice(self.action_size)
    q_values = self.model.predict(state)
    return np.argmax(q_values[0])

  def replay(self, batch_size=32):
    """Run one training step on a sampled minibatch, then decay epsilon.

    Nothing happens while the buffer holds fewer than ``batch_size``
    transitions.
    """
    if self.memory.size < batch_size:
      return

    batch = self.memory.sample_batch(batch_size)
    states, actions, rewards = batch['s'], batch['a'], batch['r']
    next_states, done = batch['s2'], batch['d']

    # Tentative target r + gamma * max_a' Q(s', a'); the bootstrap term is
    # zeroed for terminal transitions.
    best_next_q = np.amax(self.model.predict(next_states), axis=1)
    target = rewards + (1 - done) * self.gamma * best_next_q

    # Keras wants targets shaped like the predictions, so start from the
    # current predictions Q(s, .) and overwrite only the entry of the action
    # that was actually taken in each row.
    target_full = self.model.predict(states)
    rows = np.arange(batch_size)
    target_full[rows, actions] = target

    self.model.train_on_batch(states, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    """Load network weights from ``name``."""
    self.model.load_weights(name)

  def save(self, name):
    """Save network weights to ``name``."""
    self.model.save_weights(name)

# Main

In [14]:
# config
models_folder = 'rl_trader_models'
rewards_folder = 'rl_trader_rewards'
num_episodes = 10
batch_size = 32
symbols = ['BTCUSD', 'ETHUSD', 'AVAXUSD']
initial_investment = get_cash()
print(f"Starting with ${initial_investment:.2f}")
args = "test"  # run mode: 'train' or 'test'

maybe_make_dir(models_folder)
maybe_make_dir(rewards_folder)

env = EvnAlpaca(symbols, initial_investment)
state_size = env.state_dim
action_size = len(env.action_space)
agent = DQNAgent(state_size, action_size)

# store the final value of the portfolio (end of episode)
portfolio_value = []

if args == 'test':
    # Reuse the scaler fitted during training. It used to be overwritten
    # right after loading by get_scaler(env), which plays random actions on
    # the live account, one per hour.
    # SECURITY: pickle.load executes arbitrary code; only load a scaler file
    # that this notebook produced itself.
    with open(f'{models_folder}/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # make sure epsilon is not 1!
    # no need to run multiple episodes if epsilon = 0, it's deterministic
    agent.epsilon = 0.01

    # load trained weights
    agent.load(f'{models_folder}/dqn.h5')
else:
    # Training needs a scaler too; without this branch play_one_episode
    # failed on the undefined name `scaler`. This plays random actions
    # through env.step, which waits for the next hour on every step.
    scaler = get_scaler(env)

# play the game num_episodes times
for e in range(num_episodes):
    # `datetime` is the module here (the later `import datetime` rebinds the
    # name), so the class has to be reached through it.
    t0 = datetime.datetime.now()
    val = play_one_episode(agent, env, args)
    dt = datetime.datetime.now() - t0
    print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
    portfolio_value.append(val) # append episode end portfolio value

if args == 'train':
    # persist what test mode loads: the network weights and the scaler
    agent.save(f'{models_folder}/dqn.h5')
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

# save portfolio value for each episode
np.save(f'{rewards_folder}/{args}.npy', portfolio_value)

Starting with $10000.00
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 7)]               0         
                                                                 
 dense (Dense)               (None, 32)                256       
                                                                 
 dense_1 (Dense)             (None, 27)                891       
                                                                 
Total params: 1,147
Trainable params: 1,147
Non-trainable params: 0
_________________________________________________________________
None


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Waiting for next hour to start. Time to wait:  315.111982
Waiting for next hour to start. Time to wait:  3595.496025
