In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [3]:
from itertools import product
from datetime import datetime

### Load Dataset

In [4]:
# Download the price table straight from GitHub (requires network access).
# NOTE(review): the file name suggests AAPL / MSI / SBUX prices — confirm the
# column order against the CSV header if it matters downstream.
df = pd.read_csv('https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/tf2.0/aapl_msi_sbux.csv')
# Plain 2-D array view of the table; Environment unpacks data.shape as
# (n_step, n_stocks), i.e. one row per time step and one column per stock.
data = df.values

### Build Environment, Agent, ReplayBuffer, and Model

In [5]:
class Environment:
    """Multi-stock trading environment with a discrete action space.

    The observation is a vector of length ``2 * n_stocks + 1`` laid out as
    ``[shares owned per stock..., current price per stock..., cash]``.

    An action is an integer index into ``action_list``; each entry is a list
    with one code per stock: 0 = sell all shares, 1 = hold, 2 = buy.

    Parameters
    ----------
    data : 2-D array
        Price history; ``data.shape`` is read as ``(n_step, n_stocks)``.
    initial_investment : number, default 20000
        Cash available at the start of every episode.
    """

    def __init__(self, data, initial_investment=20000):
        self.stock_prices_history = data
        self.initial_investment = initial_investment
        self.n_step, self.n_stocks = data.shape

        # Per-episode state; populated by reset().
        self.cur_step = None
        self.stock_owned = None
        self.cur_stock_price = None
        self.cash = None

        # Every combination of {sell, hold, buy} across the stocks.
        self.action_space = np.arange(3**self.n_stocks)
        self.action_list = list(map(list, product([0, 1, 2], repeat=self.n_stocks)))  # 0 sell, 1 hold, 2 buy
        self.state_dim = self.n_stocks * 2 + 1

        self.reset()

    def step(self, action):
        """Advance one time step, then execute ``action`` at the new prices.

        Returns
        -------
        tuple
            ``(next_obs, reward, done, info)`` where ``reward`` is the change
            in total portfolio value across the step and ``info`` is
            ``{'cur_val': <portfolio value after the step>}``.
        """
        assert action in self.action_space

        prev_val = self.get_current_value()
        self.cur_step += 1
        self.cur_stock_price = self.stock_prices_history[self.cur_step]

        self.trade(action)

        cur_val = self.get_current_value()

        s_next = self.get_obs()
        reward = cur_val - prev_val
        done = self.is_terminal()
        info = {'cur_val': cur_val}

        return s_next, reward, done, info

    def trade(self, action):
        """Apply the per-stock sell/hold/buy codes of ``action``.

        Sells happen first so their proceeds can fund the buys. Buying is
        round-robin, one share at a time over the stocks marked "buy", and
        stops as soon as the next targeted share cannot be afforded.

        Prices are assumed to be strictly positive; with a non-positive
        price the cash never decreases and the buy loop would not terminate.
        """
        action_vec = self.action_list[action]

        # sell: liquidate the whole position of every stock marked 0
        for i in range(self.n_stocks):
            if action_vec[i] == 0 and self.stock_owned[i] > 0:
                self.cash += self.stock_owned[i] * self.cur_stock_price[i]
                self.stock_owned[i] = 0

        # buy: only stocks marked 2 take part in the affordability check, so
        # an expensive stock that is merely held/sold cannot block purchases
        buy_idx = [i for i, code in enumerate(action_vec) if code == 2]
        can_buy = bool(buy_idx)
        while can_buy:
            for i in buy_idx:
                if self.cur_stock_price[i] > self.cash:
                    can_buy = False
                    break
                self.cash -= self.cur_stock_price[i]
                self.stock_owned[i] += 1

    def get_obs(self):
        """Return the observation vector ``[owned..., prices..., cash]``."""
        obs = np.empty(self.state_dim)
        obs[:self.n_stocks] = self.stock_owned
        obs[self.n_stocks:2*self.n_stocks] = self.cur_stock_price
        obs[-1] = self.cash
        return obs

    def reset(self):
        """Start a new episode at step 0 with no shares and full cash.

        Returns the initial observation.
        """
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stocks)
        self.cash = self.initial_investment
        self.cur_stock_price = self.stock_prices_history[self.cur_step]
        return self.get_obs()

    def get_current_value(self):
        """Total portfolio value: shares valued at current prices, plus cash."""
        return self.stock_owned.dot(self.cur_stock_price) + self.cash

    def is_terminal(self):
        """True once the last row of the price history has been reached."""
        return self.cur_step == self.n_step-1

In [6]:
class ReplayBuffer:
    """Fixed-size ring buffer of (obs, action, reward, next_obs, done) tuples.

    Once ``size`` transitions have been stored, each new transition
    overwrites the oldest one.

    Parameters
    ----------
    obs_dim : int
        Length of one observation vector.
    action_dim : int
        Number of discrete actions. It is not needed to size any buffer
        (one action index is stored per transition) and is kept for
        interface compatibility.
    size : int
        Maximum number of transitions retained.
    """

    def __init__(self, obs_dim, action_dim, size):
        self.obs_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs_next_buf = np.zeros([size, obs_dim], dtype=np.float32)
        # int64 rather than uint8: action indices >= 256 (e.g. 3**6 = 729
        # actions for six stocks) must not wrap around or overflow.
        self.actions_buf = np.zeros(size, dtype=np.int64)
        self.rewards_buf = np.zeros(size, dtype=np.float32)
        # Stored as 0/1 integers; cast to bool before using as a mask,
        # otherwise NumPy treats the values as integer positions.
        self.done_buf = np.zeros(size, dtype=np.uint8)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, action, reward, next_obs, done):
        """Write one transition at the cursor and advance it (wrapping)."""
        self.obs_buf[self.ptr] = obs
        self.obs_next_buf[self.ptr] = next_obs
        self.actions_buf[self.ptr] = action
        self.rewards_buf[self.ptr] = reward
        self.done_buf[self.ptr] = done

        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sampling_batch(self, batch_size=32):
        """Sample ``batch_size`` stored transitions uniformly, with replacement.

        Returns a dict with keys ``'states'``, ``'next_states'``,
        ``'actions'``, ``'rewards'`` and ``'done'``; each value is an array
        whose first axis has length ``batch_size``.

        Raises ``ValueError`` (from ``np.random.randint``) if the buffer is
        empty.
        """
        idxs = np.random.randint(0, self.size, size=batch_size)
        return {
            'states': self.obs_buf[idxs],
            'next_states': self.obs_next_buf[idxs],
            'actions': self.actions_buf[idxs],
            'rewards': self.rewards_buf[idxs],
            'done': self.done_buf[idxs]
        }

In [7]:
def nn_model(state_size, action_size, n_hidden_layers=1, hidden_dim=128):
    """Build and compile a fully connected Q-value regression network.

    Parameters
    ----------
    state_size : int
        Length of the input state vector.
    action_size : int
        Number of outputs (one estimated value per action).
    n_hidden_layers : int, default 1
        Number of ReLU hidden layers.
    hidden_dim : int, default 128
        Units in each hidden layer.

    Returns
    -------
    A Keras ``Model`` compiled with MSE loss and the Adam optimizer. The
    layer summary is printed as a side effect.
    """
    i = Input(shape=(state_size,))
    x = i

    for _ in range(n_hidden_layers):
        x = Dense(hidden_dim, activation='relu')(x)

    x = Dense(action_size)(x) # regression, no activation function

    m = Model(i, x)
    m.compile(loss='mse', optimizer='adam')
    # summary() prints by itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    m.summary()
    return m

In [8]:
class Agent(object):
    """Epsilon-greedy Q-learning agent backed by a small dense network.

    Parameters
    ----------
    state_size : int
        Length of one state vector.
    action_size : int
        Number of discrete actions.

    Attributes set here: ``gamma`` (discount factor), ``epsilon``
    (exploration probability, multiplied by ``epsilon_decay`` after each
    training step until it drops to ``epsilon_min``), ``memory`` (replay
    buffer holding the 500 most recent transitions) and ``model``.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(state_size, action_size, size=500)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = nn_model(state_size, action_size, n_hidden_layers=1)

    def update_replay_memory(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def act(self, state):
        """Pick an action: random with probability ``epsilon``, else greedy.

        ``state`` is passed to ``model.predict`` as a batch; the first row
        of the prediction is used for the greedy choice.
        """
        # Uniform draw on [0, 1): the exploration rate then equals epsilon.
        # (A standard-normal draw would explore about half the time even
        # for epsilon close to 0.)
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        return self.model.predict(state)[0].argmax()

    def train(self, batch_size=32):
        """Run one Q-learning update on a random minibatch from the buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. After an update, ``epsilon`` is decayed once.
        """
        if self.memory.size < batch_size:
            return

        minibatch = self.memory.sampling_batch(batch_size)

        states = minibatch['states']
        next_states = minibatch['next_states']
        actions = minibatch['actions']
        rewards = minibatch['rewards']
        # The buffer returns done flags as 0/1 integers. Convert to a real
        # boolean mask: indexing with the raw integer array would select
        # positions 0 and 1 instead of the terminal transitions.
        done = minibatch['done'].astype(bool)

        # Target: r + gamma * max_a' Q(s', a'); at a terminal state the
        # future value is 0, so the target is the reward alone.
        next_q = np.amax(self.model.predict(next_states), axis=1)
        target = np.where(done, rewards, rewards + self.gamma * next_q)

        # Only the output of the action actually taken gets a new target;
        # the other outputs keep their current predictions (zero error).
        target_full = self.model.predict(states)
        target_full[np.arange(batch_size), actions] = target

        self.model.train_on_batch(states, target_full)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load_weights(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save_weights(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

### Model Train & Testing

Train / test split: the first half of the price history is used for training, the second half for testing.

In [9]:
# Chronological split with no shuffling: rows before the midpoint train the
# agent, rows from the midpoint onward are held out for testing.
N = len(data)
train_data = data[:N//2]
test_data = data[N//2:N]

Scale data

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
def get_scaler(env):
    """Fit a ``StandardScaler`` on states visited by a random policy.

    The environment is reset, then stepped with uniformly random actions
    until the episode ends; the scaler is fitted on every state returned by
    ``env.step``.

    Parameters
    ----------
    env : Environment
        Environment to sample from. Its episode state is overwritten.

    Returns
    -------
    StandardScaler
        Scaler fitted on the collected states.
    """
    # Always start from step 0 so the function also works on an environment
    # that has already been stepped (e.g. when the cell is re-run); stepping
    # a finished environment would index past the end of the price history.
    env.reset()

    states = []
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        state, reward, done, info = env.step(action)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

Play one episode

In [12]:
def play_one_episode(agent, env, is_train=False):
    """Run ``env`` from reset to termination under ``agent``'s policy.

    Every observation is passed through the notebook-level ``scaler`` before
    the agent sees it. When ``is_train`` is true, each transition is pushed
    to the agent's replay memory and one training step is run with the
    notebook-level ``batch_size``.

    Returns the portfolio value reported by the final ``env.step`` call.
    """
    current = scaler.transform([env.reset()])
    finished = False

    while not finished:
        chosen = agent.act(current)
        raw_next, reward, finished, info = env.step(chosen)
        following = scaler.transform([raw_next])

        if is_train:
            agent.update_replay_memory(current, chosen, reward, following, finished)
            agent.train(batch_size)

        current = following

    return info['cur_val']

Train Model

In [13]:
# config
# Minibatch size; read as a global by play_one_episode when it calls agent.train.
batch_size = 32
# File the trained network weights are saved to after the training loop.
model_name = 'dqn.h5'
# Number of episodes run by both the training loop and the test loop.
episodes = 2000
# Starting cash handed to each Environment.
initial_investment = 20000

In [14]:
# Build the training environment and fit the state scaler on it.
# `scaler` is read as a global by play_one_episode, so it must exist before
# the loop below runs.
env = Environment(train_data, initial_investment=initial_investment)
scaler = get_scaler(env)
state_size = env.state_dim
action_size = len(env.action_space)
agent = Agent(state_size, action_size)

# Episode-end portfolio value, one entry per training episode.
portfolio_value = []

for e in range(episodes):
    start = datetime.now()
    val = play_one_episode(agent, env, is_train=True)
    diff = datetime.now() - start
    print(f'episode: {e+1}/{episodes}, episode end value: {val:.2f}, duration: {diff}')
    portfolio_value.append(val)

# NOTE(review): these two lines only run if the loop completes; interrupting
# the cell mid-training leaves neither the weights nor the values on disk.
agent.save_weights(model_name)
np.save('train_port_val.npy', portfolio_value)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 7)]               0         
_________________________________________________________________
dense (Dense)                (None, 128)               1024      
_________________________________________________________________
dense_1 (Dense)              (None, 27)                3483      
Total params: 4,507
Trainable params: 4,507
Non-trainable params: 0
_________________________________________________________________
None
episode: 1/2000, episode end value: 24061.90, duration: 0:00:39.526829
episode: 2/2000, episode end value: 32267.87, duration: 0:00:40.329099


KeyboardInterrupt: ignored

Test Model

In [None]:
# Evaluate the trained agent on the held-out half of the price history.
# The scaler fitted on the training environment is reused (play_one_episode
# reads it as a global), and no training steps are run (is_train=False).
env = Environment(test_data, initial_investment=initial_investment)
# Keep a small amount of exploration so the episodes are not all identical.
agent.epsilon = 0.01
portfolio_value_test = []
for e in range(episodes):
    start = datetime.now()
    val = play_one_episode(agent, env, is_train=False)
    end = datetime.now()
    # A timedelta does not accept a float format spec such as ':.2f'
    # (TypeError); print it as-is, matching the training loop's output.
    print(f'episode: {e+1}/{episodes}, episode end value: {val:.2f}, duration: {end-start}')
    portfolio_value_test.append(val)

np.save('test_port_val.npy', portfolio_value_test)

### Model Evaluation

Train

In [None]:
# Distribution of episode-end portfolio values over the training episodes.
plt.figure(figsize=(16, 8))
plt.hist(portfolio_value, bins=100)
# Title and axis labels so the figure can be read on its own.
plt.title('Training: episode-end portfolio value')
plt.xlabel('Portfolio value at end of episode')
plt.ylabel('Number of episodes')
plt.show()

Test

In [None]:
# Distribution of episode-end portfolio values over the test episodes.
plt.figure(figsize=(16, 8))
plt.hist(portfolio_value_test, bins=100)
# Title and axis labels so the figure can be read on its own.
plt.title('Test: episode-end portfolio value')
plt.xlabel('Portfolio value at end of episode')
plt.ylabel('Number of episodes')
plt.show()