In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import StandardScaler

In [6]:
# Base names of the price files to process; get_data() appends '.csv' to each one,
# and the train/test cells below run run_rl_trader() once per entry.
data_names = ['NAS1001440_Close_R']

In [7]:
def get_data(data_name):
    """
    Load the closing prices from '<data_name>.csv'.

    The file is read without a header row; its columns are named
    Date, Time, Open, High, Low, Close, Volume in that order and only
    the Close column is kept.

    Returns a 2-D array with one row per CSV line and a single column.
    """
    column_names = ['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']
    frame = pd.read_csv(data_name + '.csv', names=column_names)
    close_only = frame[['Close']]
    return close_only.values

In [8]:
def get_scaler(env):
    """
    Fit a StandardScaler on states collected by acting randomly in `env`.

    The environment is stepped (it is not reset here) with actions drawn
    uniformly from `env.action_space` for at most `env.n_step` steps, stopping
    early as soon as a step reports `done`. Every returned state is recorded
    and the scaler is fitted on that collection.

    Returns the fitted StandardScaler.
    """
    visited_states = []
    for _ in range(env.n_step):
        random_action = np.random.choice(env.action_space)
        observation, _reward, finished, _info = env.step(random_action)
        visited_states.append(observation)
        if finished:
            break

    # StandardScaler.fit returns the scaler itself
    return StandardScaler().fit(visited_states)

In [9]:
def maybe_make_dir(directory):
    """
    Create `directory` (including missing parents) if it does not exist yet.

    Uses `exist_ok=True` instead of a separate existence check, so there is no
    window between the check and the creation in which another process could
    create the directory and make this call fail.

    Raises:
        FileExistsError: if `directory` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

### Model

In [10]:
class LinearModel:
    """
    Linear regression model (X.dot(W) + b) trained with momentum SGD on MSE.

    Attributes:
        W: weight matrix of shape (input_dim, n_action).
        b: bias vector of shape (n_action,).
        vW, vb: momentum (velocity) terms for W and b.
        losses: MSE recorded by every call to sgd(), in call order.
    """

    def __init__(self, input_dim, n_action):
        # random weights scaled by 1/sqrt(input_dim), zero bias
        self.W = np.random.randn(input_dim, n_action) / np.sqrt(input_dim)
        self.b = np.zeros(n_action)

        # Momentum terms
        self.vW = 0
        self.vb = 0

        self.losses = []

    def predict(self, X):
        """Return X.dot(W) + b for a 2-D input X (N x D)."""
        assert(len(X.shape) == 2) # make sure X is N x D
        return X.dot(self.W) + self.b

    def sgd(self, X, Y, lr=0.01, m=0.9): # x, y, learning rate, momentum
        """
        Take one momentum-SGD step on the mean squared error between
        predict(X) and Y, and append the pre-update MSE to `self.losses`.
        """
        assert(len(X.shape) == 2)

        # the loss averages over every element of Y, so the gradient does too
        num_values = np.prod(Y.shape)

        Yhat = self.predict(X)
        error = Yhat - Y
        gW = 2 * X.T.dot(error) / num_values
        gb = 2 * error.sum(axis=0) / num_values

        self.vW = m * self.vW - lr * gW
        self.vb = m * self.vb - lr * gb

        self.W += self.vW
        self.b += self.vb

        mse = np.mean(error ** 2)
        self.losses.append(mse)

    def load_weights(self, filepath):
        """
        Load W and b from an .npz archive written by save_weights().

        np.savez appends '.npz' to a file name that lacks it, so when
        `filepath` is a string without that suffix and no such file exists,
        the suffixed name is tried; this makes save_weights(p) followed by
        load_weights(p) work for any string path p.
        """
        if (isinstance(filepath, str)
                and not filepath.endswith('.npz')
                and not os.path.exists(filepath)):
            filepath = filepath + '.npz'

        # the context manager closes the underlying file handle after reading
        with np.load(filepath) as npz:
            self.W = npz['W']
            self.b = npz['b']

    def save_weights(self, filepath):
        """Write W and b to an .npz archive (np.savez adds '.npz' if missing)."""
        np.savez(filepath, W=self.W, b=self.b)

In [11]:
def MLP(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
    """
    Build and compile a multi-layer perceptron.

    Args:
        input_dim: size of the input vector.
        n_action: number of output units (no activation on the output layer).
        n_hidden_layers: number of ReLU hidden layers.
        hidden_dim: units per hidden layer.

    Returns:
        A Keras Model compiled with MSE loss and the 'adam' optimizer.
    """

    # input layer
    i = Input(shape=(input_dim,))
    x = i

    # hidden layers
    for _ in range(n_hidden_layers):
        x = Dense(hidden_dim, activation='relu')(x)

    # final layer
    x = Dense(n_action)(x)

    # make the model
    model = Model(i, x)

    model.compile(loss='mse', optimizer='adam')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line after the table.
    model.summary()
    return model

### Environment

In [12]:
class MultiStockEnv:
    """
    Trading environment over a table of prices (one row per time step, one
    column per stock).

    State: shares owned per stock, current price per stock, cash in hand
    (length 2 * n_stock + 1).
    Action: an index into `action_list`; each entry assigns every stock one of
    0 = sell, 1 = hold, 2 = buy.
    Reward: change in portfolio value (shares * price + cash) over the step.
    """

    def __init__(self, data, initial_investment=20000):
        self.stock_price_history = data
        self.n_step, self.n_stock = self.stock_price_history.shape

        self.initial_investment = initial_investment

        # per-episode state, populated by reset()
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        # one action id for every combination of per-stock choices
        self.action_space = np.arange(3 ** self.n_stock)
        per_stock_choices = [0, 1, 2]  # sell, hold, buy
        self.action_list = [
            list(combo)
            for combo in itertools.product(per_stock_choices, repeat=self.n_stock)
        ]

        self.state_dim = self.n_stock * 2 + 1

        self.reset()

    def reset(self):
        """Go back to the first time step with no shares and the initial cash; return the state."""
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history[self.cur_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action):
        """
        Advance one time step, then trade at the new prices.

        Returns (state, reward, done, info) where `done` is True on the last
        row of the price table and `info['cur_val']` is the portfolio value.
        """
        assert(action in self.action_space)

        value_before = self._get_val()

        # move to the next row first, so the trade happens at the new prices
        self.cur_step += 1
        self.stock_price = self.stock_price_history[self.cur_step]
        self._trade(action)

        value_after = self._get_val()
        return (
            self._get_obs(),
            value_after - value_before,
            self.cur_step == self.n_step - 1,
            {'cur_val': value_after},
        )

    def _get_obs(self):
        """Pack shares owned, current prices and cash into one vector."""
        n = self.n_stock
        obs = np.empty(self.state_dim)
        obs[:n] = self.stock_owned
        obs[n:2 * n] = self.stock_price
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        """Portfolio value: shares owned times current prices, plus cash."""
        holdings_value = self.stock_owned.dot(self.stock_price)
        return holdings_value + self.cash_in_hand

    def _trade(self, action):
        """Apply the per-stock sell/hold/buy choices encoded by `action`."""
        choices = self.action_list[action]
        sell_index = [i for i, choice in enumerate(choices) if choice == 0]
        buy_index = [i for i, choice in enumerate(choices) if choice == 2]

        # selling liquidates the whole position in that stock
        for i in sell_index:
            self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
            self.stock_owned[i] = 0

        if not buy_index:
            return

        # buy one share of each selected stock per round; the round in which
        # any purchase is unaffordable is the last one
        can_buy = True
        while can_buy:
            for i in buy_index:
                price = self.stock_price[i]
                if self.cash_in_hand > price:
                    self.stock_owned[i] += 1
                    self.cash_in_hand -= price
                else:
                    can_buy = False

In [13]:
class BinaryTradeEnv:
    """
    Binary-bet trading environment driven by per-step price differences
    (one row per time step, one column per instrument).

    On every step the bets opened on the previous step are settled against the
    current difference of the same instrument, then new bets are opened
    according to the action:
      * a buy bet pays stake * (1 + return_ratio) when the difference is > 0,
      * a sell bet pays stake * (1 + return_ratio) when the difference is < 0,
      * both stakes are refunded when the difference is exactly 0,
      * otherwise the stake is lost.

    State: open buy stake per instrument, open sell stake per instrument,
    current difference per instrument, cash in hand (3 * n_stock + 1 values).
    Action: an index into `action_list`; each entry assigns every instrument
    one of 0 = sell, 1 = hold, 2 = buy.
    """

    def __init__(self, data, initial_investment=10000):
        self.stock_price_history_diff = data
        self.n_step, self.n_stock = self.stock_price_history_diff.shape

        self.initial_investment = initial_investment
        self.cur_step = None
        self.stock_opened_buy = None
        self.stock_opened_sell = None
        self.stock_price_diff = None
        self.cash_in_hand = None

        # stake per bet is risk_ratio (0.5%) of the portfolio value
        self.risk_ratio = 0.005
        self.risk_to_trade = self.risk_ratio * self.initial_investment
        # profit paid on a winning bet, as a fraction of the stake
        self.return_ratio = 0.9

        self.action_space = np.arange(3 ** self.n_stock)

        # 0 = sell
        # 1 = hold
        # 2 = buy
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

        self.state_dim = self.n_stock * 3 + 1

        self.reset()

    def reset(self):
        """Start a new episode: first time step, no open bets, initial cash and stake size."""
        self.cur_step = 0
        self.stock_opened_buy = np.zeros(self.n_stock)
        self.stock_opened_sell = np.zeros(self.n_stock)
        self.stock_price_diff = self.stock_price_history_diff[self.cur_step]
        self.cash_in_hand = self.initial_investment
        # _trade() only ever raises the stake size; restore it here so that
        # one episode's growth does not carry over into the next episode.
        self.risk_to_trade = self.risk_ratio * self.initial_investment
        return self._get_obs()

    def step(self, action):
        """
        Advance one time step, settle the previous bets and open new ones.

        Returns (state, reward, done, info): `reward` is the change in
        portfolio value, `done` is True on the last row of the data and
        `info['cur_val']` is the portfolio value after the step.
        """
        assert(action in self.action_space)

        prev_val = self._get_val()

        self.cur_step += 1
        self.stock_price_diff = self.stock_price_history_diff[self.cur_step]

        self._trade(action)

        cur_val = self._get_val()
        reward = cur_val - prev_val
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': cur_val}

        return self._get_obs(), reward, done, info

    def _get_obs(self):
        """Pack open buy stakes, open sell stakes, current differences and cash into one vector."""
        obs = np.empty(self.state_dim)
        obs[:self.n_stock] = self.stock_opened_buy
        obs[self.n_stock:2*self.n_stock] = self.stock_opened_sell
        obs[2*self.n_stock:3*self.n_stock] = self.stock_price_diff
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        """Portfolio value: cash plus every stake currently at risk."""
        total_opened_size = self.stock_opened_buy.sum() + self.stock_opened_sell.sum()
        return self.cash_in_hand + total_opened_size

    def _trade(self, action):
        """Settle the bets from the previous step, then open the bets encoded by `action`."""
        # 0 = sell
        # 1 = hold
        # 2 = buy

        # close prev position: each instrument is settled against ITS OWN
        # difference (comparing the whole array raised ValueError for
        # n_stock > 1 and ignored which instrument was being settled)
        for i in range(self.n_stock):
            move = self.stock_price_diff[i]
            if move > 0:
                self.cash_in_hand += (1+self.return_ratio) * self.stock_opened_buy[i]
            elif move < 0:
                self.cash_in_hand += (1+self.return_ratio) * self.stock_opened_sell[i]
            else:
                self.cash_in_hand += self.stock_opened_buy[i] + self.stock_opened_sell[i]
            self.stock_opened_buy[i] = 0
            self.stock_opened_sell[i] = 0

        # stake size follows the portfolio upwards, and is forced to 1 when
        # risk_ratio of the portfolio drops below 1
        open_size = self.risk_ratio * self._get_val()
        if open_size > self.risk_to_trade:
            self.risk_to_trade = open_size
        elif open_size < 1:
            self.risk_to_trade = 1

        action_vec = self.action_list[action]

        sell_index = [] # stock's index to sell
        buy_index = [] # stock's index to buy

        for i, a in enumerate(action_vec):
            if a == 0:
                sell_index.append(i)
            elif a == 2:
                buy_index.append(i)

        # once cash falls below 1 no further bet is opened in this step
        can_open = True

        for i in sell_index:
            if self.cash_in_hand < 1:
                can_open = False

            if can_open:
                self.cash_in_hand -= self.risk_to_trade
                self.stock_opened_sell[i] = self.risk_to_trade

        for i in buy_index:
            if self.cash_in_hand < 1:
                can_open = False

            if can_open:
                self.cash_in_hand -= self.risk_to_trade
                self.stock_opened_buy[i] = self.risk_to_trade

In [14]:
class ForexTradeEnv:
    """
    Position-based trading environment driven by per-step price differences
    (one row per time step, one column per instrument).

    State: open buy units per instrument, open sell units per instrument,
    current difference per instrument, cash in hand (3 * n_stock + 1 values).
    Action: an index into `action_list`; each entry assigns every instrument
    one of 0 = sell, 1 = hold, 2 = buy, 3 = close.

    Opening a position does not deduct cash. Every open buy unit is valued at
    +difference * lot * 10 and every open sell unit at -difference * lot * 10;
    closing moves that amount into cash and clears the position.
    """

    def __init__(self, data, initial_investment=20000):
        self.stock_price_history_diff = data
        self.n_step, self.n_stock = self.stock_price_history_diff.shape

        self.initial_investment = initial_investment

        # per-episode state, populated by reset()
        self.cur_step = None
        self.stock_opened_buy = None
        self.stock_opened_sell = None
        self.stock_price_diff = None
        self.cash_in_hand = None

        self.lot = 0.01

        # one action id for every combination of the 4 per-instrument choices
        self.action_space = np.arange(4 ** self.n_stock)
        per_instrument_choices = [0, 1, 2, 3]  # sell, hold, buy, close
        self.action_list = [
            list(combo)
            for combo in itertools.product(per_instrument_choices, repeat=self.n_stock)
        ]

        self.state_dim = self.n_stock * 3 + 1

        self.reset()

    def reset(self):
        """Go back to the first time step with no open positions and the initial cash; return the state."""
        self.cur_step = 0
        self.stock_opened_buy = np.zeros(self.n_stock)
        self.stock_opened_sell = np.zeros(self.n_stock)
        self.stock_price_diff = self.stock_price_history_diff[self.cur_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action):
        """
        Advance one time step, then apply the action at the new differences.

        Returns (state, reward, done, info) where `reward` is the change in
        portfolio value, `done` is True on the last row of the data and
        `info['cur_val']` is the portfolio value after the step.
        """
        assert(action in self.action_space)

        value_before = self._get_val()

        self.cur_step += 1
        self.stock_price_diff = self.stock_price_history_diff[self.cur_step]
        self._trade(action)

        value_after = self._get_val()
        return (
            self._get_obs(),
            value_after - value_before,
            self.cur_step == self.n_step - 1,
            {'cur_val': value_after},
        )

    def _get_obs(self):
        """Pack open buy units, open sell units, current differences and cash into one vector."""
        n = self.n_stock
        obs = np.empty(self.state_dim)
        obs[:n] = self.stock_opened_buy
        obs[n:2 * n] = self.stock_opened_sell
        obs[2 * n:3 * n] = self.stock_price_diff
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        """Portfolio value: valuation of all open positions plus cash."""
        total_opened = 0
        positions = zip(self.stock_opened_buy, self.stock_opened_sell, self.stock_price_diff)
        for buy_units, sell_units, diff in positions:
            total_opened += buy_units * diff * self.lot * 10
            total_opened += (-sell_units) * diff * self.lot * 10

        return total_opened + self.cash_in_hand

    def _trade(self, action):
        """Apply the per-instrument sell/hold/buy/close choices encoded by `action`."""
        choices = self.action_list[action]
        sell_index = [i for i, choice in enumerate(choices) if choice == 0]
        buy_index = [i for i, choice in enumerate(choices) if choice == 2]
        close_index = [i for i, choice in enumerate(choices) if choice == 3]

        # closing realises both sides of the position into cash
        for i in close_index:
            self.cash_in_hand += self.stock_opened_buy[i] * self.stock_price_diff[i] * self.lot * 10
            self.cash_in_hand += (-self.stock_opened_sell[i]) * self.stock_price_diff[i] * self.lot * 10
            self.stock_opened_buy[i] = 0
            self.stock_opened_sell[i] = 0

        # minimum balance to trade
        can_open = not (self.cash_in_hand < 50)

        # open sell positions
        for i in sell_index:
            if can_open:
                self.stock_opened_sell[i] += 1
            if self.cash_in_hand < 50:
                can_open = False

        # open buy positions
        for i in buy_index:
            if can_open:
                self.stock_opened_buy[i] += 1
            if self.cash_in_hand < 50:
                can_open = False

### Agent

In [15]:
class DQNAgent(object):
    """
    Epsilon-greedy Q-learning agent that trains on one transition at a time.

    Attributes:
        gamma: discount factor applied to the best next-state Q-value.
        epsilon: current probability of taking a random action; multiplied by
            `epsilon_decay` after every train() call while above `epsilon_min`.
        model: Q-value network built by MLP(state_size, action_size).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = MLP(state_size, action_size)

    def act(self, state):
        """
        Pick an action for `state` (a 1 x state_size batch).

        With probability `epsilon` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value.
        """
        # rand() is uniform on [0, 1), so this branch is taken with probability
        # epsilon. The previous randn() draw is standard normal: it is below a
        # small epsilon about half of the time, so exploration never decayed.
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def train(self, state, action, reward, next_state, done):
        """
        Fit the model on a single transition.

        The target for `action` is `reward` when the episode is over, else
        `reward + gamma * max_a Q(next_state, a)`; the targets of all other
        actions are the model's current predictions, so only the taken action
        contributes to the loss. Decays epsilon afterwards.
        """
        if done:
            target = reward
        else:
            next_q = self.model.predict(next_state)
            # [0] extracts the scalar for the single row in the batch, so a
            # scalar (not a length-1 array) is written into the target below
            target = reward + self.gamma * np.amax(next_q, axis=1)[0]

        target_full = self.model.predict(state)
        target_full[0, action] = target

        self.model.train_on_batch(state, target_full)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load model weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file `name`."""
        self.model.save_weights(name)

In [16]:
def play_one_episode(agent, env, is_train, scaler):
    """
    Run one full episode of `agent` in `env` and return the final portfolio value.

    Every state is passed through `scaler.transform` (as a one-row batch)
    before the agent sees it. When `is_train` equals the string 'train', the
    agent is trained on every transition.

    Returns `info['cur_val']` from the last step of the episode.
    """
    training = is_train == 'train'
    state = scaler.transform([env.reset()])

    # at least one step is always taken; the loop ends on the step reporting done
    while True:
        action = agent.act(state)
        raw_next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([raw_next_state])

        if training:
            agent.train(state, action, reward, next_state, done)

        state = next_state
        if done:
            break

    return info['cur_val']

### Run

In [17]:
def run_rl_trader(data_name, mode, n_percent_train, initial_investment=20000, num_episodes=2000, batch_size=32):
    """
    Train or evaluate a DQNAgent on ForexTradeEnv for one data file.

    The first `n_percent_train` percent of the rows of '<data_name>.csv' form
    the training split and the remaining rows the test split.

    * mode == 'train': fit a scaler on random play over the training split,
      train for `num_episodes` episodes, then save the model weights and the
      scaler under 'linear_rl_trader_models'.
    * mode == 'test': load the saved weights and scaler, set epsilon to 0.01
      and play `num_episodes` episodes on the test split without training.

    The end-of-episode portfolio values are saved as a .npy file under
    'linear_rl_trader_rewards' in either mode.

    Args:
        data_name: base name of the CSV file (without '.csv').
        mode: 'train' or 'test'.
        n_percent_train: percentage of rows used for training.
        initial_investment: starting cash of the environment.
        num_episodes: number of episodes to play.
        batch_size: unused; kept so existing callers keep working.

    Returns:
        (avg_loss, rewards, prob_loss, prob_profit) where `rewards` is the
        array of end-of-episode portfolio values, `prob_loss` is the fraction
        of episodes ending at or below `initial_investment` and `prob_profit`
        the fraction ending above it (both NaN when no episode was played).
    """
    models_folder = 'linear_rl_trader_models'
    rewards_folder = 'linear_rl_trader_rewards'

    maybe_make_dir(models_folder)
    maybe_make_dir(rewards_folder)

    weights_path = f'{models_folder}/linear-{data_name}-{n_percent_train}.h5'
    scaler_path = f'{models_folder}/scaler-{data_name}-{n_percent_train}.pkl'
    rewards_path = f'{rewards_folder}/{mode}-{data_name}-{n_percent_train}.npy'

    data = get_data(data_name)
    n_timesteps = data.shape[0]

    n_train = n_timesteps * n_percent_train // 100

    train_data = data[:n_train]
    test_data = data[n_train:]

    env = ForexTradeEnv(train_data, initial_investment)
    state_size = env.state_dim
    action_size = len(env.action_space)
    agent = DQNAgent(state_size, action_size)

    if mode == 'test':
        # NOTE: pickle.load can execute arbitrary code from the file; only load
        # scaler files that were written by a training run of this notebook.
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)

        env = ForexTradeEnv(test_data, initial_investment)

        agent.epsilon = 0.01

        agent.load(weights_path)
    else:
        # fitting the scaler plays a random episode; skip that work in test
        # mode, where the scaler saved by the training run is used instead
        scaler = get_scaler(env)

    portfolio_value = []

    for e in range(num_episodes):
        t0 = datetime.now()
        val = play_one_episode(agent, env, mode, scaler)
        dt = datetime.now() - t0
        if e % 100 == 0:
            print(f"{data_name} episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
        portfolio_value.append(val)

    if mode == 'train':
        agent.save(weights_path)
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    # Build the array once, save it and return it; the previous version read
    # the file back from a hard-coded folder name instead of rewards_folder.
    rewards = np.array(portfolio_value)
    np.save(rewards_path, rewards)

    num_loss = sum(1 for v in portfolio_value if v <= initial_investment)
    num_profit = sum(1 for v in portfolio_value if v > initial_investment)
    num_counted = num_loss + num_profit

    if num_counted:
        prob_loss = num_loss / num_counted
        prob_profit = num_profit / num_counted
    else:
        # no episode was played (or none produced a comparable value)
        prob_loss = prob_profit = float('nan')

    # NOTE(review): `losses` is the per-step MSE list on LinearModel, but the
    # agent here wraps the Keras model from MLP(), whose `losses` attribute is
    # presumably empty, making this NaN. TODO: record the value returned by
    # train_on_batch if an average training loss is wanted.
    losses = getattr(agent.model, 'losses', [])
    avg_loss = np.mean(losses) if len(losses) else float('nan')

    return avg_loss, rewards, prob_loss, prob_profit

In [18]:
# Percentage of each data file's rows (taken from the start) that run_rl_trader()
# uses as the training split; the remaining rows form the test split.
n_percent_train = 50

In [19]:
# train: run a training session per data file and collect a summary of each
train_results = []
for data_name in data_names:
    mode = 'train'

    avg_loss, reward, prob_loss, prob_profit = run_rl_trader(data_name, mode, n_percent_train)

    # `reward` holds the end-of-episode portfolio values of every episode
    result = dict(
        data_name=data_name,
        average_reward=f'{reward.mean():.2f}, min: {reward.min():.2f}, max: {reward.max():.2f}',
        prob_loss=f'{prob_loss:.2f}',
        prob_profit=f'{prob_profit:.2f}',
        reward=reward,
    )

    train_results.append(result)

FileNotFoundError: ignored

In [None]:
# Print every summary field of each training run; the raw episode values
# under 'reward' are shown as a histogram instead of being printed.
for result in train_results:
    for k, v in result.items():
        if k != 'reward':
            print(f'{k}: {v}')
            continue

        plt.figure(figsize=(16, 8))
        plt.hist(v, bins=200)
        plt.show()

In [None]:
# test: evaluate the saved model per data file and collect a summary of each
test_results = []
for data_name in data_names:
    mode = 'test'

    avg_loss, reward, prob_loss, prob_profit = run_rl_trader(data_name, mode, n_percent_train)

    # `reward` holds the end-of-episode portfolio values of every episode
    result = dict(
        data_name=data_name,
        average_reward=f'{reward.mean():.2f}, min: {reward.min():.2f}, max: {reward.max():.2f}',
        prob_loss=f'{prob_loss:.2f}',
        prob_profit=f'{prob_profit:.2f}',
        reward=reward,
    )

    test_results.append(result)

In [None]:
# Print every summary field of each test run; the raw episode values
# under 'reward' are shown as a histogram instead of being printed.
for result in test_results:
    for k, v in result.items():
        if k != 'reward':
            print(f'{k}: {v}')
            continue

        plt.figure(figsize=(16, 8))
        plt.hist(v, bins=200)
        plt.show()

In [22]:
# Two agents with 3 actions each, for state sizes 7 and 9 (built in this order).
# NOTE(review): looks like scratch code for the dict experiment in the next
# cell rather than part of the training pipeline; confirm it is still needed.
agent, agentb = DQNAgent(7, 3), DQNAgent(9, 3)

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 7)]               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                256       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 99        
Total params: 355
Trainable params: 355
Non-trainable params: 0
_________________________________________________________________
None
Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                320       
_____________________________

In [23]:
# Nested lookup table: both agents stored under the 'q' key as 'a' and 'b'.
x = {'q': {'a': agent, 'b': agentb}}