# COMS W4995 Project: Reinforcement Learning Trading

### imports


In [1]:
import gym


In [2]:
def get_data(col='close', tickers=('MSFT', 'IBM', 'QCOM'), data_dir='./data'):
    """Load one price column per ticker and stack them into a 2-D array.

    Each ticker is read from ``<data_dir>/daily_<TICKER>.csv``. With the
    default arguments this reads the MSFT, IBM and QCOM files and returns a
    3 x n_step array, exactly as before.

    Args:
        col: name of the CSV column to load (e.g. 'close').
        tickers: ticker symbols to load; one output row per ticker, in order.
        data_dir: directory that contains the ``daily_<TICKER>.csv`` files.

    Returns:
        A numpy array of shape (len(tickers), n_step) whose rows are in the
        reverse of the file's row order.

    Raises:
        ValueError: if the files do not all contain the same number of rows,
            because the series could not form a rectangular array.
    """
    series = []
    for ticker in tickers:
        path = '{}/daily_{}.csv'.format(data_dir, ticker)
        frame = pd.read_csv(path, usecols=[col])
        # the most recent prices are at the top of the file; reverse them
        series.append(frame[col].values[::-1])

    lengths = [len(values) for values in series]
    if len(set(lengths)) > 1:
        raise ValueError(
            'price series have different lengths: {}'.format(
                dict(zip(tickers, lengths))))

    return np.array(series)

## Create environment

In [3]:
import itertools

class EquityEnv(gym.Env):
    """
    A multi-stock equity trading environment (3 stocks in this notebook).

    State: [# of stock owned, current stock prices, cash in hand]
    - list of length n_stock * 2 + 1
    - prices are rounded to the nearest integer to reduce the state space
    - cash in hand is re-evaluated at each step based on the action performed

    Action: an integer in [0, 3 ** n_stock) that indexes the list of all
    per-stock combinations of sell (0), hold (1) and buy (2), in
    itertools.product order
    - when selling, sell all the shares
    - when buying, buy one share of every selected stock per round, and stop
      after the first round in which some selected stock is unaffordable
    - sells are executed before buys
    """

    def __init__(self, train_data, initial_invest):
        """Create the environment.

        Args:
            train_data: 2-D array of prices, one row per stock and one
                column per time step.
            initial_invest: cash available at the start of every episode.
        """
        # np.around rounds to the nearest integer (not "up")
        self.stock_price_history = np.around(train_data)
        self.n_stock, self.n_step = self.stock_price_history.shape

        self.init_invest = initial_invest

        # all combos to sell(0), hold(1), or buy(2) each stock; built once
        # here instead of on every call to _trade
        self._action_list = list(
            itertools.product([0, 1, 2], repeat=self.n_stock))
        self.action_space = gym.spaces.Discrete(3 ** self.n_stock)

        # initialise cur_step, stock_owned, stock_price and cash_in_hand
        self.reset()

    def reset(self):
        """Rewind to the first time step and return the initial observation."""
        self.cur_step = 0
        self.stock_owned = [0] * self.n_stock
        self.stock_price = self.stock_price_history[:, self.cur_step]

        self.cash_in_hand = self.init_invest

        return self._get_observation()

    def step(self, action):
        """Advance one time step, trading at the new prices.

        Args:
            action: integer index into the sell/hold/buy combinations.

        Returns:
            (next_state, reward, done, info) where reward is the change in
            portfolio value caused by the price move, done is True once the
            last column of the price history has been reached, and info
            holds the current portfolio value under the key 'cur_val'.

        Raises:
            IndexError: if called again after the last time step without a
                reset in between.
        """
        previous_portfolio_value = self._get_portfolio_value()

        self.cur_step += 1
        self.stock_price = self.stock_price_history[:, self.cur_step]  # update price

        self._trade(action)

        next_state = self._get_observation()

        current_portfolio_value = self._get_portfolio_value()
        reward = current_portfolio_value - previous_portfolio_value

        done = self.cur_step == self.n_step - 1

        info = {'cur_val': current_portfolio_value}

        return next_state, reward, done, info

    def _get_observation(self):
        """Return [shares owned..., current prices..., cash in hand] as a list."""
        return [*self.stock_owned, *self.stock_price, self.cash_in_hand]

    def _get_portfolio_value(self):
        """Return the market value of all held shares plus cash in hand."""
        return np.dot(self.stock_owned, self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        """Apply the sell/hold/buy decision encoded by `action` at current prices."""
        action_vec = self._action_list[action]

        sell_index = [i for i, a in enumerate(action_vec) if a == 0]
        # A stock whose (rounded) price is not positive never reduces the
        # cash, so buying it would never terminate; leave it out.
        buy_index = [i for i, a in enumerate(action_vec)
                     if a == 2 and self.stock_price[i] > 0]

        # two passes: sell first, then buy; might be naive in real-world settings
        for i in sell_index:
            self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
            self.stock_owned[i] = 0

        can_buy = bool(buy_index)
        while can_buy:
            for i in buy_index:
                # >= so that cash exactly equal to the price still buys a share
                if self.cash_in_hand >= self.stock_price[i]:
                    self.stock_owned[i] += 1  # buy one share
                    self.cash_in_hand -= self.stock_price[i]
                else:
                    # finish this round for the other stocks, then stop
                    can_buy = False



In [4]:
import numpy as np
import pandas as pd




## Load data

In [5]:
# Load the default price column for every stock and round each price to the
# nearest integer.
data = np.around(get_data())

# Split along the time axis: the first 3526 steps are used for training and
# the remaining steps are held out for testing.
train_data, test_data = data[:, :3526], data[:, 3526:]

print(data.shape[1])  # total number of time steps
print(train_data)

4526
[[117. 113. 114. ...  36.  36.  36.]
 [116. 112. 116. ... 186. 190. 188.]
 [179. 162. 158. ...  73.  73.  74.]]


In [6]:
# Starting cash for every episode.
initial_invest = 10000

# Build the trading environment on the training split only.
env = EquityEnv(train_data=train_data, initial_invest=initial_invest)

# Sanity check: the environment keeps its own copy of the rounded prices.
print(env.stock_price_history)


[[117. 113. 114. ...  36.  36.  36.]
 [116. 112. 116. ... 186. 190. 188.]
 [179. 162. 158. ...  73.  73.  74.]]


## Run Model


In [7]:
# Baseline: run episodes with uniformly sampled actions and record the
# portfolio value at the end of each episode.
portfolio_value = []

total_episode = 20

for i_episode in range(total_episode):
    observation = env.reset()
    for t in range(env.n_step):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)

        # keep stepping until the environment signals the end of the episode
        if not done:
            continue

        print(info)
        print("Episode finished after {} timesteps".format(t+1))
        print("episode: {}/{}, episode end value: {}".format(i_episode, total_episode, info['cur_val']))
        portfolio_value.append(info['cur_val'])  # append episode end portfolio value
        break

env.close()

print(portfolio_value)

{'cur_val': 6764.0}
Episode finished after 3525 timesteps
episode: 0/20, episode end value: 6764.0
{'cur_val': 4234.0}
Episode finished after 3525 timesteps
episode: 1/20, episode end value: 4234.0
{'cur_val': 4436.0}
Episode finished after 3525 timesteps
episode: 2/20, episode end value: 4436.0
{'cur_val': 2059.0}
Episode finished after 3525 timesteps
episode: 3/20, episode end value: 2059.0
{'cur_val': 23203.0}
Episode finished after 3525 timesteps
episode: 4/20, episode end value: 23203.0
{'cur_val': 19395.0}
Episode finished after 3525 timesteps
episode: 5/20, episode end value: 19395.0
{'cur_val': 3572.0}
Episode finished after 3525 timesteps
episode: 6/20, episode end value: 3572.0
{'cur_val': 2853.0}
Episode finished after 3525 timesteps
episode: 7/20, episode end value: 2853.0
{'cur_val': 5003.0}
Episode finished after 3525 timesteps
episode: 8/20, episode end value: 5003.0
{'cur_val': 16540.0}
Episode finished after 3525 timesteps
episode: 9/20, episode end value: 16540.0
{'cu

## Run