# Reinforcement Learning for Stock Market Trader

In [1]:
import math
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import pandas_datareader as data_reader

from tqdm import tqdm_notebook, tqdm
from collections import deque

In [2]:
# Display the installed TensorFlow version (notebook cell output).
tf.__version__

'2.0.0-beta1'

# Building the AI Trader Network

- action_space = 3 : Stay, Buy, Sell
- state 는 지나간 날들과 그 날의 주가 이다.  
- loss 로 mse 를 사용하므로 출력층에는 linear activation 을 사용한다
    - the network outputs one continuous value per action (trained toward reward-based targets), not class probabilities, so no softmax is applied

In [15]:
class AI_Trader:
    """Epsilon-greedy Q-learning agent that chooses between stay, buy and sell.

    The agent owns a small dense network mapping a state vector to one
    value per action, a bounded replay memory of
    ``(state, action, reward, next_state, done)`` tuples, and an inventory
    list the training loop fills with purchase prices.
    """

    def __init__(self, state_size, action_space=3, model_name="AITrader"):
        """Set up hyper-parameters, the replay memory and the network.

        Args:
            state_size: Number of features in one state vector (network input width).
            action_space: Number of discrete actions (3: stay, buy, sell).
            model_name: Label stored on the instance.
        """
        self.state_size = state_size
        self.action_space = action_space
        self.memory = deque(maxlen=2000)   # replay memory; oldest entries are dropped first
        self.inventory = []                # purchase prices of the stocks currently held
        self.model_name = model_name

        self.gamma = 0.95                  # discount factor for the next state's best value
        self.epsilon = 1.0                 # start with a 100% random policy
        self.epsilon_final = 0.01          # exploration stops decaying once at/below this
        self.epsilon_decay = 0.995         # multiplicative decay applied per training call

        self.model = self.model_builder()

    def model_builder(self):
        """Build and compile the dense network (32-64-128-``action_space``).

        The output layer is linear because the network is trained with an
        MSE loss toward continuous target values.
        """
        model = tf.keras.models.Sequential()
        model.add(Dense(units=32, activation='relu', input_dim=self.state_size))
        model.add(Dense(units=64, activation='relu'))
        model.add(Dense(units=128, activation='relu'))
        model.add(Dense(units=self.action_space, activation='linear'))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
        return model

    def trader(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted value.
        """
        if random.random() <= self.epsilon:
            return random.randrange(self.action_space)   # completely random policy

        actions = self.model.predict(state)
        return np.argmax(actions[0])

    def batch_train(self, batch_size):
        """Fit the network on the most recent ``batch_size`` transitions.

        For each transition the target for the taken action is the reward,
        plus (unless the transition is terminal) the discounted maximum
        predicted value of the next state. Epsilon is decayed once per call
        so the exploration schedule does not depend on the batch size.

        Args:
            batch_size: Number of newest memory entries to replay; fewer
                are used when the memory holds less than that.
        """
        # Take exactly the last `batch_size` entries (clamped to what is stored).
        start = max(len(self.memory) - batch_size, 0)
        batch = list(self.memory)[start:]
        if not batch:
            return

        for state, action, reward, next_state, done in batch:
            target_value = reward          # terminal transition: the reward alone
            if not done:
                # amax: largest predicted value among the next state's actions
                target_value = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

            # Only the taken action's output is moved toward the new target.
            target = self.model.predict(state)
            target[0][action] = target_value

            self.model.fit(state, target, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_final:
            self.epsilon *= self.epsilon_decay

## Helper functions

- Sigmoid

    - 주가가 200 에서 1000 으로 뛰는 것과 40 에서 200 으로 뛰는 것을 같은 차이로 scale 하기 위해 사용
    
- stock_price_format
    - 주식을 사고, 팔때의 가격 formatting

In [16]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of the scalar ``x``.

    The two branches are algebraically identical; each only ever
    exponentiates a non-positive number, so large-magnitude inputs
    saturate to 0.0 or 1.0 instead of raising OverflowError.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) could overflow; use exp(x) / (1 + exp(x)) instead.
    z = math.exp(x)
    return z / (1 + z)

In [17]:
def stocks_price_format(n):
    """Format ``n`` as a dollar amount with two decimals.

    Negative values are rendered as "- $ <abs value>", everything else as
    " $ <value>". The format spec is ``.2f`` (two decimal places); the
    previous ``2f`` only set a minimum width and printed six decimals.
    """
    if n < 0:
        return "- $ {0:.2f}".format(abs(n))
    return " $ {0:.2f}".format(abs(n))

### Yahoo Finance 
https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch

In [18]:
# Download the AAPL price table from the "yahoo" data source via pandas-datareader.
# NOTE(review): no start/end dates are passed, so the covered range is whatever
# DataReader defaults to — confirm this is the intended history.
dataset = data_reader.DataReader("AAPL", data_source="yahoo")
# Show the first rows of the downloaded table (notebook display).
dataset.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-12-31,30.478571,30.08,30.447144,30.104286,88102700.0,26.372231
2010-01-04,30.642857,30.34,30.49,30.572857,123432400.0,26.782711
2010-01-05,30.798571,30.464285,30.657143,30.625713,150476200.0,26.82901
2010-01-06,30.747143,30.107143,30.625713,30.138571,138040000.0,26.40226
2010-01-07,30.285715,29.864286,30.25,30.082857,119282800.0,26.35346


In [19]:
# Split the string form of the first index entry on whitespace; the cell output
# shows it separates into a date part and a time part.
str(dataset.index[0]).split()

['2009-12-31', '00:00:00']

## data loading from yahoo

In [20]:
def dataset_loader(stock_name):
    """Download the price table for ``stock_name`` and return its 'Close' column.

    Args:
        stock_name: Ticker symbol passed to pandas-datareader (e.g. "AAPL").

    Returns:
        The 'Close' column of the table returned by ``DataReader`` for the
        "yahoo" data source.
    """
    # The first/last index dates were computed here previously but never used.
    dataset = data_reader.DataReader(stock_name, data_source="yahoo")
    return dataset['Close']

## state 생성자
- timestep : 현재의 time step
- window_size : 이전 며칠치 주가로 예측할지 정하는 time window

### agent 가 어떤 action 을 취하는지와 무관하게 state 는 random 하게 변화

In [21]:
def state_creator(data, timestep=0, window_size=10):
    """Build the state vector for ``timestep`` from the preceding price window.

    The window covers the ``window_size`` prices ending at ``timestep``
    (inclusive). When fewer prices are available, the window is left-padded
    with the first price. The state is the sigmoid of each consecutive
    price difference inside the window.

    Args:
        data: Sequence of prices (list, NumPy array or pandas Series).
        timestep: Index of the current time step.
        window_size: Number of prices in the window.

    Returns:
        NumPy array of shape ``(1, window_size - 1)``.
    """
    starting_id = timestep - window_size + 1      # first index of the window

    # The window is materialised as a plain list so that the integer lookups
    # below are always positional, whatever index labels `data` carries.
    if starting_id >= 0:
        windowed_data = list(data[starting_id:timestep + 1])
    else:
        # Not enough history yet: pad on the left with the very first price.
        first_price = list(data[:1])[0]
        windowed_data = -starting_id * [first_price] + list(data[0:timestep + 1])

    state = [
        sigmoid(windowed_data[i + 1] - windowed_data[i])
        for i in range(window_size - 1)
    ]

    return np.array([state])

## dataset loading

In [22]:
# Ticker whose closing prices are used for training.
stock_name = "AAPL"
# Closing-price series returned by dataset_loader for that ticker.
data = dataset_loader(stock_name)

## Training AI Trader

In [23]:
# Passed to AI_Trader as state_size; the training loop builds states with
# window_size + 1 prices, which yields window_size price differences.
window_size = 10
# Number of passes the training loop makes over the price history.
episodes = 1000
# Number of replay-memory entries handed to batch_train per call.
batch_size = 32
# Time steps per episode; one less than the data length because each step
# also builds the state for t + 1.
data_samples = len(data) - 1

## Model define

In [24]:
# Create the agent; window_size becomes its state_size (network input width).
trader = AI_Trader(window_size)
# Print the layer-by-layer summary of the agent's Keras model.
trader.model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 32)                352       
_________________________________________________________________
dense_5 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_6 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 387       
Total params: 11,171
Trainable params: 11,171
Non-trainable params: 0
_________________________________________________________________


## Training loop

In [None]:
# Plain array of prices: integer lookups are positional regardless of the
# index labels carried by `data`, and the conversion happens only once.
prices = np.asarray(data)

for episode in range(1, episodes + 1):
    print("Episode: {}/{}".format(episode, episodes))

    # Every episode replays the price history from the first sample.
    state = state_creator(prices, 0, window_size + 1)

    total_profit = 0
    trader.inventory = []          # start each episode holding no stock

    for t in tqdm(range(data_samples)):

        action = trader.trader(state)

        next_state = state_creator(prices, t + 1, window_size + 1)
        reward = 0
        price = prices[t]

        if action == 1:   # Buying
            trader.inventory.append(price)
            print("AI trader bought: ", stocks_price_format(price))
        elif action == 2 and len(trader.inventory) > 0:   # Selling the oldest holding
            buy_price = trader.inventory.pop(0)
            profit = price - buy_price
            reward = max(profit, 0)        # losses give zero reward, not a penalty
            total_profit += profit
            print("AI trader sold: ", stocks_price_format(price), "Profit: " + stocks_price_format(profit))

        # The last time step of the episode is the terminal transition.
        done = t == data_samples - 1

        trader.memory.append((state, action, reward, next_state, done))

        state = next_state

        if done:
            print("########################")
            print("TOTAL PROFIT: {}".format(total_profit))
            print("########################")

        if len(trader.memory) > batch_size:
            trader.batch_train(batch_size)

    # Checkpoint once per tenth episode, after the episode has finished
    # (previously this ran on every time step of those episodes).
    if episode % 10 == 0:
        trader.model.save("AI_trader_{}.h5".format(episode))
    