<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 02 &mdash; Deep Q-Learning**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## CartPole

### The Game Environment 

In [1]:
import gymnasium as gym

In [2]:
env = gym.make('CartPole-v1')

In [3]:
env.action_space

Discrete(2)

In [4]:
env.action_space.n

2

In [5]:
[env.action_space.sample() for _ in range(10)]

[0, 1, 0, 0, 1, 0, 0, 0, 0, 1]

In [6]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [7]:
env.observation_space.shape

(4,)

In [8]:
env.reset(seed=100)
# cart position, cart velocity, pole angle, pole angular velocity

(array([ 0.03349816,  0.0096554 , -0.02111368, -0.04570484], dtype=float32),
 {})

In [9]:
env.step(0)

(array([ 0.03369127, -0.18515752, -0.02202777,  0.24024247], dtype=float32),
 1.0,
 False,
 False,
 {})

In [10]:
env.step(1)

(array([ 0.02998812,  0.01027205, -0.01722292, -0.05930644], dtype=float32),
 1.0,
 False,
 False,
 {})

In [11]:
class RandomAgent:
    def __init__(self):
        self.env = gym.make('CartPole-v1')
    def play(self, episodes=1):
        self.trewards = list()
        for e in range(episodes):
            self.env.reset()
            for step in range(1, 100):
                a = self.env.action_space.sample()
                state, reward, done, trunc, info = self.env.step(a)
                if done:
                    self.trewards.append(step)
                    break

In [12]:
ra = RandomAgent()

In [13]:
ra.play(15)

In [14]:
ra.trewards

[19, 17, 10, 13, 13, 12, 35, 21, 17, 26, 16, 49, 20, 19, 26]

In [15]:
round(sum(ra.trewards) / len(ra.trewards), 2)

20.87

In [16]:
import os
import random
import warnings
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque
from keras.layers import Dense
from keras.models import Sequential

In [17]:
warnings.simplefilter('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = '0'

In [18]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [19]:
opt = keras.optimizers.legacy.Adam(learning_rate=0.0001)

In [20]:
random.seed(100)
tf.random.set_seed(100)

In [21]:
class DQLAgent:
    def __init__(self):
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        self.gamma = 0.9
        self.trewards = list()
        self.max_treward = 0
        self._create_model()
        self.env = gym.make('CartPole-v1')
    def _create_model(self):
        self.model = Sequential()
        self.model.add(Dense(24, activation='relu', input_dim=4))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer=opt)

In [22]:
class DQLAgent(DQLAgent):
    def act(self, state):
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])
    def replay(self):
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=2, verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [23]:
class DQLAgent(DQLAgent):
    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = np.reshape(state, [1, 4])
            for f in range(1, 5000):
                action = self.act(state)
                next_state, reward, done, trunc, _ = \
                    self.env.step(action)
                next_state = np.reshape(next_state, [1, 4])
                self.memory.append(
                    [state, action, next_state, reward, done])
                state = next_state
                if done or trunc:
                    self.trewards.append(f)
                    self.max_treward = max(self.max_treward, f)
                    templ = f'episode={e:4d} | treward={f:4d}'
                    templ += f' | max={self.max_treward:4d}'
                    print(templ, end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

In [24]:
class DQLAgent(DQLAgent):
    def test(self, episodes):
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = np.reshape(state, [1, 4])
            for f in range(1, 5001):
                action = np.argmax(self.model.predict(state)[0])
                state, reward, done, trunc, _ = self.env.step(action)
                state = np.reshape(state, [1, 4])
                if done or trunc:
                    print(f, end=' ')
                    break

In [25]:
agent = DQLAgent()

In [26]:
%time agent.learn(1500)

episode=1500 | treward= 254 | max= 500
CPU times: user 2min 11s, sys: 23.2 s, total: 2min 34s
Wall time: 2min 8s


In [27]:
agent.epsilon

0.09997053357470892

In [28]:
agent.test(15)

185 211 206 101 198 234 115 287 241 116 98 201 120 174 95 

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>