<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 02 &mdash; Deep Q-Learning**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## CartPole

### The Game Environment 

In [None]:
import gymnasium as gym

In [None]:
# create the CartPole-v1 environment through the Gymnasium registry
env = gym.make('CartPole-v1')

In [None]:
# the action space object of the environment
env.action_space

In [None]:
# number of discrete actions in the action space
env.action_space.n

In [None]:
# draw ten sample actions from the action space
[env.action_space.sample() for _ in range(10)]

In [None]:
# the observation space object of the environment
env.observation_space

In [None]:
# shape of a single observation
env.observation_space.shape

In [None]:
# reset the environment with a fixed seed; the observation consists of:
env.reset(seed=100)
# cart position, cart velocity, pole angle, pole angular velocity

In [None]:
# advance the environment by one step with action 0
env.step(0)

In [None]:
# advance the environment by one step with action 1
env.step(1)

In [None]:
class RandomAgent:
    """Baseline agent that plays CartPole-v1 with randomly sampled actions.

    Attributes:
        env: the Gymnasium environment created in ``__init__``.
        trewards: list with one step count per played episode; it is
            (re)created on every call of ``play``.
    """
    def __init__(self):
        self.env = gym.make('CartPole-v1')
    def play(self, episodes=1, max_steps=99):
        """Play ``episodes`` episodes with actions sampled from the action space.

        Args:
            episodes: number of episodes to play.
            max_steps: upper bound for the number of steps per episode.

        The step count of every episode is appended to ``self.trewards``.
        An episode ends when the environment reports termination or
        truncation; an episode that is still running after ``max_steps``
        steps is recorded with ``max_steps`` so that ``trewards`` always
        holds exactly one entry per episode.
        """
        self.trewards = list()
        for e in range(episodes):
            self.env.reset()
            for step in range(1, max_steps + 1):
                a = self.env.action_space.sample()
                state, reward, done, trunc, info = self.env.step(a)
                # truncation ends an episode just like termination does
                if done or trunc:
                    self.trewards.append(step)
                    break
            else:
                # step cap reached without the episode ending: record the
                # capped length instead of silently dropping the episode
                self.trewards.append(max_steps)

In [None]:
# instantiate the random agent (it creates its own environment)
ra = RandomAgent()

In [None]:
# play 15 episodes with randomly sampled actions
ra.play(15)

In [None]:
# step counts recorded by play()
ra.trewards

In [None]:
# average recorded step count, rounded to two decimals
round(sum(ra.trewards) / len(ra.trewards), 2)

In [None]:
import os
import random
import warnings
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

In [None]:
# suppress all warnings
warnings.simplefilter('ignore')
# NOTE(review): set after interpreter start-up, so this presumably only
# affects child processes and not the running kernel -- confirm
os.environ['PYTHONHASHSEED'] = '0'
# ask cuDNN for deterministic behavior and switch off its benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# learning rate passed to the Adam optimizer of the DQLAgent
lr = 0.005

In [None]:
# seed the Python, NumPy and PyTorch random number generators
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)

In [None]:
class DQLAgent:
    """Deep Q-learning agent for CartPole-v1 (set-up part).

    Holds the exploration parameters, the replay memory, the environment,
    the Q-network and its optimizer and loss function.
    """
    def __init__(self):
        # exploration schedule: start value, multiplicative decay, floor
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        # replay memory (bounded) and size of the sampled mini-batches
        self.memory = deque(maxlen=2000)
        self.batch_size = 32
        # discount factor for future rewards
        self.gamma = 0.9
        # bookkeeping of episode results
        self.trewards = []
        self.max_treward = 0
        # environment and the dimensions derived from it
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        # use the GPU when one is available
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        # Q-network, optimizer (uses the module-level ``lr``) and loss
        network = self._create_model()
        self.model = network.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
    def _create_model(self):
        """Return the Q-network: two hidden ReLU layers with 24 units each.

        The input width is ``self.state_size`` and the output width is
        ``self.action_size`` (one value per action).
        """
        hidden = 24
        layers = [
            nn.Linear(self.state_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, self.action_size),
        ]
        return nn.Sequential(*layers)

In [None]:
class DQLAgent(DQLAgent):
    """Extends the agent with action selection and experience replay."""
    def act(self, state):
        """Return an action for ``state`` following an epsilon-greedy policy.

        With probability ``self.epsilon`` an action is sampled from the
        action space; otherwise the action with the highest predicted
        Q-value is returned.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state = torch.as_tensor(state, dtype=torch.float32,
                                device=self.device)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()
    def replay(self):
        """Run one gradient step on a random mini-batch from the memory.

        Does nothing while the memory holds fewer than ``self.batch_size``
        entries. Each memory entry is unpacked as
        ``(state, action, next_state, reward, done)``. After the update,
        ``self.epsilon`` is decayed as long as it is above
        ``self.epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)
        # stack with NumPy first: building a tensor directly from a
        # sequence of ndarrays is very slow in PyTorch
        states = torch.as_tensor(np.array(states), dtype=torch.float32,
                                 device=self.device).squeeze(1)
        next_states = torch.as_tensor(np.array(next_states),
                                      dtype=torch.float32,
                                      device=self.device).squeeze(1)
        actions = torch.as_tensor(np.array(actions), dtype=torch.int64,
                                  device=self.device)
        rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32,
                                  device=self.device)
        dones = torch.as_tensor(np.array(dones), dtype=torch.float32,
                                device=self.device)
        # Q-values of the actions that were actually taken
        q_values = self.model(states)
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        # the targets are constants for the update, so no graph is needed
        with torch.no_grad():
            next_q_values = self.model(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for terminal states
            expected_q_value = (rewards +
                                self.gamma * next_q_values * (1 - dones))
        loss = self.criterion(q_value, expected_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [None]:
class DQLAgent(DQLAgent):
    """Extends the agent with the training loop."""
    def learn(self, episodes):
        """Train the agent by playing ``episodes`` episodes.

        Every transition is stored in ``self.memory`` as
        ``[state, action, next_state, reward, done]``. When an episode ends
        (termination or truncation) its step count is appended to
        ``self.trewards``, ``self.max_treward`` is updated and a progress
        line is printed. After each episode ``replay`` is called once,
        provided the memory holds more than ``self.batch_size`` entries.
        """
        shape = [1, self.state_size]
        for e in range(1, episodes + 1):
            obs, _ = self.env.reset()
            state = np.reshape(obs, shape)
            for f in range(1, 5000):
                action = self.act(state)
                obs, reward, done, trunc, _ = self.env.step(action)
                next_state = np.reshape(obs, shape)
                transition = [state, action, next_state, reward, done]
                self.memory.append(transition)
                state = next_state
                if not (done or trunc):
                    continue
                # episode is over: record the result and report progress
                self.trewards.append(f)
                if f > self.max_treward:
                    self.max_treward = f
                templ = (f'episode={e:4d} | treward={f:4d}'
                         f' | max={self.max_treward:4d}')
                print(templ, end='\r')
                break
            enough_data = len(self.memory) > self.batch_size
            if enough_data:
                self.replay()
        print()

In [None]:
class DQLAgent(DQLAgent):
    """Extends the agent with an evaluation loop."""
    def test(self, episodes):
        """Play ``episodes`` episodes greedily and print each step count.

        No exploration takes place: in every step the action with the
        highest predicted Q-value is chosen. The step count of an episode
        is printed once the environment reports termination or truncation.
        """
        shape = [1, self.state_size]
        for e in range(1, episodes + 1):
            obs, _ = self.env.reset()
            for f in range(1, 5001):
                features = torch.FloatTensor(np.reshape(obs, shape))
                features = features.to(self.device)
                with torch.no_grad():
                    action = self.model(features).argmax().item()
                obs, reward, done, trunc, _ = self.env.step(action)
                if done or trunc:
                    print(f, end=' ')
                    break

In [None]:
# create the agent (environment, Q-network, optimizer)
agent = DQLAgent()

In [None]:
# train for 2,500 episodes; %time is an IPython magic that reports the run time
%time agent.learn(2500)

In [None]:
# exploration rate after training
agent.epsilon

In [None]:
# play 15 test episodes and print the number of steps of each
agent.test(15)

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>