<a href="https://colab.research.google.com/github/jchen8000/GenerativeAI/blob/main/5_Fine-Tuning/Q_Learning_Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5.7 Reinforcement Learning

An example of Q-Learning algorithm

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

In [2]:
state_size = 6  # One-hot encoded state size
action_size = 2  # Two possible actions: "left" and "right"
epochs = 1000  # Number of epochs to train
gamma = 0.6  # Discount rate for future rewards
epsilon = 0.1  # Exploration rate
alpha = 0.1  # Learning rate

In [3]:
# Define the Q-network model
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc = nn.Linear(state_size, action_size)

    def forward(self, state):
        return self.fc(state)

In [4]:
net = QNetwork(state_size, action_size)
optimizer = optim.Adam(net.parameters(), lr=alpha)
loss_fn = nn.MSELoss()


In [5]:
for epoch in range(epochs):
    state = torch.zeros(state_size)  # Initialize state as a one-hot encoded tensor
    state[0] = 1  # Set the initial state to state 0
    done = False

    while not done:
        if random.random() < epsilon:
            action = random.randrange(action_size)
        else:
            q_values = net(state)
            action = torch.argmax(q_values).item()

        next_state = torch.zeros(state_size)
        if action == 1 and torch.argmax(state).item() < state_size - 1:
            next_state[torch.argmax(state).item() + 1] = 1
        else:
            next_state[torch.argmax(state).item()] = 1

        reward = 0
        next_state_max = torch.argmax(next_state).item()
        if next_state_max == state_size - 1:
            reward = 1
            done = True

        predicted_q_values = net(state)
        target_q_value = reward + gamma * torch.max(net(next_state)).detach()
        target_q_values = predicted_q_values.clone()
        target_q_values[action] = target_q_value

        optimizer.zero_grad()
        loss = loss_fn(predicted_q_values, target_q_values)
        loss.backward()
        optimizer.step()

        state = next_state
print("Training complete")

Training complete


In [6]:
torch.set_printoptions(precision=3, sci_mode=False)
# Display trained Q-values (weights)
print("\nLearned Q-values:")
for idx, param in enumerate(net.parameters()):
    if param.requires_grad:
        print("Layer {}:\n {}".format(idx, param.data))

# Derive the policy from Q-values
policy = {}
for state_idx in range(state_size):
    state = torch.zeros(state_size)
    state[state_idx] = 1
    q_values = net(state)
    action = torch.argmax(q_values).item()
    policy[state_idx] = 'right' if action == 1 else 'left'

print("\nDerived policy:")
for state in range(state_size):
    print(f"State {state}: Move {policy[state]}")


Learned Q-values:
Layer 0:
 tensor([[-1.482, -0.744, -1.178, -0.290, -0.014,  0.112],
        [-0.453, -0.300,  0.005,  0.358,  0.890, -0.096]])
Layer 1:
 tensor([0.700, 0.575])

Derived policy:
State 0: Move right
State 1: Move right
State 2: Move right
State 3: Move right
State 4: Move right
State 5: Move left
