# Q-Learning with PyTorch

In [1]:
# Required modules
!pip install gym torch

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/d4/22/4ff09745ade385ffe707fb5f053548f0f6a6e7d5e98a2b9d6c07f5b931a7/gym-0.10.9.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 14.3MB/s 
[?25hCollecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/49/0e/e382bcf1a6ae8225f50b99cc26effa2d4cc6d66975ccf3fa9590efcbedce/torch-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (519.5MB)
[K    100% |████████████████████████████████| 519.5MB 25kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x5a334000 @  0x7f7d55e522a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0

In [0]:
# Required imports
import gym
import torch
from torch import nn, optim
import numpy as np

In [3]:
# Create the env
env_name = 'FrozenLake-v0'
env = gym.make(env_name)

  result = entry_point.load(False)


In [4]:
# Get the action spaces and observation space sizes
action_size = env.action_space.n
observation_size = env.observation_space.n

print ('Action Space Sizes {}'.format(action_size),
       'Observation Space Sizes {}'.format(observation_size))

Action Space Sizes 4 Observation Space Sizes 16


In [0]:
# Model Hyperparamters
learning_rate = 0.1
max_steps = 99
total_episodes = 10000
gamma = 0.99

# Exploration parameters
epsilon = 1.0
min_epsilon = 0.01
max_epsilon = 1.0
decay_rate = 0.01

show_episode = False
interval = 500

In [0]:
# Create the network
class Q_Network(nn.Module):
    
    def __init__(self, action_size, observation_size):
        super().__init__()
        self.action_size = action_size
        self.observation_size = observation_size
        self.fc1 = nn.Linear(self.observation_size, self.action_size)
        
    def forward(self, x):
        x = torch.eye(self.observation_size)[x:x+1]
        
        pouts = self.fc1(x)
        action = int(torch.argmax(pouts))
        
        return pouts, action

In [0]:
# Get the model and define the loss and optimizer
qnet = Q_Network(action_size, observation_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(qnet.parameters(), lr=learning_rate)

In [18]:
# Training the network

reward_list = []

for episode in range(total_episodes):
    
    state = env.reset()
    done = False
    freward = 0
    
    for step in range(max_steps):
        
        qnet.zero_grad()
        
        pouts, action = qnet(state)
        
        with torch.no_grad():
            
            exp_tradeoff = np.random.rand(1)[0]
            if exp_tradeoff < epsilon:
                action = env.action_space.sample()
                
            new_state, rewards, done, info = env.step(action)
            
            freward += rewards
            
            # Getting qvals for the next state
            pouts, _ = qnet(new_state)
            
            # Getting the targetQ for the current state
            max_next_qvals = np.max(pouts)
            targetq = pouts
            targetq[0, action] = rewards + (gamma * max_next_qvals)
            
        loss = criterion(pouts, targetq)
        loss.backward()
        optimizer.step()
        
        if done:
            break
            
        state = new_state
    
    reward_list.append(freward)
    
    if show_episode:
        env.render()
    
    if episode % interval == 0:
        print ('Episode {}/{}'.format(episode, total_episodes),
               'Steps: {}'.format(step),
               'Reward: {}'.format(freward))
        
print ('Agent Trained Successfully!!')
print ('Successful Episodes: {}/{}'.format(sum(reward_list), total_episodes))

TypeError: ignored

In [0]:
# Testing Hyperparamters
total_test_episodes = 100
max_steps = 99

show_test_episode = False

In [14]:
# Testing the network

test_reward_list = []

with torch.no_grad():
    
    for episode in range(total_test_episodes):
        
        state = env.reset()
        done = False
        freward = 0
        
        for step in range(max_steps):
            
            pouts, action = qnet(state)
            
            new_state, rewards, done, info = env.step(action)
            
            freward += rewards
            
            if done:
                break
                
            state = new_state
        
        print ('Episode {}/{}'.format(episode, total_episodes),
               'Steps: {}'.format(step),
               'Reward: {}'.format(freward))
        
        if show_test_episode:
            env.render()
            print ('******************')
            
print ('Successful Episodes: {}/{}'.format(sum(test_reward_list), total_test_episodes))

KeyError: ignored