In [103]:
import sys, os 
import gym 
import numpy as np
import gym_sudoku
import stable_baselines3

## Reinforcement Learning

* [Stable Baselines 3 (Pytorch)](https://github.com/DLR-RM/stable-baselines3)

In [104]:
env = gym.make('Sudoku-v0')
env.reset()

array([[9, 0, 7, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 2, 4, 0, 0, 7],
       [0, 2, 4, 6, 0, 9, 1, 5, 0],
       [5, 3, 0, 4, 0, 7, 0, 9, 0],
       [0, 4, 0, 0, 3, 0, 0, 7, 5],
       [6, 0, 0, 8, 5, 1, 3, 0, 0],
       [0, 0, 0, 0, 4, 5, 0, 3, 9],
       [0, 1, 0, 7, 9, 3, 0, 6, 0],
       [0, 9, 0, 2, 8, 0, 0, 1, 0]])

In [105]:
env.render()

907 | 010 | 000
001 | 024 | 007
024 | 609 | 150
---------------
530 | 407 | 090
040 | 030 | 075
600 | 851 | 300
---------------
000 | 045 | 039
010 | 793 | 060
090 | 280 | 010




### Simple Deep Q
stable_baselines3.dqn

In [106]:
import math, random 
import torch 
import torch.nn as nn 
import torch.optim as optim
import torch.nn.functional as F

In [107]:
import matplotlib.pyplot as plt
from collections import namedtuple, deque 
from itertools import count 

In [108]:
torch.cuda.is_available()

True

In [109]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [110]:
from IPython import display

In [111]:
plt.ion()

<contextlib.ExitStack at 0x7f1f0c1211e0>

In [112]:
# Representing a single transition in our environment
Transition = namedtuple('Transition',
    ("state", "action", "next_state", "reward")
)


In [113]:
class ReplayMemory(object):
    """
    Holds the transitions observed recently
    """ 
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self,*args):
        self.memory.append(Transition(*args))
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [114]:
class DQN(nn.Module):

    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Conv2d(input_shape[0], 32, kernel_size=4, stride=1)
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)


In [115]:
BATCH_SIZE = 128 
GAMMA = 0.99
EPS_START =0.9 # Epsilon start
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4

In [116]:
env.action_space

Tuple(Discrete(9), Discrete(9), Discrete(9))

In [117]:
env.action_space[0]

Discrete(9)

In [118]:
state = env.reset()

In [193]:
env.action_space.to_jsonable

<bound method Tuple.to_jsonable of Tuple(Discrete(9), Discrete(9), Discrete(9))>

In [194]:
#n_observations =  len(state)*len(state[0]) ### CHECK ?  
n_observations = len(state.flatten())
n_actions = 27
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)

In [188]:
type(policy_net)

__main__.DQN

In [190]:
policy_net(state.flatten()) # largest column values of each row. 

tensor([ 0.1512, -0.0050,  0.1017], device='cuda:0', grad_fn=<AddBackward0>)

In [161]:
target_net.load_state_dict(policy_net.state_dict())

<All keys matched successfully>

In [162]:
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)

In [184]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done/EPS_DECAY)
    steps_done += 1 
    if len(state.shape) > 1:
        state = state.flatten()
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max().item()
    else:
        return torch.tensor(
            env.action_space.sample(),
            device=device, dtype=torch.long)

In [185]:
episode_durations = []

def plot_durations(show_result=False):
    plt.figure()
    durations_t = torch.tensor(episode_durations, dype=torch.float)
    if show_result:
        plt.title("Result")
    else:
        plt.clf()
        plt.title("Training...")
    
    plt.xlabel("Episode")
    plt.ylabel("Duration")
    plt.plot(durations_t.numpy())

    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())
    
    plt.pause(0.001)
    display.display(plt.gcf())
    if not show_result:
        display.clear_output(wait=True)
        


### Training Loop

In [186]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return 
    
    transitions = memory.sample(BATCH_SIZE)

    # Converts batch-array of Transitions
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
        batch.next_state)), device=device, dtype=torch.bool)

    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch).gather(1, action_batch)


    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    
    expected_state_action_values = (next_state_values * GAMMA ) + reward_batch

    criterion = nn.SmoothL1Loss()

    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    
    # Optimizer
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(),100)
    optimizer.step()

    

In [187]:
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50 

for i_episode in range(num_episodes):
    state = env.reset()
    state = torch.flatten(
        torch.tensor(state, dtype=torch.float32, device=device)
    )
    print(f"Starting Episode {i_episode}")
    print(f"Starting state \n {state}")
    for t in count():
        action = select_action(state)
        observation, reward, terminated, _ = env.step(action)
        reward = torch.tensor([reward], device=device)
        done = terminated
        if terminated:
            next_state= None 
        else:
            next_state = torch.tensor(
                observation, 
                dtype=torch.float32,device=device).unsqueeze(0)

        memory.push(state, action, next_state, reward)
        state = next_state 
        optimize_model() 

        target_net_state_dict = target_net.state_dict() 
        policy_net_state_dict = policy_net.state_dict() 
        for key in policy_net_state_dict:
            target_net_state_dict[key] = (
                policy_net_state_dict[key]*TAU 
                +target_net_state_dict[key]*(1-TAU)
            )
        if done:
            episode_durations.append(t+1)
            plot_durations()
            break 
    
    plot_durations(show_result=True)
    plt.ioff()
    plt.show()

Starting Episode 0
Starting state 
 tensor([9., 0., 7., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 2., 4., 0., 0., 7.,
        0., 2., 4., 6., 0., 9., 1., 5., 0., 5., 3., 0., 4., 0., 7., 0., 9., 0.,
        0., 4., 0., 0., 3., 0., 0., 7., 5., 6., 0., 0., 8., 5., 1., 3., 0., 0.,
        0., 0., 0., 0., 4., 5., 0., 3., 9., 0., 1., 0., 7., 9., 3., 0., 6., 0.,
        0., 9., 0., 2., 8., 0., 0., 1., 0.], device='cuda:0')


TypeError: 'float' object is not subscriptable

In [None]:
policy_net(state.flatten()).max().item()

0.12645314633846283

In [147]:
action[0][0][0]

tensor(5, device='cuda:0')

In [126]:
state.squeeze((0,1))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [136]:
select_action(state)

tensor([[[8, 2, 1]]], device='cuda:0')

In [137]:
action

tensor([[[8, 8, 2]]], device='cuda:0')

In [138]:
action[0]

tensor([[8, 8, 2]], device='cuda:0')

In [139]:
action[1]

IndexError: index 1 is out of bounds for dimension 0 with size 1