In [1]:
import sys, os 
import gym 
import gymnasium
import numpy as np
import pandas as pd 
import gym_sudoku
import stable_baselines3

In [2]:
from gym_sudoku.envs import SudokuEnv

## Reinforcement Learning

* [Stable Baselines 3 (PyTorch)](https://github.com/DLR-RM/stable-baselines3)

In [3]:
# Create the Sudoku environment through the legacy `gym` API
# (the notebook reports gym 0.21.0 in a later cell).
# The commented-out lines are earlier attempts at making the env work with
# gymnasium / gym's API-compatibility flag; they are kept for reference only.
# NOTE(review): "Sudoku-v0" is presumably registered by `import gym_sudoku` -- confirm.
env = gym.make("Sudoku-v0")
#env = gymnasium.make('Sudoku-v0', env=SudokuEnv())
#env= gym.make('Sudoku-v0',apply_api_compatibility=True)
# env.reset()

In [4]:
# Start a new episode; the returned 9x9 grid is shown as the cell output.
env.reset()

array([[5, 8, 0, 1, 0, 0, 4, 0, 0],
       [0, 7, 0, 0, 0, 0, 8, 5, 1],
       [9, 0, 1, 0, 5, 0, 2, 0, 7],
       [3, 0, 8, 6, 7, 0, 0, 0, 9],
       [0, 9, 7, 5, 0, 4, 0, 0, 2],
       [0, 6, 5, 0, 0, 1, 0, 4, 0],
       [0, 0, 3, 7, 0, 9, 1, 8, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 5],
       [7, 1, 9, 8, 0, 5, 3, 2, 0]])

In [5]:
# Reset again and keep the initial observation for the reshape checks that follow.
grid = env.reset()

In [6]:
# Flatten the grid into a 1-D vector.
flatgrid = grid.flatten()

In [7]:
# Sanity check: reshaping the flat vector gives back the 9x9 layout.
flatgrid.reshape(9,9)

array([[5, 8, 0, 1, 0, 0, 4, 0, 0],
       [0, 7, 0, 0, 0, 0, 8, 5, 1],
       [9, 0, 1, 0, 5, 0, 2, 0, 7],
       [3, 0, 8, 6, 7, 0, 0, 0, 9],
       [0, 9, 7, 5, 0, 4, 0, 0, 2],
       [0, 6, 5, 0, 0, 1, 0, 4, 0],
       [0, 0, 3, 7, 0, 9, 1, 8, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 5],
       [7, 1, 9, 8, 0, 5, 3, 2, 0]])

In [8]:
# Print the environment's own text rendering of the current grid.
env.render()

580 | 100 | 400
070 | 000 | 851
901 | 050 | 207
---------------
308 | 670 | 009
097 | 504 | 002
065 | 001 | 040
---------------
003 | 709 | 180
000 | 000 | 005
719 | 805 | 320




In [9]:
# Record the installed gym version (shown as the cell output).
gym.__version__

'0.21.0'

# 1D Discrete Action Space 
* 729 choices (9 rows × 9 columns × 9 values)

### Deep Q MLP
stable_baselines3.dqn

In [10]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.dqn import MlpPolicy, CnnPolicy

In [11]:
from gym.spaces import Discrete, Tuple

In [94]:
# Load the previously trained DQN checkpoint rather than building a new model;
# the commented-out line is the from-scratch alternative.
# print_system_info=True prints the current and the saved-model system info.
#model = DQN("MlpPolicy",env, verbose=1)
model = DQN.load("dqn_1_000_000_int_array_input", print_system_info=True)

== CURRENT SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0



In [95]:
# import time
# start = time.perf_counter()
# model.learn(total_timesteps=1_000_000)
# length = time.perf_counter() - start 

In [96]:
# print(f"{length/60.0} minutes")

In [97]:
# Enumerate every (x, y, val) triple so that the flat action index equals
# 81 * x + 9 * y + val, then expose the index -> triple lookup table.
action_choices = [
    (cell // 9, cell % 9, val)
    for cell in range(9 * 9)
    for val in range(9)
]
action_to_choice = dict(enumerate(action_choices))

In [19]:
def score_choice(action_int,obs_arr):
    """
    Score action before action applied to observation

    Decodes a flat action index into ``(x, y, val)`` and reports whether
    writing ``val`` at ``obs_arr[x, y]`` would collide with entries already
    present in the same row, column or 3x3 box.

    Parameters
    ----------
    action_int : int
        Flat action index in ``range(729)``, decoded as
        ``81 * x + 9 * y + val`` (the same ordering as the notebook's
        ``action_to_choice`` table). Decoding is done arithmetically so the
        function does not depend on that global being defined first.
    obs_arr : numpy.ndarray
        9x9 grid indexed as ``obs_arr[row, col]``; ``0`` is treated as empty.

    Returns
    -------
    dict
        Boolean flags ``empty_cell``, ``exists_row``, ``exists_col``,
        ``exists_box`` and ``valid_move`` (empty cell and no collision).

    Raises
    ------
    KeyError
        If ``action_int`` is outside ``range(729)``.
    """
    if not 0 <= action_int < 9 ** 3:
        raise KeyError(action_int)
    cell, val = divmod(action_int, 9)
    x, y = divmod(cell, 9)
    # NOTE(review): val is compared as-is (0..8). If the env stores digits
    # 1..9 and maps the action value to val + 1, an offset is needed here --
    # confirm against SudokuEnv.step.
    empty_cell = bool(obs_arr[x, y] == 0)
    # x is the row index, so row x is obs_arr[x, :] and column y is obs_arr[:, y].
    exists_row = bool(val in obs_arr[x, :])
    exists_col = bool(val in obs_arr[:, y])
    # Top-left corner of the 3x3 box containing (x, y).
    br, bc = 3 * (x // 3), 3 * (y // 3)
    exists_box = bool((obs_arr[br:br + 3, bc:bc + 3] == val).any())
    valid_move = (empty_cell and not exists_row and not exists_col and not exists_box)

    return {
        "empty_cell": empty_cell,
        "exists_row": exists_row,
        "exists_col": exists_col,
        "exists_box": exists_box,
        "valid_move": valid_move,
    }

In [82]:
# Roll the loaded DQN policy forward for a fixed number of steps and score
# every proposed move against the grid it was chosen for.
obs = env.reset()
# Keep a snapshot of the starting grid; plain assignment would only alias `obs`.
obs_orig = obs.copy()
outputs = []
for _ in range(2000):
    action, _states = model.predict(obs)
    outputs.append(score_choice(action.item(), obs))
    obs, reward, done, info = env.step(action.item())
    if done:
        # Under the Gym API, step() is undefined once an episode has ended
        # until reset() is called, so start a new episode and keep sampling.
        obs = env.reset()
env.render()

049 | 431 | 210
527 | 901 | 322
067 | 528 | 003
---------------
484 | 077 | 731
179 | 551 | 884
230 | 790 | 560
---------------
150 | [91m1[0m42 | 652
792 | 557 | 913
645 | 047 | 095




In [84]:
# One row per scored move; columns are the flags returned by score_choice.
choices_df = pd.DataFrame(outputs)

In [91]:
# Share of scored moves flagged as valid, expressed as a percentage.
print(
    f"Percent valid moves: {choices_df['valid_move'].sum()/len(choices_df)*100}%"
)

Percent valid moves: 0.2%


In [76]:
# model.save(path="dqn_1_000_000_int_array_input")

### PPO MLP

In [12]:
# Load the saved PPO checkpoint; print_system_info=True prints the current and
# the saved-model system info.
loaded_model = PPO.load("ppo_1_000_000", print_system_info=True)

== CURRENT SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0



In [13]:
# Roll the loaded PPO policy forward for a fixed number of steps and score
# every proposed move against the grid it was chosen for.
# score_choice must already be defined in the kernel when this cell runs
# (the recorded output of this cell is a NameError from running it too early).
obs = env.reset()
# Keep a snapshot of the starting grid; plain assignment would only alias `obs`.
obs_orig = obs.copy()
outputs = []
for _ in range(2000):
    action, _states = loaded_model.predict(obs)
    outputs.append(score_choice(action.item(), obs))
    obs, reward, done, info = env.step(action.item())
    if done:
        # Under the Gym API, step() is undefined once an episode has ended
        # until reset() is called, so start a new episode and keep sampling.
        obs = env.reset()
env.render()

NameError: name 'score_choice' is not defined

In [93]:
# Tabulate the per-step score dicts, then report the share of valid moves.
choices_df = pd.DataFrame(data=outputs)
print(
    f"Percent valid moves: {choices_df['valid_move'].sum()/len(choices_df)*100}%"
)

Percent valid moves: 0.15%


## MultiDiscrete Action Space

In [14]:
# Build a fresh PPO agent with an MLP policy on the current env.
# Note: this rebinds `model`, which earlier in the notebook held the DQN agent.
model = PPO("MlpPolicy",env, verbose=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [16]:
# Train for one million timesteps and measure the wall-clock duration in
# seconds (`length`) with a monotonic high-resolution timer.
import time
start = time.perf_counter()
model.learn(total_timesteps=1_000_000)
length = time.perf_counter() - start 

-----------------------------
| time/              |      |
|    fps             | 393  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 344       |
|    iterations           | 2         |
|    time_elapsed         | 11        |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0162832 |
|    clip_fraction        | 0.161     |
|    clip_range           | 0.2       |
|    entropy_loss         | -6.57     |
|    explained_variance   | 0.224     |
|    learning_rate        | 0.0003    |
|    loss                 | 28        |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.0291   |
|    value_loss           | 115       |
---------------------------------------
-----------------------------------------
| time/                   | 

In [28]:
def score_choice(action_int,obs_arr):
    """
    Score action before action applied to observation

    Variant for the two-component action: ``action_int[0]`` is a flat cell
    index (``9 * x + y``) and ``action_int[1]`` is the value to write.
    Reports whether writing that value at ``obs_arr[x, y]`` would collide
    with entries already present in the same row, column or 3x3 box.

    Parameters
    ----------
    action_int : sequence of two ints
        ``(cell, val)`` with ``cell`` in ``range(81)``. The name is kept for
        interface compatibility even though the action is a pair here.
        NOTE(review): presumably sampled from a MultiDiscrete([81, 9])
        space -- confirm against the env definition.
    obs_arr : numpy.ndarray
        9x9 grid indexed as ``obs_arr[row, col]``; ``0`` is treated as empty.

    Returns
    -------
    dict
        Boolean flags ``empty_cell``, ``exists_row``, ``exists_col``,
        ``exists_box`` and ``valid_move`` (empty cell and no collision).

    Raises
    ------
    KeyError
        If the cell index is outside ``range(81)``.
    """
    # Read the action from the parameter, not from a notebook-level `action`
    # global, so the result depends only on the arguments.
    cell, val = action_int[0], action_int[1]
    if not 0 <= cell < 9 * 9:
        raise KeyError(cell)
    # Row-major decoding; replaces rebuilding an 81-entry lookup dict per call.
    x, y = divmod(cell, 9)
    # NOTE(review): val is compared as-is. If the env stores digits 1..9 and
    # maps the action value to val + 1, an offset is needed here -- confirm
    # against SudokuEnv.step.
    empty_cell = bool(obs_arr[x, y] == 0)
    # x is the row index, so row x is obs_arr[x, :] and column y is obs_arr[:, y].
    exists_row = bool(val in obs_arr[x, :])
    exists_col = bool(val in obs_arr[:, y])
    # Top-left corner of the 3x3 box containing (x, y).
    br, bc = 3 * (x // 3), 3 * (y // 3)
    exists_box = bool((obs_arr[br:br + 3, bc:bc + 3] == val).any())
    valid_move = (empty_cell and not exists_row and not exists_col and not exists_box)

    return {
        "empty_cell": empty_cell,
        "exists_row": exists_row,
        "exists_col": exists_col,
        "exists_box": exists_box,
        "valid_move": valid_move,
    }

In [30]:
# Roll the freshly trained PPO policy forward for a fixed number of steps and
# score every proposed (cell, value) action against the grid it was chosen for.
obs = env.reset()
# Keep a snapshot of the starting grid; plain assignment would only alias `obs`.
obs_orig = obs.copy()
outputs = []
for _ in range(2000):
    action, _states = model.predict(obs)
    outputs.append(score_choice(action, obs))
    obs, reward, done, info = env.step(action)
    if done:
        # Under the Gym API, step() is undefined once an episode has ended
        # until reset() is called, so start a new episode and keep sampling.
        obs = env.reset()
env.render()

58[91m5[0m | 100 | 400
070 | 000 | 851
901 | 650 | 207
---------------
308 | 670 | 009
097 | 504 | 602
065 | 001 | 040
---------------
003 | 709 | 180
000 | 000 | 005
719 | 805 | 320




In [31]:
# Tabulate the per-step score dicts, then report the share of valid moves.
choices_df = pd.DataFrame(data=outputs)
print(
    f"Percent valid moves: {choices_df['valid_move'].sum()/len(choices_df)*100}%"
)

Percent valid moves: 0.0%


In [32]:
# Peek at the first few scored moves.
choices_df.head()

Unnamed: 0,empty_cell,exists_row,exists_col,exists_box,valid_move
0,True,True,True,True,False
1,False,True,True,True,False
2,False,True,True,True,False
3,False,True,True,True,False
4,False,True,True,True,False


In [18]:
# Persist the trained PPO agent under the given checkpoint name.
model.save(path="ppo_1_000_000_multidiscrete")