In [83]:
import sys, os 
import gym 
import gymnasium
import numpy as np
import pandas as pd 
import gym_sudoku
import stable_baselines3

In [61]:
from gym_sudoku.envs import SudokuEnv

## Reinforcement Learning

* [Stable-Baselines3 (PyTorch)](https://github.com/DLR-RM/stable-baselines3)

In [62]:
# Build the Sudoku environment through the classic `gym` registry.
# NOTE(review): "Sudoku-v0" is presumably registered as a side effect of
# `import gym_sudoku` — confirm.
env = gym.make("Sudoku-v0")
# Alternatives tried while working on gym/gymnasium backwards compatibility
# (kept disabled for reference):
#env = gymnasium.make('Sudoku-v0', env=SudokuEnv())
#env= gym.make('Sudoku-v0',apply_api_compatibility=True)
# The first env.reset() is issued in the next cell.

In [63]:
# Start an episode; the observation returned by reset() is shown as the cell output.
env.reset()

array([[0, 4, 9, 0, 3, 0, 2, 1, 0],
       [5, 2, 7, 9, 0, 1, 3, 0, 0],
       [0, 6, 0, 0, 2, 8, 0, 0, 0],
       [4, 8, 0, 0, 7, 0, 0, 3, 1],
       [0, 7, 0, 5, 0, 0, 8, 0, 0],
       [2, 0, 0, 0, 9, 0, 5, 6, 0],
       [1, 5, 0, 0, 0, 0, 6, 0, 2],
       [7, 9, 2, 0, 5, 0, 0, 0, 3],
       [6, 0, 0, 0, 4, 7, 0, 9, 5]])

In [64]:
# Reset again, this time keeping the returned observation for inspection.
grid = env.reset()

In [65]:
# Collapse the grid to one dimension (ndarray.flatten returns a copy).
flatgrid = grid.flatten()

In [66]:
# Round-trip check: reshaping the flat vector back to 9x9 displays the board again.
flatgrid.reshape(9,9)

array([[0, 4, 9, 0, 3, 0, 2, 1, 0],
       [5, 2, 7, 9, 0, 1, 3, 0, 0],
       [0, 6, 0, 0, 2, 8, 0, 0, 0],
       [4, 8, 0, 0, 7, 0, 0, 3, 1],
       [0, 7, 0, 5, 0, 0, 8, 0, 0],
       [2, 0, 0, 0, 9, 0, 5, 6, 0],
       [1, 5, 0, 0, 0, 0, 6, 0, 2],
       [7, 9, 2, 0, 5, 0, 0, 0, 3],
       [6, 0, 0, 0, 4, 7, 0, 9, 5]])

In [67]:
# Print the current board as text (3x3 boxes separated by `|` and dashed lines).
env.render()

049 | 030 | 210
527 | 901 | 300
060 | 028 | 000
---------------
480 | 070 | 031
070 | 500 | 800
200 | 090 | 560
---------------
150 | 000 | 602
792 | 050 | 003
600 | 047 | 095




In [68]:
# Record the installed gym version: the cells below treat env.reset() as returning
# only the observation and unpack env.step() into four values.
gym.__version__

'0.21.0'

## 1D Discrete Action Space
* 729 choices: one flat action index per (row, column, value) combination (9 × 9 × 9)

### Deep Q MLP
DQN agent from `stable_baselines3.dqn`.

In [69]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.dqn import MlpPolicy, CnnPolicy

In [70]:
from gym.spaces import Discrete, Tuple

In [94]:
# Load the previously saved DQN agent instead of constructing a fresh one (the
# constructor call is kept disabled below for reference).
# print_system_info=True prints the current and the saved-model system details.
#model = DQN("MlpPolicy",env, verbose=1)
model = DQN.load("dqn_1_000_000_int_array_input", print_system_info=True)

== CURRENT SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0



In [95]:
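# Training is disabled here because a previously saved model is loaded above;
# uncomment (together with the DQN constructor) to retrain and time the run.
# import time
# start = time.perf_counter()
# model.learn(total_timesteps=1_000_000)
# length = time.perf_counter() - start 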

In [96]:
# print(f"{length/60.0} minutes")

In [97]:
# Enumerate every (x, y, val) triple with each component in 0..8, ordered so
# that the last component varies fastest.
action_choices = [
    (row_idx, col_idx, val_idx)
    for row_idx in range(9)
    for col_idx in range(9)
    for val_idx in range(9)
]
# Flat action index (0..728) -> its (x, y, val) triple.
action_to_choice = dict(enumerate(action_choices))

In [98]:
def score_choice(action_int, obs_arr, choice_map=None):
    """
    Score an action against the board *before* the action is applied.

    Parameters
    ----------
    action_int : int
        Flat action index; decoded to ``(x, y, val)`` through ``choice_map``.
    obs_arr : numpy.ndarray
        2-D board indexed as ``obs_arr[x, y]`` (x = row, y = column), where
        ``0`` marks an empty cell. The 3x3 box arithmetic assumes a 9x9 board.
    choice_map : mapping, optional
        Lookup from action index to ``(x, y, val)``. When omitted, the
        notebook-level ``action_to_choice`` dictionary is used.

    Returns
    -------
    dict
        ``empty_cell``: the target cell currently holds 0.
        ``exists_row``: ``val`` already appears in row ``x``.
        ``exists_col``: ``val`` already appears in column ``y``.
        ``exists_box``: ``val`` already appears in the 3x3 box containing the cell.
        ``valid_move``: the cell is empty and ``val`` clashes with nothing.

    Raises
    ------
    KeyError
        If ``action_int`` is not a key of the lookup (when it is a dict).
    """
    if choice_map is None:
        choice_map = action_to_choice
    x, y, val = choice_map[action_int]

    # NOTE(review): `val` is compared with the board as-is. With a lookup built
    # from range(9) it lies in 0..8, so val == 0 matches empty cells and 9 is
    # never scored. If the environment writes val + 1 into the grid, this
    # comparison is off by one — confirm against the env's step().
    empty_cell = bool(obs_arr[x, y] == 0)

    # Row x is obs_arr[x, :] and column y is obs_arr[:, y]; the previous version
    # reported these two checks under each other's key.
    exists_row = bool((obs_arr[x, :] == val).any())
    exists_col = bool((obs_arr[:, y] == val).any())

    # Top-left corner of the 3x3 box that contains (x, y).
    br, bc = 3 * (x // 3), 3 * (y // 3)
    exists_box = bool((obs_arr[br:br + 3, bc:bc + 3] == val).any())

    valid_move = empty_cell and not (exists_row or exists_col or exists_box)

    return {
        "empty_cell": empty_cell,
        "exists_row": exists_row,
        "exists_col": exists_col,
        "exists_box": exists_box,
        "valid_move": valid_move,
    }

In [82]:
# Roll the loaded DQN agent forward for 2000 steps, scoring every chosen action
# against the board as it was *before* the step is applied.
obs = env.reset()
# NOTE(review): plain alias, not a copy — if the env reuses/mutates the returned
# array, this will not preserve the starting board; confirm or use obs.copy().
obs_orig = obs 
outputs = []
for _ in range(2000):
    # predict() is called with its default arguments.
    # NOTE(review): presumably that means stochastic (non-deterministic) action
    # selection — confirm against the Stable-Baselines3 docs.
    action, _states = model.predict(obs)
    # .item() turns the predicted action into a plain Python scalar for the
    # dictionary lookup in score_choice and for env.step.
    outputs.append(score_choice(action.item(), obs))
    # NOTE(review): `done` is never checked, so stepping continues even if the
    # environment reports that the episode has finished.
    obs, reward, done, info = env.step(action.item())
# Show the board after the rollout.
env.render()

049 | 431 | 210
527 | 901 | 322
067 | 528 | 003
---------------
484 | 077 | 731
179 | 551 | 884
230 | 790 | 560
---------------
150 | [91m1[0m42 | 652
792 | 557 | 913
645 | 047 | 095




In [84]:
# One row per scored action; the columns are the keys returned by score_choice.
choices_df = pd.DataFrame(outputs)

In [91]:
# Share of the recorded actions whose `valid_move` flag is True, as a percentage.
print(f"Percent valid moves: {choices_df['valid_move'].sum()/len(choices_df)*100}%")

Percent valid moves: 0.2%


In [76]:
# model.save(path="dqn_1_000_000_int_array_input")

### PPO MLP

In [87]:
# Load the previously saved PPO agent; print_system_info=True prints the current
# and the saved-model system details.
loaded_model = PPO.load("ppo_1_000_000", print_system_info=True)

== CURRENT SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.35 # 1 SMP Fri Jan 27 02:56:13 UTC 2023
- Python: 3.10.9
- Stable-Baselines3: 1.8.0
- PyTorch: 2.0.0+cu117
- GPU Enabled: True
- Numpy: 1.23.5
- Gym: 0.21.0



In [92]:
# Same 2000-step evaluation as for DQN, this time driven by the loaded PPO agent.
# Each chosen action is scored against the board before the step is applied.
obs = env.reset()
# NOTE(review): plain alias, not a copy — if the env reuses/mutates the returned
# array, this will not preserve the starting board; confirm or use obs.copy().
obs_orig = obs 
outputs = []
for _ in range(2000):
    # predict() is called with its default arguments.
    # NOTE(review): presumably that means stochastic (non-deterministic) action
    # selection — confirm against the Stable-Baselines3 docs.
    action, _states = loaded_model.predict(obs)
    # .item() turns the predicted action into a plain Python scalar for the
    # dictionary lookup in score_choice and for env.step.
    outputs.append(score_choice(action.item(), obs))
    # NOTE(review): `done` is never checked, so stepping continues even if the
    # environment reports that the episode has finished.
    obs, reward, done, info = env.step(action.item())
# Show the board after the rollout.
env.render()

049 | 435 | 216
527 | 941 | 37[91m0[0m
069 | 128 | 870
---------------
486 | 971 | 831
275 | 550 | 844
223 | 499 | 560
---------------
153 | 276 | 622
792 | 752 | 663
621 | 247 | 695




In [93]:
# One row per scored PPO action; the columns are the keys returned by score_choice.
choices_df = pd.DataFrame(outputs)
# Share of the recorded actions whose `valid_move` flag is True, as a percentage.
print(f"Percent valid moves: {choices_df['valid_move'].sum()/len(choices_df)*100}%")

Percent valid moves: 0.15%


## Box Action space