# Results
## Evolutionary Strategies and Reinforcement Learning applied to Minesweeper

**Authors**

_Jacob J. Hansen (s134097), Jakob D. Havtorn (s132315),_

_Mathias G. Johnsen (s123249) and Andreas T. Kristensen (s144026)_

### Initialization

In [1]:
# Initialization
import multiprocessing as mp
import os
import pstats
import time
import gym
import numpy as np
from keras.layers import Dense, Conv2D, Flatten
from keras.models import Input, Model, Sequential, clone_model, load_model
from keras.optimizers import Adam

from minesweeper_tk import Minesweeper

save_dir = ''
mp.freeze_support()

Using Theano backend.


### Evolutionary Strategies

In [2]:
# Define fitness evaluation function
def fitnessfun(env, model):
    total_reward = 0
    done = False
    observation = env.reset()
    steps = 0
    while not done and steps < rows*cols-mines:
        # Predict action
        action = model.predict(observation.reshape((1, rows, cols, n_chs)))
        # Mask invalid moves (no need to renormalize when argmaxing)
        mask = env.get_validMoves().flatten()
        action[0, ~mask] = 0
        # Step and get reward
        observation, reward, done, info = env.step(np.argmax(action))
        total_reward += reward
        steps += 1
    win = True if done and reward is 0.9 else False
    return total_reward, {'steps': steps, 'win': win}

# Define test function
def testfun(env, model, episodes):
    total_reward = 0
    wins = 0
    for i in range(episodes):
        observation = env.reset()
        done = False
        t = 0
        while not done and t < rows*cols-mines:
            action = model.predict(observation.reshape((1, rows, cols, n_chs)))
            observation, reward, done, info = env.step(np.argmax(action))
            total_reward += reward
            t += 1
        if i % 100 == 0:
            print('Episode: {: >3d}'.format(i))
        wins += 1 if done and reward is 0.9 else 0
    return total_reward/episodes, t, wins

In [3]:
# Define environment 
rows = 4
cols = 4
mines = 4
OUT = 'FULL'
rewards_structure = {"win": 0.9, "loss": -1, "progress": 0.9, "noprogress": -0.3, "YOLO": -0.3}
env = Minesweeper(display=False, OUT=OUT, ROWS=rows, COLS=cols, MINES=mines, rewards=rewards_structure)

In [4]:
# Define model
obs = env.reset()
n_chs = obs.shape[-1]
n_hidden = [200, 200, 200, 200]
n_outputs = rows*cols
model = Sequential()
# Convs
model.add(Conv2D(15, (5, 5), input_shape=(rows, cols, n_chs), padding='same', activation='relu'))
model.add(Conv2D(35, (3, 3), padding='same', activation='relu'))
# Dense
model.add(Flatten())
model.add(Dense(units=n_hidden[0],
                activation='relu',
                kernel_initializer='glorot_uniform',
                bias_initializer='zeros',
                kernel_regularizer=None,#l2(reg),
                bias_regularizer=None))#l2(reg)))
# Hidden
for n_units in n_hidden[1:]:
    model.add(Dense(units=n_units,
                    activation='relu',
                    kernel_initializer='glorot_uniform',
                    bias_initializer='zeros',
                    kernel_regularizer=None,#l2(reg),
                    bias_regularizer=None))#l2(reg)))
# Output
model.add(Dense(units=n_outputs,
                activation='softmax',
                kernel_initializer='glorot_uniform',
                bias_initializer='zeros',
                kernel_regularizer=None,
                bias_regularizer=None))

model.compile(optimizer='rmsprop', loss='mean_squared_error')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 4, 4, 15)          3765      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 35)          4760      
_________________________________________________________________
flatten_1 (Flatten)          (None, 560)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               112200    
_________________________________________________________________
dense_2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_3 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_4 (Dense)              (None, 200)               40200     
__________

In [5]:
# (Train)
do_train = False
if do_train:
    from context import core
    from core.strategies import ES
    regu = 0.01
    nags = 20
    lrte = 0.01
    sigm = 0.01
    cint = 100
    nwrk = mp.cpu_count()
    e = ES(fun=fitnessfun, model=model, env=env, reg={'L2': regu}, population=nags, learning_rate=lrte, sigma=sigm, workers=nwrk, save_dir=save_dir)
    e.evolve(ngns, checkpoint_every=cint, plot_every=cint)

In [7]:
# Load pretrained model
test_episodes = 1000
model = load_model('es-model.h5')
env = Minesweeper(display=False, OUT=OUT, ROWS=rows, COLS=cols, MINES=mines, rewards=rewards_structure)

# Run game env and save rewards and winrate for 100 games
average_reward, _, wins = testfun(env, model, test_episodes)
print('Win rate: {}'.format(wins/test_episodes))
print('Average test reward: {}'.format(average_reward))

Episode:   0
Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Win rate: 0.0
Average test reward: -0.38010000000001887


### Q-Learning

In [1]:
# Load the function
import os
import sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
sys.path.append('q_learning')
from train import *
tf.reset_default_graph()

In [2]:
# Get the 90.20 win-rate
setup_model("test")

Test minesweeper model on 6x6 board with 6 mines
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_best/checkpoints_minesweeper/model-best
  Win Rate: 90.20


In [2]:
# Get the win-rates for different number of mines
tf.reset_default_graph()
setup_model("test_random_mines")

Test minesweeper model on 6x6 board with a random number of mines

Testing with best model on random number of mines
Mines = 1
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_best/checkpoints_minesweeper/model-best-random
  Win Rate: 75.48
Mines = 2
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_best/checkpoints_minesweeper/model-best-random
  Win Rate: 100.00
Mines = 3
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_best/checkpoints_minesweeper/model-best-random
  Win Rate: 100.00
Mines = 4
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_best/checkpoints_minesweeper/model-best-random
  Win Rate: 99.03
Mines = 5
INFO:tensorflow:Restoring parameters from /home/andreas/github/minesweeper_solver/results-ipynb/q_learning/output_b

In [2]:
tf.reset_default_graph()
setup_model("train")

Training the network which obtained 90.20% win-rate on 6x6 board with 6 mines
Initializing replay memory 0/50000
Initializing replay memory 1000/50000
Initializing replay memory 2000/50000
Initializing replay memory 3000/50000
Initializing replay memory 4000/50000
Initializing replay memory 5000/50000
Initializing replay memory 6000/50000
Initializing replay memory 7000/50000
Initializing replay memory 8000/50000
Initializing replay memory 9000/50000
Initializing replay memory 10000/50000
Initializing replay memory 11000/50000
Initializing replay memory 12000/50000
Initializing replay memory 13000/50000
Initializing replay memory 14000/50000
Initializing replay memory 15000/50000
Initializing replay memory 16000/50000
Initializing replay memory 17000/50000
Initializing replay memory 18000/50000
Initializing replay memory 19000/50000
Initializing replay memory 20000/50000
Initializing replay memory 21000/50000
Initializing replay memory 22000/50000
Initializing replay memory 23000/50000

KeyboardInterrupt: 

### Policy Gradient

In [1]:
#import os
#os._exit(00) # Restart jyputer Python kernel
# Load model weights
# Run game env and save rewards and winrate for 100 games
import numpy as np
import pandas as pd
import IPython
import matplotlib.pyplot as plt
import pickle 
epochs = []
steps = []
train_r = []
loss = []
winrate = []

stats = pickle.load(open("minesweeper/stats.p", "rb"))
for stat in stats:
    steps.append(stat[1])
    train_r.append(stat[2]/1.5)
    loss.append(stat[3])
    winrate.append(stat[4])

   
plt.rcParams.update({'font.size': 33}) 
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)

ax1.plot(steps, train_r, steps, loss)
ax1.grid()
ax1.legend(["train reward", "loss"])

ax2.plot(steps, winrate, linewidth = 4.0)
ax2.legend(["win rate"])
plt.xlabel('Steps / Actions')
ax2.grid()

plt.show()

IndexError: list index out of range

### Comparison

In [None]:
# Plot rewards and winrate in a single plot for comparison