In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import enum
import copy
import connect4.connect4 as game
from pympler import asizeof
import deeplearning.buffer as buf
import torch 
import torch.nn as nn
import torch.optim as optim
import deeplearning.mlp as mlp
import torch.nn.functional as F

In [2]:
# Game environment, two scripted opponents, and the replay buffer that the
# data-collection games in the next cell are recorded into.
env = game.Connect4()
randomPlayer1 = game.RandomPlayer()
greedyPlayer2 = game.GreedyRandomPlayer()
# NOTE(review): 20000 is presumably the buffer capacity — confirm the unit (games vs. steps) in deeplearning.buffer.
buffer = buf.ReplayBuffer(20000)

In [3]:
# Fill the shared replay buffer from three scripted match-ups. Each entry is
# (players, number of games); stats are printed after every match-up.
matchups = (
    ([randomPlayer1, greedyPlayer2], 10000),
    ([randomPlayer1, randomPlayer1], 5000),
    ([greedyPlayer2, greedyPlayer2], 5000),
)
for players, n_games in matchups:
    gm = game.GameManager(players)
    gm.play(n_games, game.Connect4, buffer)
    gm.info()

ELO Before:  940 1060
ELO After:  944.0 1056.0
p1:  0.3371 p2:  0.6612 draw:  0.0017
ELO Before:  944.0 944.0
ELO After:  944.0 944.0
p1:  0.5044 p2:  0.4912 draw:  0.0044
ELO Before:  1056.0 1056.0
ELO After:  1056.0 1056.0
p1:  0.4818 p2:  0.5078 draw:  0.0104


In [33]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Behavior-function network and its optimizer.
# NOTE(review): 42 and 7 presumably are the state width (cf. the X[:, 0:42] slice in
# train_behavior_function) and the number of actions; the meaning of 5000 and 1 is
# defined in deeplearning.mlp — confirm there.
bf = mlp.BF(42, 7, 5000, 1, device).to(device)
optimizer = optim.Adam(params=bf.parameters(), lr=1e-5)

In [34]:
def train_behavior_function(batch_size, model, optimizer, replay_buffer=None, train_device=None):
    """
    Run one optimisation step of the behavior function (BF) on a sampled batch.

    The model is called as ``model(state, command)`` and its output is trained with a
    cross-entropy loss against the actions stored alongside the states in the replay buffer.
    NOTE(review): ``F.cross_entropy`` expects unnormalised scores — confirm that the
    model's forward pass does not already apply a softmax.

    Args:
        batch_size: number of training examples requested from the replay buffer.
        model: callable taking ``(state, command)`` tensors and returning per-action scores.
        optimizer: torch optimizer that updates ``model``'s parameters.
        replay_buffer: object exposing ``create_training_examples(batch_size)`` that
            returns ``(X, y)``; defaults to the notebook-level ``buffer``.
        train_device: torch device to run on; defaults to the notebook-level ``device``.

    Returns:
        The batch loss as a 0-d numpy array.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if replay_buffer is None:
        replay_buffer = buffer
    if train_device is None:
        train_device = device

    X, y = replay_buffer.create_training_examples(batch_size)
    X = torch.stack(X)

    # Columns 0..41 hold the state; columns 42 and 43 together form the command.
    state = X[:, :42]
    command = X[:, 42:44].contiguous()
    # Class indices must be int64 for cross_entropy; build them directly instead of
    # going through a float tensor first.
    targets = torch.as_tensor(y, dtype=torch.long, device=train_device)

    optimizer.zero_grad()
    scores = model(state.to(train_device), command.to(train_device)).float()
    pred_loss = F.cross_entropy(scores, targets)
    pred_loss.backward()
    optimizer.step()
    return pred_loss.detach().cpu().numpy()

def run_loop(batch_size=1000, patience=100, min_delta=0.001, log_every=100,
             max_iters=None, train_step=None):
    """
    Repeat training steps until the loss stops improving (early stopping).

    Args:
        batch_size: batch size handed to every training step.
        patience: stop once more than this many iterations have passed since the
            best loss was last improved.
        min_delta: a loss only counts as an improvement when it is lower than the
            best loss so far by more than this margin.
        log_every: print a progress line every ``log_every`` iterations
            (iteration, loss summed since the last line, best iteration, best loss).
        max_iters: optional hard cap on the number of iterations; ``None`` keeps
            looping until early stopping triggers.
        train_step: callable ``train_step(batch_size)`` returning the loss of one
            step; defaults to ``train_behavior_function`` on the notebook-level
            ``bf`` and ``optimizer``.

    Returns:
        None. Progress and the "early stopping" notice are printed.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be at least 1")
    if train_step is None:
        # Looked up at call time so the current notebook-level model/optimizer are used.
        train_step = lambda n: train_behavior_function(n, bf, optimizer)

    # Start from +inf rather than a fixed 100: with a finite sentinel, losses that
    # begin above it would never register as improvements and stop training early.
    best_loss = float("inf")
    best_loss_i = 0
    cum_loss = 0.0
    i = 0
    while max_iters is None or i < max_iters:
        i += 1
        loss = train_step(batch_size)
        if loss < best_loss - min_delta:
            best_loss = loss
            best_loss_i = i
        if i - best_loss_i > patience:
            print("early stopping")
            return
        cum_loss += loss
        if i % log_every == 0:
            print(i, cum_loss, best_loss_i, best_loss)
            cum_loss = 0.0
# Train the BF until run_loop reports early stopping; progress lines are printed as it goes.
run_loop()

100 194.4607731103897 88 1.9422095
200 194.05811738967896 174 1.9377781
300 193.70407283306122 244 1.9341537
400 193.39173138141632 398 1.9286159
500 193.09932851791382 487 1.9249791
600 192.86251831054688 586 1.9223009
700 192.61722922325134 662 1.919573
800 192.4673353433609 800 1.9151205
900 192.2656056880951 878 1.9130182
1000 192.0796616077423 916 1.9115233
early stopping


In [43]:
# Query the trained BF on the state returned by env.reset() and print the first row of the result.
# NOTE(review): the arguments -1 and 900 look like the two command values (cf. the d/e
# columns in train_behavior_function) — confirm against BF.steps.
s = env.reset()
x = bf.steps([s], -1, 900)
print(x[0])

[0.33510917 0.08345829 0.0130851  0.5549511  0.00471462 0.00406335
 0.00461842]


In [45]:
# Play the BF against the greedy player with gradient tracking disabled.
# Games are recorded into a separate buffer (buf2), not the training buffer.
with torch.no_grad():
    buf2 = buf.ReplayBuffer(20000)
    gm = game.GameManager([bf, greedyPlayer2])
    # NOTE(review): the extra 1060 argument is not passed in the data-collection games — confirm its meaning in GameManager.play.
    gm.play(1000, game.Connect4, buf2, 1060)
    gm.info() # p1:  0.6 p2:  0.4 draw:  0.0

ELO Before:  1028.0 970.0
ELO After:  1032.0 966.0
p1:  0.62 p2:  0.375 draw:  0.005


In [184]:
# Inspect a single sampled training example; the call returns a (list of feature tensors, list of labels) pair.
buffer.create_training_examples(1)

([tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
          -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
          -0.0000,  1.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
          -1.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,  1.0000,
          -0.0000, -0.0000, -0.0000, -1.0000, -1.0000, -1.0000,  1.0000,  1.0000,
          -1.0000, -0.0000, -1.0000,  1.0600])],
 [0])

In [46]:
def viewer(a):
    """Print a connect4.gamesolver.org link whose ``pos`` parameter encodes the moves in ``a``.

    Every entry of ``a`` is incremented by one and the results are concatenated
    into the query string in order.
    """
    base_url = "https://connect4.gamesolver.org/en/?pos="
    position = "".join(str(move + 1) for move in a)
    print(base_url + position)

In [47]:
# Print a solver link for each of the first five games recorded in buf2,
# then display the 'elo' entry stored with each of those games.
for game_idx in range(5):
    viewer(buf2.buffer[game_idx]['actions'])
[buf2.buffer[game_idx]['elo'] for game_idx in range(5)]

https://connect4.gamesolver.org/en/?pos=1444454416111122222233353
https://connect4.gamesolver.org/en/?pos=4444461741111122222523363
https://connect4.gamesolver.org/en/?pos=474444141111122222233
https://connect4.gamesolver.org/en/?pos=4445134442111112262223333355555666
https://connect4.gamesolver.org/en/?pos=241447444711111627232


[[1028.0, 970.0],
 [1028.0, 970.0],
 [1028.0, 970.0],
 [1028.0, 970.0],
 [1028.0, 970.0]]

In [None]:
# Reset the environment and keep the returned state in s for the manual stepping cells below.
s = env.reset()

In [None]:
# Let the BF pick a move from the flattened current state and apply it to the environment.
# This cell reads and overwrites s, so re-running it advances the game by another move.
# NOTE(review): the second argument to bf.step (1) is presumably the command — confirm against BF.step.
s, a, d, i = env.step(bf.step(s.flatten(), 1))
print(s)

In [None]:
# Manually apply a move given as a length-7 list with a single 1 at index 3, then show the new state.
s, a, d, i = env.step([0, 0, 0, 1, 0, 0, 0])
print(s)

In [97]:
# some fun visualizers
# elo
# Sweep the third argument of bf.steps from 0 to 1900 in steps of 100 and, for the
# state returned by env.reset(), print the value at index 3 of the first returned row.
a = env.reset()
for x in range(0, 2000, 100):
    print(x, bf.steps([a], 1, x)[0][3])


0 0.16947517
100 0.2030002
200 0.2503905
300 0.29866624
400 0.3451594
500 0.38811266
600 0.4266829
700 0.4607666
800 0.49070683
900 0.5170649
1000 0.5404376
1100 0.56134963
1200 0.5802656
1300 0.5975504
1400 0.6134952
1500 0.62832177
1600 0.64220375
1700 0.65528154
1800 0.6676629
1900 0.67943215


In [32]:
# Bucket 1000 single draws from np.random.power(1.5) into ten bins by the
# integer part of ten times the sample, then display the counts.
l = [0] * 10
for i in range(1000):
    sample = np.random.power(1.5, 1)[0]
    t1 = int(10 * sample)
    l[t1] += 1
l

[26, 62, 75, 95, 105, 97, 120, 133, 151, 136]

In [48]:
from deeplearning import league

In [60]:
# Create a League instance (class defined in deeplearning.league).
lel = league.League()

In [61]:
# Number of entries currently held in the league's buffer.
len(lel.buffer.buffer)

0

In [64]:
# Run one league season.
# NOTE(review): the recorded run of this cell ended with "NameError: name 'agent' is not defined";
# the failing line is not visible in this notebook — check deeplearning.league.
lel.play_season()

Random Play
0  vs  2
p1:  0.224 p2:  0.776 draw:  0.0
1  vs  1
p1:  0.483 p2:  0.505 draw:  0.012
2  vs  0
p1:  0.769 p2:  0.23 draw:  0.001
2  vs  1
p1:  0.704 p2:  0.292 draw:  0.004
1  vs  2
p1:  0.265 p2:  0.73 draw:  0.005
2  vs  1
p1:  0.698 p2:  0.298 draw:  0.004
1  vs  1
p1:  0.497 p2:  0.497 draw:  0.006
2  vs  1
p1:  0.699 p2:  0.297 draw:  0.004
0  vs  2
p1:  0.21 p2:  0.79 draw:  0.0
2  vs  1
p1:  0.714 p2:  0.28 draw:  0.006
2  vs  0
p1:  0.77 p2:  0.23 draw:  0.0
1  vs  2
p1:  0.317 p2:  0.681 draw:  0.002
1  vs  0
p1:  0.689 p2:  0.309 draw:  0.002
0  vs  1
p1:  0.329 p2:  0.67 draw:  0.001
1  vs  2
p1:  0.285 p2:  0.708 draw:  0.007
0  vs  2
p1:  0.225 p2:  0.775 draw:  0.0
0  vs  1
p1:  0.313 p2:  0.685 draw:  0.002
1  vs  1
p1:  0.483 p2:  0.502 draw:  0.015
0  vs  0
p1:  0.496 p2:  0.499 draw:  0.005
1  vs  1
p1:  0.497 p2:  0.491 draw:  0.012
1  vs  1
p1:  0.474 p2:  0.507 draw:  0.019
2  vs  1
p1:  0.706 p2:  0.293 draw:  0.001
0  vs  1
p1:  0.33 p2:  0.669 draw: 

NameError: name 'agent' is not defined