In [303]:
# Magnitude of the reward for a goal: Soccer.move multiplies its +1 / -1 goal
# outcome by this value and hands the two players opposite signs.
GOAL_REWARD = 100 
# Reward Soccer.move returns to both players for a move that does not score.
NORMAL_REWARD = -.01
# Reward Soccer._move returns to both players when a step is ended as a random draw.
DRAW_REWARD = 0

In [304]:
import numpy as np


class Soccer:
    '''
    Two-player grid soccer environment.

    Actions [0 : Left, 1 : Up, 2 : Right, 3 : Down, 4 : Stand]

    Positions are [x, y] pairs; a cell is on the board when 0 <= x < w and
    0 <= y < h. The goal mouths are the columns x == -1 and x == w, restricted
    to rows goalPositions[0] <= y <= goalPositions[1].

    Stepping methods return a 6-tuple
    (obs0, obs1, ballOwner, reward0, reward1, done). reward0 is +GOAL_REWARD
    (and reward1 is -GOAL_REWARD) when the ball is carried into x == -1; the
    signs flip for x == w.
    '''
    def __init__(self, h=4, w=5, pA=[3, 2], pB=[1, 1], goalPositions=[1, 2], ballOwner=0, drawProbability=0):
        self.h = h
        self.w = w
        self.goalPositions = np.array(goalPositions)
        # np.array copies the (list) defaults, so they are never mutated.
        self.positions = np.array([pA, pB])
        self.initPositions = np.array([pA, pB])
        self.ballOwner = ballOwner
        self.drawProbability = drawProbability
        #self.reward = 0
        self.rewards = np.array([0, 0])

    def _observation(self):
        '''
        Build the (obs0, obs1, ballOwner) part of every return value.

        NOTE(review): player 0's [x, y] pair is divided by the width and
        player 1's pair by the height. This looks like it was meant to scale
        x by w and y by h, but the original scaling is kept so existing
        callers see the same numbers - confirm intent.
        '''
        return self.positions[0] / self.w, self.positions[1] / self.h, self.ballOwner

    def reset(self, pA=None, pB=None, ballOwner=None):
        '''
        Put both players back on their start cells and return the observation.

        pA / pB, when given, permanently replace the stored start cells.
        When ballOwner is None the ball is handed to a random player.
        '''
        if pA is not None:
            self.initPositions[0] = pA

        if pB is not None:
            self.initPositions[1] = pB

        if ballOwner is None:
            ballOwner = self.choosePlayer()

        self.positions = self.initPositions.copy()
        self.ballOwner = ballOwner
        return self._observation()

    def _move(self, actionA, actionB):
        '''
        Play one joint step: both players act, in a random order.

        With probability drawProbability the step ends the episode as a draw
        (both rewards DRAW_REWARD) without moving anyone.
        '''
        if np.random.rand() < self.drawProbability:
            return self._observation() + (DRAW_REWARD, DRAW_REWARD, True)

        first = self.choosePlayer()
        actions = [actionA, actionB]
        outcome = self.move(first, actions[first])
        # A goal by the first mover ends the episode immediately. Previously
        # this result was discarded and the second move overwrote it, so the
        # goal was silently lost.
        if outcome[-1]:
            return outcome
        return self.move(1 - first, actions[1 - first])

    def move(self, player, action):
        '''
        Apply a single player's action and return the resulting 6-tuple.

        Moving onto the opponent's cell gives the ball to the opponent and the
        mover stays put. Carrying the ball into a goal mouth ends the episode.
        Any other move is applied only if the target cell is on the board.
        '''
        opponent = 1 - player

        newPosition = self.positions[player] + self.actionToMove(action)

        # If it's opponent position
        if (newPosition == self.positions[opponent]).all():
            self.ballOwner = opponent
        # If it's the goal. Compare by value: identity (`is`) only worked by
        # accident for small Python ints and fails for numpy integers.
        elif self.ballOwner == player and self.isInGoal(*newPosition) >= 0:
            # isInGoal is 1 for x == -1 and 0 for x == w -> +1 / -1.
            reward = -2 * (1 - self.isInGoal(*newPosition)) + 1
            reward *= GOAL_REWARD
            return self._observation() + (reward, -reward, True)
        # If it's in board
        elif self.isInBoard(*newPosition):
            self.positions[player] = newPosition
        return self._observation() + (NORMAL_REWARD, NORMAL_REWARD, False)

    def actionToMove(self, action):
        '''Map an action index to its [dx, dy] offset (None for unknown actions).'''
        switcher = {
            0: [-1, 0],
            1: [0, 1],
            2: [1, 0],
            3: [0, -1],
            4: [0, 0],
        }
        return switcher.get(action)

    def isInGoal(self, x, y):
        '''Return 1 for the x == -1 goal, 0 for the x == w goal, -1 otherwise.'''
        g1, g2 = self.goalPositions
        if (g1 <= y <= g2):
            if x == -1:
                return 1
            elif x == self.w:
                return 0
        return -1

    def isInBoard(self, x, y):
        '''Return True when (x, y) is a cell of the w-by-h board.'''
        return (0 <= x < self.w and 0 <= y < self.h)

    def choosePlayer(self):
        '''Pick player 0 or 1 uniformly at random.'''
        return np.random.randint(0, 2)

    def draw(self, positions=None, ballOwner=None):
        '''
        Print the board, top row first. The ball owner is shown in upper case
        ('A' / 'B'), the other player in lower case, empty cells as '-'.
        '''
        positions = self.positions if positions is None else np.array(positions)
        ballOwner = self.ballOwner if ballOwner is None else ballOwner

        board = ''
        for y in range(self.h)[::-1]:
            for x in range(self.w):
                if ([x, y] == positions[0]).all():
                    board += 'A' if ballOwner == 0 else 'a'
                elif ([x, y] == positions[1]).all():
                    board += 'B' if ballOwner == 1 else 'b'
                else:
                    board += '-'
            board += '\n'

        print(board)



In [305]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class Net(nn.Module):
    """Small fully connected network: input_dimension -> 10 -> 5 -> 1 with ReLU.

    Weights are Xavier-uniform initialised and biases zeroed after seeding
    torch with `seed`, so two nets built with the same seed start identical.
    """

    def __init__(self,seed,input_dimension=6):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dimension, 10)
        self.fc2 = nn.Linear(10, 5)
        self.fc3 = nn.Linear(5, 1)
        torch.manual_seed(seed)
        for layer in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
        # Created on first use and then kept, so Adam's running moment
        # estimates survive across train() calls.
        self._optimizer = None

    def forward(self, x):
        """Return the scalar output for input tensor x (last dim = input_dimension)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def train(self, input_ls=True, label_ls=None, verbose=True, mode=None):
        """Fit the network on (input, label) pairs, one Adam step per pair.

        This method shadows nn.Module.train(mode). To keep eval() and train()
        working, a call without labels is forwarded to the base class:
        net.train(), net.train(False) and net.train(mode=...) only switch the
        training flag and return self.

        Args:
            input_ls: sequence of input vectors (anything torch.tensor accepts).
            label_ls: sequence of targets, one per input; each is converted to
                a float tensor, detached and reshaped to the output's shape.
            verbose: print first-layer weights before/after and every loss.
            mode: if given, behave exactly like nn.Module.train(mode).

        Raises:
            ValueError: if input_ls and label_ls differ in length.
        """
        if mode is not None:
            return super().train(mode)
        if label_ls is None:
            return super().train(input_ls)

        if len(input_ls) != len(label_ls):
            raise ValueError("input_ls and label_ls must have the same length")

        criterion = nn.MSELoss()
        if self._optimizer is None:
            self._optimizer = optim.Adam(self.parameters(), lr=0.01)
        optimizer = self._optimizer

        if verbose:
            print("Before training")
            print(self.fc1.weight)
        for features, label in zip(input_ls, label_ls):
            inputs = torch.tensor(features,dtype=torch.float)
            optimizer.zero_grad()
            outputs = self.forward(inputs)
            # Targets are constants for the regression: detach them so no
            # gradient flows back into whatever network produced them.
            labels = torch.as_tensor(label, dtype=torch.float).detach().reshape(outputs.shape)
            loss = criterion(outputs, labels)
            if verbose:
                print(loss.item())
            loss.backward()
            optimizer.step()
        if verbose:
            print("After training")
            print(self.fc1.weight)
    
class Q_network:
    """Agent state: an online net, an identically initialised target net and a replay buffer.

    Args:
        max_cap: maximum number of transitions kept in replay_memory; once
            full, appending drops the oldest entry.
    """

    def __init__(self,max_cap=1000):
        # One shared seed so the target network starts as a copy of the online one.
        seed = random.randint(0, 1000)
        self.target_network = self.build_model(seed)
        self.online_network = self.build_model(seed)
        # max_cap used to be ignored, leaving the buffer unbounded.
        self.replay_memory = deque(maxlen=max_cap)

    def build_model(self,seed):
        """Create a Net whose initial weights are determined by `seed`."""
        return Net(seed=seed)
    

In [306]:
# Size of the discrete action set; Soccer.actionToMove defines actions 0..4.
num_actions= 5

In [308]:
import random
import numpy as np
import matplotlib.pyplot as plt
def epsilon_greedy(Q_network,state,epsilon,n_actions=None):
    """Pick an action epsilon-greedily with respect to the online network.

    Args:
        Q_network: agent exposing `online_network.forward(tensor)`.
        state: 1-D numpy state vector; the action index is appended to it
            before it is fed to the network.
        epsilon: probability of returning a uniformly random action.
        n_actions: number of discrete actions; defaults to the module-level
            `num_actions`.

    Returns:
        int: the chosen action index. Ties between equal Q-values go to the
        highest index.
    """
    if n_actions is None:
        n_actions = num_actions

    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)

    q_values = []
    # Action selection needs no gradients; skip building autograd graphs.
    with torch.no_grad():
        for action in range(n_actions):
            state_action = np.concatenate((state, np.array([action])), axis=0)
            q_value = Q_network.online_network.forward(torch.Tensor(state_action))
            q_values.append(float(q_value))
    # Arg-max over plain floats instead of sorting (tensor, index) tuples.
    return max(range(n_actions), key=lambda action: (q_values[action], action))

# --- Experiment configuration -------------------------------------------------
env = Soccer(drawProbability=0.01)
player_A = Q_network()
player_B = Q_network()
num_episodes = 100
epochs = 500            # maximum number of steps per episode
frequency = 10          # steps between target-network synchronisations
epsilon=0.2             # exploration rate of player B
stochastic_param=10     # replay mini-batch size
gamma=1                 # discount factor
buffer_ln = 1000        # replay-buffer size threshold
fill_memory = 20        # steps of episode 0 that only fill the buffer
wins_A = []
wins_B = []
episode_ln=[]
cnt_epoch_ended = 0


def make_state(state_A, state_B, ball_owner):
    """Concatenate both players' observations and the ball owner into one vector."""
    return np.concatenate((state_A, state_B, np.array([ball_owner])), axis=0)


def td_training_batch(agent, transitions):
    """Turn replay transitions into (network inputs, one-step TD targets).

    Targets are computed under torch.no_grad() so they are constants and no
    gradient reaches the target network.
    """
    inputs = []
    labels = []
    with torch.no_grad():
        for state, action, reward, next_state, done in transitions:
            if done:
                target = torch.tensor([reward],dtype=torch.float)
            else:
                next_q = [
                    agent.target_network.forward(
                        torch.Tensor(np.concatenate((next_state,[a]),axis=0)))
                    for a in range(num_actions)
                ]
                target = (reward + gamma*torch.stack(next_q).max()).reshape(1)
            inputs.append(np.concatenate((state,[action]),axis=0))
            labels.append(target)
    return inputs, labels


# --- Training -----------------------------------------------------------------
# Player A is a fixed, uniformly random opponent (epsilon = 1): it stores no
# transitions and is never trained. Only player B learns.
for episode in range(num_episodes):
    current_state_A,current_state_B,BallOwner = env.reset()
    for epoch in range(epochs):
        state = make_state(current_state_A, current_state_B, BallOwner)
        action_A = epsilon_greedy(player_A, state, 1)
        action_B = epsilon_greedy(player_B, state, epsilon)
        next_state_A,next_state_B,next_BallOwner,reward_A,reward_B,done_env = env._move(action_A,action_B)
        next_state = make_state(next_state_A, next_state_B, next_BallOwner)

        if len(player_B.replay_memory) > buffer_ln:
            player_B.replay_memory.popleft()
        player_B.replay_memory.append((state, action_B, reward_B, next_state, done_env))

        # Advance the whole state. The ball owner used to be left stale, so
        # stored states and the policy input disagreed with the environment.
        current_state_A = next_state_A
        current_state_B = next_state_B
        BallOwner = next_BallOwner

        # The first steps of episode 0 only fill the buffer. This must not
        # skip the bookkeeping below: the old `continue` also skipped the
        # terminal check and kept playing a finished episode.
        warming_up = epoch < fill_memory and episode == 0
        if not warming_up:
            batch_size = min(len(player_B.replay_memory), stochastic_param)
            sample_for_B = random.sample(list(player_B.replay_memory), batch_size)
            input_ls, label_ls = td_training_batch(player_B, sample_for_B)
            print("Going to train for epoch and episode: ",epoch,episode)
            player_B.online_network.train(input_ls,label_ls)

        if epoch%frequency==0:
            player_B.target_network.load_state_dict(player_B.online_network.state_dict())

        if done_env:
            if episode%10==0:
                print("Episode number when done: ",episode)
            # Goal rewards are +/-GOAL_REWARD, not +/-1, so test the sign and
            # record one win per episode.
            if reward_A > 0:
                wins_A.append(1)
                wins_B.append(0)
            elif reward_B > 0:
                wins_A.append(0)
                wins_B.append(1)
            else:
                # Draw: nobody wins.
                wins_A.append(0)
                wins_B.append(0)
            episode_ln.append(epoch)
            break
        if epoch==epochs-1:
            # Step limit reached without a terminal state.
            wins_A.append(0)
            wins_B.append(0)
            cnt_epoch_ended+=1

# --- Results ------------------------------------------------------------------
plt.plot(np.cumsum(np.array(wins_A)),label='Player A')
plt.plot(np.cumsum(np.array(wins_B)),label='Player B')
plt.xlabel('Episodes')
plt.ylabel('No of Wins')
plt.legend()
plt.show()
plt.plot(episode_ln)
print(cnt_epoch_ended)


Going to train for epoch and episode:  20 0
Before training
Parameter containing:
tensor([[ 0.5318, -0.4451,  0.1259,  0.4840,  0.3615, -0.2347],
        [ 0.1109, -0.5731,  0.1577,  0.4733, -0.2258,  0.1420],
        [ 0.1724, -0.4771, -0.0454,  0.2288,  0.0880,  0.4505],
        [ 0.0197, -0.3532,  0.0087,  0.2001, -0.1273,  0.4903],
        [-0.2138, -0.5713,  0.1511,  0.4509, -0.0986,  0.4635],
        [-0.4411, -0.1203,  0.5858, -0.3386,  0.6009, -0.2461],
        [-0.6036, -0.5581, -0.3761, -0.4842, -0.0088, -0.4199],
        [ 0.4560, -0.2269, -0.1313,  0.0821,  0.2057,  0.4607],
        [ 0.5660, -0.2842,  0.4025, -0.5263,  0.4562,  0.1217],
        [ 0.4876,  0.0521,  0.2647,  0.4566,  0.0184,  0.3586]],
       requires_grad=True)
9.999980829888955e-05
0.20125292241573334
0.0065110535360872746
0.004373324569314718
0.9269026517868042
9937.666015625
8.323508262634277
0.4705886244773865
0.6236046552658081
0.8133127689361572
After training
Parameter containing:
tensor([[ 0.5524, -

KeyboardInterrupt: 

In [None]:
# Mean length (in steps) of the episodes that reached a terminal state.
# episode_ln is empty when every episode hit the step limit, so guard the division.
if episode_ln:
    print(sum(episode_ln)/len(episode_ln))
else:
    print("No episode reached a terminal state; mean episode length is undefined.")

50.29
