In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class Net(nn.Module):
    """Small fully connected regressor producing one value per input vector.

    Architecture: ``input_dimension -> 10 -> 5 -> 1`` with ReLU after the two
    hidden layers. Weights are Xavier-uniform initialised, biases start at 0.

    Args:
        seed: Passed to ``torch.manual_seed`` right before the weights are
            initialised, so two nets built with the same seed start with
            identical parameters. This reseeds torch's *global* RNG.
        input_dimension: Length of each input vector (default 6).
    """

    # Step size used by the Adam optimizer that fit() creates.
    LEARNING_RATE = 0.01

    def __init__(self,seed,input_dimension=6):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dimension, 10)
        self.fc2 = nn.Linear(10, 5)
        self.fc3 = nn.Linear(5, 1)
        # Created on the first fit() call and reused afterwards so Adam's
        # running moment estimates survive between calls.
        self._optimizer = None
        torch.manual_seed(seed)
        # Same initialisation order as the layers are applied in forward().
        for layer in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the network output for ``x`` (last dim = ``input_dimension``)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

    def fit(self, input_ls, label_ls):
        """Run one Adam/MSE update per ``(input, label)`` pair, in order.

        Args:
            input_ls: Sequence of input vectors (lists, arrays or tensors).
            label_ls: Sequence of scalar regression targets, same length.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(input_ls) != len(label_ls):
            raise ValueError(
                "input_ls and label_ls must have the same length "
                f"(got {len(input_ls)} and {len(label_ls)})"
            )
        criterion = nn.MSELoss()
        if self._optimizer is None:
            self._optimizer = optim.Adam(self.parameters(), lr=self.LEARNING_RATE)
        optimizer = self._optimizer
        for features, target in zip(input_ls, label_ls):
            inputs = torch.as_tensor(features, dtype=torch.float32).unsqueeze(0)
            optimizer.zero_grad()
            outputs = self(inputs)
            # Labels are constants: detach them (they may come from another
            # network) and give them the output's shape so MSELoss does not
            # have to broadcast (which triggers a size-mismatch warning).
            labels = (
                torch.as_tensor(target, dtype=torch.float32)
                .detach()
                .reshape(outputs.shape)
            )
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    def train(self,input_ls=True,label_ls=None):
        """Fit on data, or toggle training mode like ``nn.Module.train``.

        This method name shadows ``nn.Module.train(mode)``, which
        ``nn.Module.eval()`` relies on. To keep both uses working:

        * ``train(input_ls, label_ls)`` trains on the samples (see ``fit``)
          and returns ``None``.
        * ``train()`` / ``train(mode)`` with no labels sets the module's
          training flag and returns ``self``, as ``nn.Module.train`` does.
        """
        if label_ls is None:
            return super(Net, self).train(input_ls)
        self.fit(input_ls, label_ls)
    
class Q_network:
    """Per-agent DQN state: an online net, a target net and a replay memory.

    Both networks are built from the same random seed, so they start with
    identical weights.

    Args:
        max_cap: Maximum number of transitions kept in ``replay_memory``;
            once full, appending evicts the oldest entry. ``None`` means
            unbounded.

    Attributes are documented inline below.
    """

    def __init__(self,max_cap=1000):
        seed = random.randint(0, 1000)
        # Network used to compute bootstrap targets.
        self.target_network = self.build_model(seed)
        # Network that is actually trained and used to pick actions.
        self.online_network = self.build_model(seed)
        # Capacity the replay memory was created with.
        self.max_cap = max_cap
        # Bounded FIFO buffer: deque drops the oldest item automatically
        # when maxlen is reached, so callers never need to trim it by hand.
        self.replay_memory = deque(maxlen=max_cap)

    def build_model(self,seed):
        """Return a freshly initialised ``Net`` seeded with ``seed``."""
        return Net(seed=seed)
    

In [12]:
from soccer import Soccer
# from neural_net import Q_network
import random
import numpy as np
import matplotlib.pyplot as plt
def epsilon_greedy(Q_network,state,epsilon,num_actions=6):
    """Pick an action with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the highest Q-value is returned, where each
    Q-value is the online network's output for ``state`` with the action
    index appended as the last feature.

    Args:
        Q_network: Object exposing ``online_network``, a callable that maps a
            float32 tensor of shape ``(num_actions, len(state) + 1)`` to one
            value per row.
        state: Scalar, sequence or array describing the current state; it is
            flattened to 1-D before use.
        epsilon: Exploration probability in ``[0, 1]``.
        num_actions: Number of discrete actions, indexed ``0..num_actions-1``.

    Returns:
        int: The chosen action index. Ties between equal Q-values go to the
        highest action index.
    """
    if random.random() < epsilon:
        return random.randint(0, num_actions - 1)

    flat_state = np.ravel(np.asarray(state, dtype=np.float32))
    # One row per candidate action: [state..., action_index].
    action_column = np.arange(num_actions, dtype=np.float32).reshape(-1, 1)
    batch = np.hstack((np.tile(flat_state, (num_actions, 1)), action_column))

    # Action selection is pure inference; no autograd graph is needed.
    with torch.no_grad():
        q_values = Q_network.online_network(torch.from_numpy(batch)).reshape(-1).tolist()

    return max(range(num_actions), key=lambda action: (q_values[action], action))

def _flat_vector(*parts):
    """Concatenate scalars, sequences or arrays into one flat float32 vector."""
    return np.concatenate([np.ravel(np.asarray(part, dtype=np.float32)) for part in parts])


def _store_transition(memory, transition, capacity):
    """Append ``transition`` to the deque ``memory``, keeping at most ``capacity`` items."""
    while len(memory) >= capacity:
        memory.popleft()
    memory.append(transition)


def _best_target_q(network, state, num_actions):
    """Return ``max_a network([state, a])`` as a plain float (no gradient)."""
    batch = np.stack([_flat_vector(state, action) for action in range(num_actions)])
    with torch.no_grad():
        return network(torch.from_numpy(batch)).max().item()


def _replay_batch(agent, batch_size, gamma, num_actions):
    """Sample ``batch_size`` transitions and build (inputs, targets) training lists."""
    inputs, targets = [], []
    for state, action, reward, next_state, done in random.sample(list(agent.replay_memory), batch_size):
        target = float(reward)
        if not done:
            # One-step bootstrap from the (periodically synced) target network.
            target += gamma * _best_target_q(agent.target_network, next_state, num_actions)
        inputs.append(_flat_vector(state, action))
        targets.append(target)
    return inputs, targets


def main():
    """Play Soccer episodes between two Q-network agents and plot cumulative wins.

    Every step both agents act epsilon-greedily on the joint observation
    (state A, state B, ball owner), the transition is stored in each agent's
    replay memory, and player A's online network is trained on a random
    minibatch once enough transitions have been collected. Target networks
    are synced from the online networks every ``frequency`` steps of an
    episode. A win is counted for a player when the episode ends with that
    player's reward equal to 1.

    NOTE(review): the return layout of ``env.restart()`` / ``env.move()`` is
    taken from how this function unpacks them; confirm against ``Soccer``.
    """
    env = Soccer(drawProbability=0.01)
    player_A = Q_network()
    player_B = Q_network()
    num_episodes = 1000
    epochs = 100000            # upper bound on steps per episode
    frequency = 50             # target-network sync period, in steps
    epsilon = 0.1
    stochastic_param = 20      # minibatch size
    gamma = 0.9
    fill_memory = 100          # transitions to collect before training starts
    memory_capacity = 1000
    num_actions = 6
    # Player B's update was disabled in the original experiment; flip this to
    # train both agents. When off, B's minibatch is not even computed.
    train_player_B = False
    # Never sample more transitions than the memory holds.
    warmup = max(fill_memory, stochastic_param)
    wins_A = []
    wins_B = []
    for episode in range(num_episodes):
        current_state_A, current_state_B, BallOwner = env.restart()
        for epoch in range(epochs):
            # Same joint observation for acting, for the stored state and for
            # the stored next state, so every network input has one size.
            observation = _flat_vector(current_state_A, current_state_B, BallOwner)
            action_A = epsilon_greedy(player_A, observation, epsilon)
            action_B = epsilon_greedy(player_B, observation, epsilon)
            next_state_A, next_state_B, next_BallOwner, reward_A, reward_B, done_env = env.move(action_A, action_B)
            next_observation = _flat_vector(next_state_A, next_state_B, next_BallOwner)

            _store_transition(
                player_A.replay_memory,
                (observation, action_A, reward_A, next_observation, done_env),
                memory_capacity,
            )
            _store_transition(
                player_B.replay_memory,
                (observation, action_B, reward_B, next_observation, done_env),
                memory_capacity,
            )

            current_state_A = next_state_A
            current_state_B = next_state_B
            BallOwner = next_BallOwner  # keep ball possession in step with the env

            # Train only after warm-up; no `continue` here so the sync and the
            # end-of-episode handling below always run.
            if len(player_A.replay_memory) >= warmup:
                input_ls, label_ls = _replay_batch(player_A, stochastic_param, gamma, num_actions)
                player_A.online_network.train(input_ls, label_ls)
                if train_player_B:
                    input_ls, label_ls = _replay_batch(player_B, stochastic_param, gamma, num_actions)
                    player_B.online_network.train(input_ls, label_ls)

            if epoch % frequency == 0:
                player_A.target_network.load_state_dict(player_A.online_network.state_dict())
                player_B.target_network.load_state_dict(player_B.online_network.state_dict())

            if done_env:
                if reward_A == 1:
                    wins_A.append(reward_A)
                    wins_B.append(0)
                elif reward_B == 1:
                    wins_A.append(0)
                    wins_B.append(reward_B)
                else:
                    wins_A.append(0)
                    wins_B.append(0)
                break

    fig, ax = plt.subplots()
    ax.plot(np.cumsum(np.array(wins_A)), label='Player A')
    ax.plot(np.cumsum(np.array(wins_B)), label='Player B')
    ax.set_xlabel('Finished episode')
    ax.set_ylabel('Cumulative wins')
    ax.set_title('Cumulative wins per player')
    ax.legend()
    plt.show()


In [13]:
# Run the experiment: main() plays the episodes and shows the cumulative-wins plot.
main()

AttributeError: 'Soccer' object has no attribute 'reset'