In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Network
import torch
from torch import autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# Optimizer
import torch.optim as optim

In [3]:
# Game setup
num_agents = 2   # Number of agents playing the game
num_types = 3    # Number of item types
max_item = 5     # Maximum number of each item in a pool
max_utility = 10 # Maximum utility value for agents
num_games = 128  # Number of games per episode

# Turn sampling
lam = 7     # Poisson parameter
max_N = 10  # Maximum number of turns
min_N = 4   # Minimum number of turns

# Linguistic channel
num_vocab = 10   # Symbol vocabulary size for linguistic channel
len_message = 6  # Linguistic message length

# Appendix
lambda1 = 0.05  # Entropy regularizer for pi_term, pi_prop
lambda2 = 0.001 # Entropy regularizer for pi_utt
smoothing_const = 0.7 # Smoothing constant for the exponential moving average baseline

In [4]:
# Sample an item pool for a game
def create_item_pool(num_types, max_item, batch_size):
    # Possible to have zero items?
    pool = np.random.randint(0, max_item+1, (batch_size,num_types))
    return torch.from_numpy(pool).long()
        
# Sample agent utility
def create_agent_utility(num_types, max_utility, batch_size):
    utility = np.zeros((batch_size,num_types)) # Initialize zero vector
    
    while 0 in np.sum(utility,1): # At least one item has to have non-zero utility
        utility = np.random.randint(0, max_utility+1, [batch_size, num_types])

    return torch.from_numpy(utility).long()


In [9]:
class combined_policy(nn.Module):
    def __init__(self, embedding_dim = 100, batch_size = 128, num_layers = 1, bias = True, batch_first = False, dropout = 0, bidirectional = False):
        super(combined_policy, self).__init__()
        # Save variables
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.log_p = 0
        
        # Encoding -------------------------------------------------------------
        
        # Numerical encoder
        self.encoder1 = nn.Embedding(max_utility+1, embedding_dim)
        # Linguistic encoder
        self.encoder2 = nn.Embedding(num_vocab+1, embedding_dim)
        
        # Item context LSTM
        self.lstm1 = nn.LSTM(embedding_dim, embedding_dim, num_layers, bias, batch_first, dropout, bidirectional)
        # Linguistic LSTM
        self.lstm2 = nn.LSTM(embedding_dim, embedding_dim, num_layers, bias, batch_first, dropout, bidirectional)
        # Proposal LSTM
        self.lstm3 = nn.LSTM(embedding_dim, embedding_dim, num_layers, bias, batch_first, dropout, bidirectional)
        
        # Feed-forward
        self.ff = nn.Linear(3*embedding_dim, embedding_dim)
        
        # Policy ---------------------------------------------------------------
        
        # Termination policy
        self.policy_term = nn.Linear(embedding_dim, 1)
        # Linguistic policy
        self.policy_ling = nn.LSTM(embedding_dim, embedding_dim, num_layers, bias, batch_first, dropout, bidirectional)
        self.ff_ling = nn.Linear(embedding_dim, num_vocab)
        # Proposal policies
        self.policy_prop = []
        for i in range(num_types):
            ff = nn.Linear(embedding_dim, max_item)
            self.policy_prop.append(ff)
        
    def forward(self, x, test, batch_size=128):
        # Variables
        self.batch_size = batch_size
        
        # Extract inputs ------------------------------------------------------------
        
        # Item context
        x1 = x[0]
        # Previous linguistic message
        x2 = x[1]
        # Previous proposal
        x3 = x[2]  

        # Encoding ------------------------------------------------------------------

        # Initial embedding
        x1 = self.encoder1(x1).transpose(0,1)
        x2 = self.encoder2(x2).transpose(0,1)
        x3 = self.encoder1(x3).transpose(0,1) # Same as item context
            
        # LSTM for item context
        h = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial hidden
        c = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial cell

        for i in range(x1.size()[0]):
            _, (h,c) = self.lstm1(x1[i].view(1,self.batch_size,self.embedding_dim),(h,c))
        x1 = h
        
        # LSTM for linguistic
        h = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial hidden
        c = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial cell

        for i in range(x2.size()[0]):
            _, (h,c) = self.lstm2(x2[i].view(1,self.batch_size,self.embedding_dim),(h,c))
        x2 = h
        
        # LSTM for proposal
        h = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial hidden
        c = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial cell

        for i in range(x3.size()[0]):
            _, (h,c) = self.lstm2(x3[i].view(1,self.batch_size,self.embedding_dim),(h,c))
        x3 = h

        # Concatenate side-by-side
        x = torch.cat([x1,x2,x3],2)

        # Feedforward
        h = self.ff(x)
        h = F.relu(h) # Hidden layer input for policy networks
        
        # Policy ------------------------------------------------------------------

        # Termination -----------------------------------------------
        p_term = F.sigmoid(self.policy_term(h)).view(self.batch_size,1)
        
        # Entropy
        entropy_term = torch.zeros(1,requires_grad=True)
        entropy_term = -(p_term * p_term.log2()) - (torch.ones(self.batch_size,1)-p_term * (torch.ones(self.batch_size,1)-p_term.log2()))
    
        if test:
            # Greedy
            term = torch.round(p_term).long()
        else:
            # Sample
            term = torch.bernoulli(p_term).long()
            
        # log p for REINFORCE
        log_p_term = torch.zeros(self.batch_size,1,requires_grad=True)
        log_p_term = (term.float() * p_term + (torch.ones(self.batch_size,1)-term.float()) * (torch.ones(self.batch_size,1)-p_term)).log2()
        
        # Linguistic construction ----------------------------------
        h = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial hidden state
        c = torch.zeros(1,self.batch_size,self.embedding_dim) # Initial cell state
        letter = torch.zeros(self.batch_size,1).long() # Initial letter (dummy)
        entropy_letter = torch.zeros([self.batch_size,len_message])
        
        # log p for REINFORCE 
        log_p_letter = torch.zeros([self.batch_size,1],requires_grad=True)

        message = torch.zeros(self.batch_size,len_message) # Message
        for i in range(len_message):
            embedded_letter = self.encoder2(letter)

            _, (h,c) = self.policy_ling(embedded_letter.view(1,self.batch_size,self.embedding_dim),(h,c))
            logit = self.ff_ling(h)
            p_letter = F.softmax(logit,dim=2).view(self.batch_size,num_vocab)

            entropy_letter[:,i] = -torch.sum(p_letter*p_letter.log2(),1)

            if test:
                # Greedy
                letter = p_letter.argmax(dim=1).view(self.batch_size,1).long()
            else:
                # Sample
                letter = torch.multinomial(p_letter,1).long()
                
            # Gather the probabilities for the letters we've picked
            probs = torch.gather(p_letter, 1, letter)
            log_p_letter = log_p_letter + probs.log2()
                
            message[:,i] = letter.squeeze()
            
        message = message.long()
        entropy_letter = torch.sum(entropy_letter,1)     
   
        # Proposal ----------------------------------------------
        p_prop = torch.zeros(num_types,self.batch_size,max_item)
        prop = torch.zeros([self.batch_size,num_types]).long()
        entropy_prop = torch.zeros([self.batch_size,num_types],requires_grad=True)
        
        # log p for REINFORCE 
        log_p_prop = torch.zeros([self.batch_size,1],requires_grad=True)
        
        for i in range(num_types):
            blah = F.sigmoid(self.policy_prop[i](h))
            p_prop[i] = F.sigmoid(self.policy_prop[i](h))
            
            entropy_prop[:,i] = -torch.sum(p_prop[i]*p_prop[i].log2(),1)
            
            if test:
                # Greedy
                prop[:,i] = p_prop[i].argmax(dim=1)
            else:
                # Sample
                prop[:,i] = torch.multinomial(p_prop,1)
                
            # Gather the probabilities for the letters we've picked
            probs = torch.gather(p_prop[i], 1, prop[:,i].view(self.batch_size,1))
            log_p_prop = log_p_prop + probs.log2()
            
        entropy_prop = torch.sum(entropy_prop,1) # Entropy for exploration        

        # Combine -----------------------------------------------------------------
        entropy_loss = torch.sum(entropy_term + entropy_prop + entropy_letter)
        self.log_p = log_p_term + log_p_letter + log_p_prop
        
        return (term,message,prop, entropy_loss)
    
# Calculate reward (self-interested)
def rewards_func(share, utility, pool, log_p, baseline):
    print('share = ' + str(share.size()))
    print('utility = ' + str(utility.size()))
    print('pool = ' + str(pool.size()))
    reward = torch.sum(utility*share,1)/torch.sum(utility*pool,1)
    
    reward_loss = torch.sum(log_p,1) # Sum over different policies
    reward_loss = reward_loss * (reward - baseline)
    reward_loss = reward_loss.mean() # Average over batches
    
    return reward, reward_loss
    
net = combined_policy()

In [10]:
x = torch.randint(0,max_item,[128,6]).long()
y = torch.randint(0,num_vocab,[128,6]).long()
z = torch.randint(0,max_item,[128,3]).long()

In [16]:
blah = net([x,y,z],True)

In [8]:
torch.manual_seed(5)

# Agents
Agents = []
for i in range(num_agents):
    Agents.append(combined_policy())
    
baselines = [0 for _ in range(num_agents)]
    
# Train REINFORCE
alpha = 0.001     # learning rate
N_ep = 50   # Number of episodes
num_games = 128 # Number of games per episode (batch size)

# Initialize optimizer to update the DQN
optimizers = []
for i in range(num_agents):
    optimizers.append(optim.Adam(Agents[i].parameters(), alpha))

# Loop over episodes
for i_ep in range(N_ep):
    # Setting up games -----------------------------------------------------------------------
    
    # Truncated Poisson sampling
    N = np.random.poisson(lam,num_games) 
    N = np.minimum(N,max_N)
    N = np.maximum(N,min_N)
    N = torch.from_numpy(N)
    
    # Setting
    pool = create_item_pool(num_types, max_item, num_games) # Item pool
    item_contexts = []
    for i in range(num_agents):
        utility = create_agent_utility(num_types, max_utility, num_games)
        item_contexts.append(torch.cat([pool,utility],1))
    
    # Initialization
    survivors = torch.ones(num_games).nonzero() # Everyone alive initially
    prev_messages = torch.zeros(num_games,len_message).long() # Previous linguistic messages
    prev_proposals = torch.zeros(num_games,num_types).long()   # Previous proposals
    num_alive = len(survivors)
    
    entropy_losses = [torch.zeros(1,requires_grad=True),torch.zeros(1,requires_grad=True)]
    rewards = [torch.zeros(num_games),torch.zeros(num_games)]
    reward_losses = [torch.zeros(num_games,requires_grad=True),torch.zeros(num_games,requires_grad=True)]
    reward_sums = []
    for i in range(num_agents):
        reward_sums.append(torch.zeros(1))
    
    # Play the games -------------------------------------------------------------------------
    for i_turn in range(max_N):
        # IDs
        id_1 = i_turn % 2    # Current player
        id_2 = int(not id_1) # Other player
        
        # Sieve
        N = N[survivors].view(num_alive,1)

        pool = pool[survivors].view(num_alive,num_types)
        prev_messages = prev_messages[survivors].view(num_alive,len_message)
        prev_proposals = prev_proposals[survivors].view(num_alive,num_types)
        for j in range(num_agents):
            item_contexts[j] = item_contexts[j][survivors].view(num_alive,num_types*2)
            rewards[j] = rewards[j][survivors].view(num_alive)
            reward_losses[j] = reward_losses[j][survivors].view(num_alive)
        
        # Agent that is playing
        Agent = Agents[id_1]             
        item_context = item_contexts[id_1]
        
        # Actually play the game
        term,prev_messages,proposals,entropy_loss = Agent([item_context,prev_messages,prev_proposals], True, num_alive)
        entropy_losses[id_1] = entropy_losses[id_1] + entropy_loss
        
        # Compute reward loss (assumes 2 agents) ------------------------------------
        finishers = term.squeeze().nonzero()
        
        share_2 = prev_proposals[finishers].squeeze() # Share of other   
        share_1 = pool[finishers].squeeze() - share_2 # Share of this agent
                                       
        utility_1 = item_contexts[id_1][:,num_types:]
        utility_1 = utility_1[finishers].squeeze()
        utility_2 = item_contexts[id_2][:,num_types:]
        utility_2 = utility_2[finishers].squeeze()
        
        pool_12 = pool[finishers].squeeze()
                                      
        # Calculate reward and reward losses
        r1, rl1 = rewards_func(share_1, utility_1, pool_12, Agent.log_p, baselines[id_1])
        r2, rl2 = rewards_func(share_2, utility_2, pool_12, Agents[id_2].log_p, baselines[id_2])

        # Add rewards and reward losses
        rewards[id_1] += r1
        rewards[id_2] += r2
        reward_losses[id_1] += rl1
        reward_losses[id_2] += rl2
        
        proposals = prev_proposals # Don't need previous proposals anymore
        
        # Compute loss function ------------------------------------------------------
        for i in range(num_agents):
            # optimize
            loss = reward_loss[i] + entropy_loss[i]
            optimizers[i].zero_grad()
            loss.backward()
            optimizers[i].step()
        
        # Remove finished games
        # In term, element = 1 means die
        term_N = (N <= i_turn).view(num_alive,1).long()
        # In survivors, element = 1 means live
        survivors = (term+term_N) == 0
        
        # Update baselines using corpses
        corpses = (term+term_N) != 0
        num_corpses = corpses.sum()
        if num_corpses > 0: # Ignore if nobody dead
            corpses = ((term+term_N) == 0).nonzero()[:,0]
            
            for i in range(num_agents):
                reward_sums[i] += rewards[i][corpses].view(num_corpses).sum() # Mean reward over finished games
                
        # Check if everyone's dead
        if survivors.sum() == 0: # If all games over, break episode
            # Update the baselines
            for i in range(num_agents):
                baselines[i] = smoothing_const * baselines[i] + (1-smoothing_const)*reward_sums[i]

            break;
            
        # Reshape
        survivors = ((term+term_N) == 0).nonzero()[:,0]
        num_alive = len(survivors) # Number of survivors

        print('i_turn = ' + str(i_turn))
        
    print('i_ep = ' + str(i_ep))

NameError: name 'entropy_loss' is not defined

In [None]:
80*6*100

In [None]:
survivors