[View in Colaboratory](https://colab.research.google.com/github/hjjimmykim/prospectiveporpoise/blob/master/RNN_backup.ipynb)

Base iterated prisoner's dilemma game

# Libraries

In [0]:
# Standard
import numpy as np
import matplotlib.pyplot as plt
import copy

import tensorflow as tf

# cuda
use_cuda = 1

# Parameters

In [0]:
#Reward structure
T = 5 # Temptation
R = 3 # Reward
P = 1 # Penalty
S = 0 # Sucker

# Reward matrix (0 = cooperate, 1 = defect)
RM = np.zeros([2,2,2])
RM[0][0][0] = R
RM[0][0][1] = R
RM[0][1][0] = S
RM[0][1][1] = T
RM[1][0][0] = T
RM[1][0][1] = S
RM[1][1][0] = P
RM[1][1][1] = P

# RL
gamma = 1
alpha = 0.1

# Tensorflow stuff
n_eps = 5200       # Number of episodes
n_turns = 2      # Number of turns per episode
n_hidden  = 15
layers = 1
epsilon = 0.2       # Exploration parameter (epsilon-greedy)
epsilon_dr = 0.9999
input_dim = 2


# 0-layer Deep Q-Network

In [31]:
# 1. Experience replay
# 2. Clamped Q-network

tf.reset_default_graph()
class RNN():
  
  def __init__(self, input_dim, n_hidden):
    # x = e.g. [[0,0], [-1,1], ...]
    # h = LSTM(x) (1 x n_hidden)
    # output = Wh + b (1 x 2)
    
    self.input_dim = input_dim
    
    # Weight initialization
    self.W = tf.Variable(tf.random_normal([n_hidden, input_dim]))
    self.b = tf.Variable(tf.random_normal([input_dim]))
    
    self.rnn_cell = tf.contrib.rnn.LSTMCell(n_hidden,state_is_tuple=True)

  def forward(self, x):
    # generate prediction
    outputs, states = tf.nn.dynamic_rnn(self.rnn_cell, x, dtype=tf.float32)

    # there are input_dim outputs but
    # we only want the last output
    outputs = tf.reshape(outputs,[n_turns,n_hidden])
    last_output = tf.reshape(outputs[-1],[1,n_hidden])

    return tf.matmul(last_output, self.W) + self.b
  
class FWN():
  
  def __init__(self, input_dim, n_hidden, output_dim):
    # x = e.g. [[0,0], [-1,1], ...]
    # h = LSTM(x) (1 x n_hidden)
    # output = Wh + b (1 x 2)
    
    self.input_dim = input_dim
    
    # Weight initialization
    self.W1 = tf.Variable(tf.random_normal([input_dim, n_hidden]))
    self.b1 = tf.Variable(tf.random_normal([n_hidden]))
    
    self.W2 = tf.Variable(tf.random_normal([n_hidden, output_dim]))
    self.b2 = tf.Variable(tf.random_normal([output_dim]))
    
  def forward(self, x):
    x = tf.matmul(x, self.W1) + self.b1
    x = tf.nn.sigmoid(x)
    
    x = tf.matmul(x, self.W2) + self.b2
    
    return x

#tf.reset_default_graph()
sess = tf.InteractiveSession() # Initialize session

# Define Q-networks
RNN1 = FWN(n_turns*2, n_hidden, 2)
RNN2 = FWN(n_turns*2, n_hidden, 2)

'''
# Feedforward operation (RNN)
nextQ1 = tf.placeholder(shape=[n_turns,1],dtype=tf.float32)
x1 = tf.placeholder(shape=[None,n_turns,2],dtype=tf.float32)
Qout1 = RNN1.forward(x1)
  
#nextQ2 = tf.placeholder(shape=[n_turns,1],dtype=tf.float32)
#x2 = tf.placeholder(shape=[None,n_turns,2],dtype=tf.float32)
#Qout2 = RNN2.forward(x2)

# Pick Q-values for selected actions
action_history = tf.placeholder(shape=[n_turns,2],dtype=tf.int32)

#Qout1_chosen = np.choose(action_history[:,0],Qout1)
# Qout2_chosen = np.choose(action_history[:,1],Qout2)

indeces = tf.range(0, tf.shape(action_history[:,0])[0])*Qout1_chosen.shape[1] + action_history[:,0]
Qout1_train = tf.gather(tf.reshape(Qout1_chosen, [-1]), action_history[:,0])
'''

# Feedforward operation (FWN)
x1 = tf.placeholder(shape=[None,n_turns*2],dtype=tf.float32)
Qout1 = RNN1.forward(x1)

action_history = tf.placeholder(shape=[1],dtype=tf.int32) # Chosen action
Qout1_train = Qout1[action_history[0]]

# Loss and optimizer
nextQ1 = tf.placeholder(shape=[1,1],dtype=tf.float32)
cost1 = tf.reduce_sum(tf.square(nextQ1-Qout1_train))
optimizer1 = tf.train.GradientDescentOptimizer(learning_rate=alpha).minimize(cost1)

#cost2 = tf.reduce_sum(tf.square(nextQ2-Qout2))
#optimizer2 = tf.train.RMSPropOptimizer(learning_rate=alpha).minimize(cost2)

sess.run(tf.global_variables_initializer()) # Initialize variables

blah = sess.run(Qout1, feed_dict={x1:np.reshape(np.array([-1,-1,-1,-1]),[1,4])})
print(blah)

[[-1.3643465 -0.5344136]]




# Game

In [28]:
for i_ep in range(n_eps):
  # History of outcomes (states -1 = unknown, 0 = coop, 1 = defect)
  history = -np.ones([n_turns,2]) # Start
  
  # Keep track of calculated Q-values
  Q_pred1 = np.zeros([n_turns,2])
  Q_pred2 = np.zeros([n_turns,2])
  
  # Keep track of rewards
  rewards1 = np.zeros([n_turns,1])
  rewards2 = np.zeros([n_turns,1])
  
  for i_turn in range(n_turns):
    
    # Calculate Q-values
    history1 = np.array(history)
    history2 = np.array([outcome[::-1]] for outcome in history)
    
    state1 = history
    state2 = np.flip(state1,axis=1)
    '''
    state1 = np.reshape(state1,[1,n_turns,2])
    state2 = np.reshape(state2,[1,n_turns,2])
    '''
    
    # For feedforward
    state1 = np.reshape(state1,[1,n_turns*2])
    state2 = np.reshape(state2,[1,n_turns*2])

    Q1 = sess.run(Qout1, feed_dict={x1: state1})
    #Q2 = sess.run(Qout2, feed_dict={x2: state2})
    
    #Q_pred1[i_turn] = Q1
    #Q_pred2[i_turn] = Q2
    
    # Epsilon-greedy sampling; 0 = cooperate, 1 = defect
    if np.random.rand(1) < epsilon:
      action1 = np.random.randint(2)
    else:
      action1 = np.argmax(Q1)
      
    #if np.random.rand(1) < epsilon:
    #  action2 = np.random.randint(2)
    #else:
    #  action2 = np.argmax(Q2)
    if i_turn == 0 or history[i_turn-1][0] == 0:
      action2 = 0
    else:
      action2 = 1
      
    # Get rewards
    reward1 = RM[action1][action2][0]
    reward2 = RM[action1][action2][1]
    
    rewards1[i_turn] = reward1
    rewards2[i_turn] = reward2
    
    # Next state
    history_next = copy.deepcopy(history)
    history_next[i_turn] = [action1, action2]
    state1_next = np.reshape(history_next,[1,n_turns*2])
    
    # Q-learning (FWN)
    Q1_next = sess.run(Qout1, feed_dict={x1: state1_next})
    Q_target1 = reward1 + gamma * np.amax(Q1_next,axis=1,keepdims=1)
    print('blahblah')
    print(Q1_next)
    action_hist = np.reshape(history[-1][0],[1])
    
    sess.run(optimizer1, feed_dict={x1:state1, nextQ1:Q_target1, action_history:action_hist})
    
    # Update history
    history = copy.deepcopy(history_next)
    
  # Q-learning
  
  # Q_{s+1}
  if i_ep % 100 == 0:
    print(np.sum(rewards1))
    print(history[:,0])
  
  '''
  Q_pred_shift1 = np.concatenate((Q_pred1[1:,:],np.zeros([1,2])),axis=0)
  Q_target1 = rewards1 + gamma * np.amax(Q_pred_shift1,axis=1,keepdims=1)
  
  Q_pred_shift2 = np.concatenate((Q_pred2[1:,:],np.zeros([1,2])),axis=0)
  Q_target2 = rewards2 + gamma * np.amax(Q_pred_shift2,axis=1,keepdims=1)
  
  action_hist = history
  action_hist = action_hist.astype(int)
  
  sess.run(optimizer1, feed_dict={x1:state1, nextQ1:Q_target1, action_history:action_hist})
  #sess.run(optimizer2, feed_dict={x2:state2, nextQ2:Q_target2})
  '''
  
  # Epsilon-greedy schedule
  epsilon = epsilon * epsilon_dr
  if i_ep == 5000:
    print(epsilon)
    epsilon = 0
    print('------------------------------------------------')
  
#sess.close()

blahblah
[[-122.17918 -122.17923]]
blahblah
[[-100.720764 -100.92345 ]]
6.0
[0. 0.]
blahblah
[[-126.16955 -126.16942]]
blahblah
[[-159.67073 -159.20793]]
blahblah
[[-135.3163  -135.31622]]
blahblah
[[-131.24127 -131.33414]]
blahblah
[[-129.38254 -129.38245]]
blahblah
[[-129.25758 -129.35043]]
blahblah
[[-126.87004 -126.86998]]
blahblah
[[-129.14386 -129.2367 ]]
blahblah
[[-126.1559  -126.15591]]
blahblah
[[-129.83047 -129.92331]]
blahblah
[[-126.29908  -126.299095]]
blahblah
[[-130.7529  -130.84575]]
blahblah
[[-126.7807  -126.78071]]
blahblah
[[-131.64029 -131.73315]]
blahblah
[[-127.33423 -127.33424]]
blahblah
[[-132.38423 -132.47711]]
blahblah
[[-127.83757 -127.83759]]
blahblah
[[-132.96011 -133.05301]]
blahblah
[[-128.24686 -128.24686]]
blahblah
[[-133.38202 -133.47491]]
blahblah
[[-128.55734 -128.55734]]
blahblah
[[-133.67818 -133.77106]]
blahblah
[[-128.78136 -128.78139]]
blahblah
[[-133.87866 -133.97156]]
blahblah
[[-128.93665 -128.93666]]
blahblah
[[-134.00996 -134.10286]]
blah

KeyboardInterrupt: ignored

In [24]:
state1 = np.array([[-1,-1],[-1,-1]])
state1 = np.reshape(state1,[1,4])
print(state1)
Q1 = sess.run(Qout1, feed_dict={x1: state1})
print(Q1)

state1 = np.array([[-1,-1],[1,1]])
state1 = np.reshape(state1,[1,4])
print(state1)
Q1 = sess.run(Qout1, feed_dict={x1: state1})
print(Q1)

state1 = np.array([[1,-1],[-1,1]])
state1 = np.reshape(state1,[1,4])
print(state1)
Q1 = sess.run(Qout1, feed_dict={x1: state1})
print(Q1)


[[-1 -1 -1 -1]]
[[-79.157776 -79.157074]]
[[-1 -1  1  1]]
[[-71.486206 -68.51609 ]]
[[ 1 -1 -1  1]]
[[-105.11052  -105.015366]]


In [156]:
history

array([[0., 0.],
       [1., 1.]])

In [6]:
Q_1

[[0.9433724908263161, 0.5659474045908255],
 [1.0956443850857551, 2.757813922629177],
 [0.5001720490563554, 0.6315677743814965],
 [1.7888472409760967, 9.999341552075956],
 [0, 0.1]]

In [0]:
p_1 = np.exp(Q_1[outcome][0])/np.sum(np.exp(Q_1[outcome]))

In [8]:
p_1

0.00027171249332988586

In [9]:
outcome

3

#Message Network

In [12]:
#Create a message
#First position is what you did last turn: 0 = c, 1 = d
#Next n many characters are a message from each agent
#m1 is agent 1 message and m2 is agent 2 message

n = 3
m1 = np.zeros(n+2)
m2 = np.zeros(n+2)

m1[0] = action1
m2[0] = action2
m1[1] = action2
m2[2] = action1

print(m1)

[1. 1. 0. 0. 0.]
