### Rock - Paper - Scissors Deep-Q Model 

Outcome of game is a vector of lenght 6 with 4 zeros and 2 ones. 

The first 3 entries represent the humans choice, tha last 3 the computers choice. 

For example [1, 0, 0, 0, 1, 0] means Rock - Paper (Human - Computer).

In [1]:
import numpy as np
from keras import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam
from collections import deque
import random
from IPython.display import clear_output

Using TensorFlow backend.


In [2]:
n = 2              # how many steps into the past to consider
train_steps = 1000  # how many times to play the human

def randargmax(b,**kw):
    """ a random tie-breaking argmax"""
    return np.argmax(np.random.random(b.shape) * (b==b.max()), **kw)

In [3]:
class Human(object):
    """ Serves as an environment for the agent 
    """
    
    def __init__(self): 
        self.reward_table = np.array([[0, -1, 1],
                                      [1, 0, -1],
                                      [-1, 1, 0]])
    
    def step(self, action):
        # human_act = np.random.choice(3)
        human_act = input("Enter your action (0 = rock, 1 = paper, 2 = scissors)  ")
        human_act = int(human_act)
        observation = np.zeros(6)
        observation[human_act] = 1
        observation[3+action] = 1
        reward = self.reward_table[action, human_act]
        return observation, reward 

In [4]:
class Agent(object): 
    """ Rock - Paper - Scissors playing agent 
    """
    
    def __init__(self):
        
        self.memory = deque(maxlen = 50)
        
        self.epsilon = 0.5
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9
        
        self.gamma = 0.8
        self.tau = 0.125
        
        self.batch_size = 10
        self.learning_rate = 0.1
        
        self.model = self.create_model()
        self.target_model = self.create_model()
    
    def create_model(self):
        model = Sequential()
        model.add(LSTM(8, input_shape=(n,6), return_sequences=False))
        model.add(Dense(3, activation="relu"))
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        return model
    
    def act(self, state):
        _ = self.model.predict(np.expand_dims(state, axis = 0))[0]
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        if np.random.uniform() < self.epsilon: 
            return np.random.choice(3), _
        else:
            return randargmax(self.model.predict(np.expand_dims(state, axis = 0))[0]), _
        
    def remember(self, state, action, reward, new_state):
        self.memory.append([state, action, reward, new_state])
        
    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        
        states, states_new = [np.stack([b[i] for b in batch]) for i in [0, 3]]
        actions, rewards = [np.array([b[i] for b in batch]) for i in [1, 2]]
            
        max_Q = np.max(self.target_model.predict(states_new), axis = 1)
    
        targets = self.model.predict(states)
        targets[range(self.batch_size), actions] = rewards + self.gamma * max_Q # The game goes on and on, 
                                                                           # there is no terminal state 
                                                                           # so there is always the 
                                                                           # gamma * Q expression                    
        self.model.fit(states, targets, verbose = 0)   
    
    def target_update(self): 
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)
    

In [None]:
state = np.ones((n, 6))
human = Human()
agent = Agent()
computer_won = 0
human_won = 0

for i in range(train_steps):
    clear_output(wait=True)
    action, qs = agent.act(state)
    obs, reward = human.step(action)
    
    computer_won += max(reward, 0)
    human_won += max(-reward, 0)
    last_game_outcome = ["human won", "tie", "computer won"][reward+1]
    print(qs)
    print(last_game_outcome)
    print("Game: ", i)
    print("Computer won: ", computer_won)
    print("Human won: ", human_won)
    print("-----------------------------------------------------")
    state_new = np.concatenate([state, np.expand_dims(obs, axis=0)])[1:]
    agent.remember(state, action, reward, state_new)
    agent.replay()
    agent.target_update()
    
    

[1.3951781  0.46694255 0.        ]
computer won
Game:  18
Computer won:  11
Human won:  5
-----------------------------------------------------
