<a href="https://colab.research.google.com/github/janekkp/CartPole-v0/blob/colab/CartPole_v0_Q_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import gym
import random
from collections import defaultdict


In [0]:
class Qlearn_agent():
  def __init__(self, env = gym.make('CartPole-v0'), alpha = 0.15, num_of_episodes = 1000, gamma = 1, epsilon = 1):
    self.env = env
    self.alpha = alpha
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_decay = 0.999
    self.num_of_episodes = num_of_episodes
    self.scores = []

    self.num_of_actions = self.env.action_space.n
    self.Q = defaultdict(lambda: np.zeros(self.num_of_actions))
    
    self.range_cart_position = np.linspace(-2.4, 2.4, num = 1)
    self.range_cart_velocity = np.linspace(-10, 10, num = 1)
    self.range_pole_angle = np.linspace(-15, 15, num = 800)
    self.range_pole_velocity = np.linspace(-20, 20, num = 90)
    
  def discretize(self, state):
    #Converting continous space into discrete
    cart_position, cart_velocity, pole_angle, pole_velocity = state
    
    cart_position = np.digitize(cart_position, self.range_cart_position)
    cart_velocity = np.digitize(cart_velocity, self.range_cart_velocity)
    pole_angle = np.digitize(pole_angle, self.range_pole_angle)
    pole_velocity = np.digitize(pole_velocity, self.range_pole_velocity)
        
    state = cart_position, cart_velocity, pole_angle, pole_velocity
    
    return state
  

  def train(self, num_of_episodes):
    #Training stage, where agent picks actions randomly. 
    #Update of q-values is done with standard Q-learing algorithm
    for episode in range(self.num_of_episodes):      
      s = self.discretize(self.env.reset())
      self.epsilon = self.epsilon*self.epsilon_decay
      while True:
        action = self.env.action_space.sample() if (random.random() <= self.epsilon) else np.argmax(self.Q[s])
        next_state, reward, end, _ = self.env.step(action)
        next_state = self.discretize(next_state)
        if end:
          self.Q[s][action] = reward
          break
        else:
          self.Q[s][action] += self.alpha * (reward + self.gamma * np.max(self.Q[next_state]) - self.Q[s][action])
          s = next_state
        
    return self.Q
  
  def play(self, state):
    #At this stage agent picks learned actions
    s = self.discretize(state)
    return np.argmax(self.Q[s])
  
  def ewaluate_playing(self):
    #CartPole is solved if mean score over 100 consecutive trials is >= 195
    for i in range(100):
      score = 0
      s = self.discretize(self.env.reset())
      end = False
      while not end:
        a = self.play(s)
        s, r, end, _ = self.env.step(a)
        score += r
        if end:
          self.scores.append(score)
          break
      return np.mean(self.scores)
    


  

In [379]:
def main():
  player = Qlearn_agent()
  player.train(player.num_of_episodes)
  print('Qlearn_agent obtained {} score over 100 consecutive trials, after {} episodes of training'.format(player.ewaluate_playing(), player.num_of_episodes))
  
if __name__ == '__main__':
  main()
    

Qlearn_agent obtained 200.0 score over 100 consecutive trials, after 1000 episodes of training
