In [1]:
#Import Libraries
import gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
#Setting up agent and environment
class agent:
    """Tabular agent for the gym "MountainCar-v0" environment.

    Holds the environment, the bin edges used to discretize the continuous
    (position, velocity) observation, and dictionary tables keyed by
    ((pos_bin, vel_bin), action): Q, returns and visited, all starting at 0.
    The policy maps each (pos_bin, vel_bin) state to one action and starts
    out uniformly random.
    """

    def __init__(self):
        self.env = gym.make("MountainCar-v0")
        # Overrides the env's step limit attribute -- presumably to allow
        # 1000-step episodes; confirm against the installed gym version.
        self.env._max_episode_steps = 1000

        # 20 bin edges per dimension; np.digitize maps a value onto 0..20,
        # so there are len(edges) + 1 = 21 bins per dimension.
        self.pos_space = np.linspace(-1.2,0.6,20)
        self.vel_space = np.linspace(-.07,.07,20)
        self.action_space = [0,1,2]

        n_pos_bins = len(self.pos_space) + 1
        n_vel_bins = len(self.vel_space) + 1
        self.state_space = [
            (p, v) for p in range(n_pos_bins) for v in range(n_vel_bins)
        ]

        # Every (state, action) key, ordered state-major then by action.
        state_actions = [
            (s, a) for s in self.state_space for a in self.action_space
        ]
        self.Q = {sa: 0 for sa in state_actions}
        self.returns = {sa: 0 for sa in state_actions}
        self.visited = {sa: 0 for sa in state_actions}

        # One random draw per state, taken in state_space order.
        self.policy = {
            s: np.random.choice(self.action_space) for s in self.state_space
        }

    def get_state(self,observation):
        """Discretize a (position, velocity) observation.

        Returns the pair (pos_bin, vel_bin) produced by np.digitize against
        self.pos_space and self.vel_space respectively.
        """
        position, velocity = observation
        return (
            np.digitize(position, self.pos_space),
            np.digitize(velocity, self.vel_space),
        )
    
    

In [3]:
#hyperparameters
# `agent` names the class until this cell first runs and the instance
# afterwards, so a plain `agent()` raises TypeError on re-run. Recover the
# class from whichever object the name currently holds, so the cell can be
# re-executed to start from a fresh agent.
_agent_cls = agent if isinstance(agent, type) else type(agent)
agent = _agent_cls()
gamma = 1             # discount factor applied when accumulating returns
eps = 1.00            # probability of a random (non-greedy) policy update
num_episode = 100000  # number of training episodes

In [4]:
rewards = np.zeros(num_episode)


def _save_checkpoint():
    """Pickle the reward history, the policy and the Q-table to the working directory."""
    for filename, payload in (("rewards_montecarlo", rewards),
                              ("policy_montecarlo", agent.policy),
                              ("Q_montecarlo", agent.Q)):
        # `with` closes the handle even if pickling raises.
        with open(filename,'wb') as fh:
            pickle.dump(payload,fh)


#Algorithm starts
# First-visit Monte Carlo control: roll out one episode with the current
# policy, compute the discounted return following every step, then update the
# running-mean Q estimate and the policy at each first-visited (state, action).
for i in range(num_episode):
    if i%1000 == 0 and i>0:
        # `score` still holds the total reward of the previous episode here.
        print("starting episode ",i,"score ",score,"eps ",eps)
        _save_checkpoint()

    # Uses the classic gym API as the original code does: reset() returns the
    # observation and step() returns (observation, reward, done, info).
    observation = agent.env.reset()
    observation = agent.get_state(observation)
    done = False
    score = 0
    memory = []
    while not done:
        action = agent.policy[observation]
        observation_,reward,done,info = agent.env.step(action)
        score += reward
        # One entry per step: the state acted in, the action, and the reward
        # that action earned. The terminal state gets no entry of its own,
        # since no action is taken there.
        memory.append((observation[0],observation[1],action,reward))
        observation = agent.get_state(observation_)
    rewards[i] = score

    # Walk the episode backwards so that G_t = r_{t+1} + gamma * G_{t+1}.
    # Adding the step's own reward before recording keeps each return aligned
    # with the step that produced it. For a constant per-step reward this
    # gives the same numbers as the previous bookkeeping; for any other reward
    # signal the old version dropped r_{t+1} and double-counted the final one.
    G = 0
    stateActionReturns = []
    for pos_bin,vel_bin,action,reward in reversed(memory):
        G = gamma*G + reward
        stateActionReturns.append((pos_bin,vel_bin,action,G))
    # Back to chronological order so the first occurrence is the first visit.
    stateActionReturns.reverse()

    # A set gives the same membership test as the list it replaces without
    # the linear scan per step.
    stateActionsVisited = set()
    for pos_bin,vel_bin,action,G in stateActionReturns:
        sa = ((pos_bin,vel_bin),action)
        if sa in stateActionsVisited:
            continue
        stateActionsVisited.add(sa)

        agent.visited[sa] += 1
        # Incremental mean of the first-visit returns seen for this pair.
        agent.returns[sa] += (1/agent.visited[sa])*(G - agent.returns[sa])
        agent.Q[sa] = agent.returns[sa]

        state = (pos_bin,vel_bin)
        rand = np.random.random()
        if rand < 1-eps:
            # Greedy update, breaking ties between equal Q-values at random.
            values = np.array([agent.Q[(state,a)] for a in agent.action_space])
            best = np.random.choice(np.where(values == values.max())[0])
            agent.policy[state] = agent.action_space[best]
        else:
            agent.policy[state] = np.random.choice(agent.action_space)

    # Linear decay, clamped so eps never drops below the 0.01 floor (the
    # previous form could undershoot it for one episode).
    eps = max(eps - 4/num_episode, 0.01)
    # Alternative (disabled) decay schedule kept from the original notebook:
#     if eps - 1e-7 > 0:
#         eps -= 1e-7
#     else:
#         eps = 0

# Persist the final state too; the periodic checkpoint alone never captures
# the episodes after the last multiple of 1000.
_save_checkpoint()
    
            

starting episode  1000 score  -1000.0 eps  0.95999999999996
starting episode  2000 score  -1000.0 eps  0.91999999999992
starting episode  3000 score  -1000.0 eps  0.87999999999988
starting episode  4000 score  -1000.0 eps  0.83999999999984
starting episode  5000 score  -1000.0 eps  0.7999999999998
starting episode  6000 score  -1000.0 eps  0.75999999999976
starting episode  7000 score  -1000.0 eps  0.71999999999972
starting episode  8000 score  -1000.0 eps  0.67999999999968
starting episode  9000 score  -389.0 eps  0.63999999999964
starting episode  10000 score  -284.0 eps  0.5999999999996
starting episode  11000 score  -1000.0 eps  0.55999999999956
starting episode  12000 score  -1000.0 eps  0.51999999999952
starting episode  13000 score  -173.0 eps  0.47999999999950776
starting episode  14000 score  -1000.0 eps  0.4399999999995233
starting episode  15000 score  -162.0 eps  0.3999999999995388
starting episode  16000 score  -249.0 eps  0.3599999999995543
starting episode  17000 score  

In [5]:
# #Test policy

# num_eps = 2000
# rewards = np.zeros(num_eps)
# totalReward = 0
# for i in range(num_eps):
#     observation = agent.env.reset()
#     observation = agent.get_state(observation)
#     done = False
#     while not done:
#         action = agent.policy[observation]
#         observation_,reward,done,info = agent.env.step(action)
#         observation_ = agent.get_state(observation_)
#         observation = observation_
#         totalReward += reward
#     rewards[i] = totalReward
#     totalReward = 0