In [1]:
#Import Libraries
import gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
#Setting up agent and environment
class agent:
    """Tabular agent for gym's ``MountainCar-v0`` over a discretized observation.

    An observation ``(position, velocity)`` is mapped to a pair of bin indices
    with ``np.digitize`` against ``pos_space`` and ``vel_space``.  ``Q`` is a
    dict keyed by ``((pos_bin, vel_bin), action)`` with every entry
    initialised to 0.

    ``returns``, ``visited`` and ``policy`` are populated in ``__init__`` but
    are not read by any method of this class.
    """

    def __init__(self, n_bin_edges=20, max_episode_steps=1000):
        """Create the environment and the zero-initialised tables.

        Args:
            n_bin_edges: number of evenly spaced bin edges per observation
                dimension (default 20, giving 21 bins per dimension).
            max_episode_steps: value written to the environment's
                ``_max_episode_steps`` attribute (default 1000).
        """
        self.env = gym.make("MountainCar-v0")
        # NOTE(review): presumably this raises the step cap of the wrapper
        # returned by gym.make -- confirm against the installed gym version.
        self.env._max_episode_steps = max_episode_steps
        self.pos_space = np.linspace(-1.2, 0.6, n_bin_edges)
        self.vel_space = np.linspace(-.07, .07, n_bin_edges)
        self.action_space = [0, 1, 2]
        self.state_space = []
        self.Q = {}
        self.returns = {}
        self.visited = {}
        self.policy = {}

        # np.digitize returns indices in 0..len(edges) inclusive, hence the +1.
        for pos_bin in range(len(self.pos_space) + 1):
            for vel_bin in range(len(self.vel_space) + 1):
                state = (pos_bin, vel_bin)
                for action in self.action_space:
                    self.Q[(state, action)] = 0
                    self.returns[(state, action)] = 0
                    self.visited[(state, action)] = 0
                self.state_space.append(state)

        # Start from a uniformly random action for every discrete state.
        for state in self.state_space:
            self.policy[state] = np.random.choice(self.action_space)

    def get_state(self, observation):
        """Return the ``(pos_bin, vel_bin)`` indices for an observation.

        Args:
            observation: a two-element sequence ``(position, velocity)``.

        Returns:
            A tuple of two plain Python ints, matching the key type used when
            the tables were built in ``__init__``.
        """
        pos, vel = observation
        pos_bin = int(np.digitize(pos, self.pos_space))
        vel_bin = int(np.digitize(vel, self.vel_space))

        return pos_bin, vel_bin

    def max_action(self, state):
        """Return the action with the highest Q value in ``state``.

        Ties resolve to the earliest action in ``self.action_space``.  The
        argmax index is mapped back through ``self.action_space`` so the
        result is a real action even if the action list is not ``0..n-1``.
        """
        values = [self.Q[state, a] for a in self.action_space]
        return self.action_space[int(np.argmax(values))]
    

In [3]:
#hyperparameters
# On a fresh kernel `agent` is the class; after this cell has run once it is an
# instance.  Resolve the class either way so that re-running the cell builds a
# fresh agent instead of raising TypeError ('agent' object is not callable).
_agent_cls = agent if isinstance(agent, type) else type(agent)
agent = _agent_cls()
num_episodes = 100000  # number of training episodes
alpha = 0.1            # step size of the Q update
gamma = 0.99           # discount factor applied to the next state's Q value
eps = 1.00             # initial exploration probability; decayed by the training loop

In [4]:
total_reward = np.zeros(num_episodes)

EPS_MIN = 0.01                 # exploration never drops below this value
EPS_DECAY = 4/num_episodes     # amount subtracted from eps after every episode


def _epsilon_greedy(agent, state, eps):
    """Pick the greedy action with probability 1-eps, otherwise a random one."""
    if np.random.random() > eps:
        return agent.max_action(state)
    return agent.env.action_space.sample()


def _save_pickle(path, obj):
    """Pickle ``obj`` to ``path``; the file is closed even if dumping fails."""
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


#Algorithm starts
for i in range(num_episodes):
    obs = agent.env.reset()
    done = False
    if i%1000 == 0 and i>0:
        # `score` still holds the return of the episode that just finished.
        print("starting episode ",i,"score ",score,"eps ",eps)
        _save_pickle("rewards_sarsa", total_reward)
        _save_pickle("Q_sarsa", agent.Q)
    score = 0
    state = agent.get_state(obs)
    action = _epsilon_greedy(agent, state, eps)
    while not done:
        obs_,reward,done,info = agent.env.step(action)
        state_ = agent.get_state(obs_)
        action_ = _epsilon_greedy(agent, state_, eps)
        score += reward
        # A genuinely terminal next state has value 0, so the target is the
        # reward alone.  If the episode merely hit the step cap we keep
        # bootstrapping from Q(s', a').
        # NOTE(review): relies on the "TimeLimit.truncated" info key set by
        # gym's TimeLimit wrapper -- confirm the installed gym provides it.
        truncated = info.get("TimeLimit.truncated", False)
        if done and not truncated:
            target = reward
        else:
            target = reward + gamma*agent.Q[(state_,action_)]
        agent.Q[(state,action)] += alpha*(target-agent.Q[(state,action)])
        state = state_
        action = action_
    total_reward[i] = score
    # Linear decay, clamped so eps never undershoots the floor.
    eps = max(eps - EPS_DECAY, EPS_MIN)
    

starting episode  1000 score  -1000.0 eps  0.95999999999996
starting episode  2000 score  -1000.0 eps  0.91999999999992
starting episode  3000 score  -1000.0 eps  0.87999999999988
starting episode  4000 score  -1000.0 eps  0.83999999999984
starting episode  5000 score  -486.0 eps  0.7999999999998
starting episode  6000 score  -809.0 eps  0.75999999999976
starting episode  7000 score  -437.0 eps  0.71999999999972
starting episode  8000 score  -674.0 eps  0.67999999999968
starting episode  9000 score  -497.0 eps  0.63999999999964
starting episode  10000 score  -525.0 eps  0.5999999999996
starting episode  11000 score  -242.0 eps  0.55999999999956
starting episode  12000 score  -346.0 eps  0.51999999999952
starting episode  13000 score  -436.0 eps  0.47999999999950776
starting episode  14000 score  -247.0 eps  0.4399999999995233
starting episode  15000 score  -197.0 eps  0.3999999999995388
starting episode  16000 score  -322.0 eps  0.3599999999995543
starting episode  17000 score  -250.0 