<a href="https://colab.research.google.com/github/hwangsaeyeon/AAI-Web-Development/blob/main/assignmnet2/taxi_v3_modelfree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym
!pip install gym[toy_text]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 96 kB/s 
Installing collected packages: pygame
Successfully installed pygame-2.1.0


In [5]:
# Scratch cell: evaluates the float literal 1e-3 so its decimal form is displayed.
1e-3

0.001

In [20]:
import numpy as np
from collections import defaultdict
class Agent:
    """Tabular agent with epsilon-greedy action selection and two learning rules.

    Modes
    =====
    - "test_mode" (default): epsilon stays 0, so actions are purely greedy
      with respect to ``Q`` and ``step`` performs no update.
    - "mc_control": every-visit Monte Carlo control; Q is updated once per
      episode, when ``step`` is called with ``done=True``.
    - "q_learning": one-step Q-learning; Q is updated on every ``step``.

    Params
    ======
    - Q: mapping ``state -> sequence of 6 action values``; it is mutated in place
    - mode: one of the mode strings listed above
    """

    def __init__(self, Q, mode="test_mode"):
        self.Q = Q
        self.mode = mode
        self.n_actions = 6

        self.epsilon = 0  # greedy by default; the learning modes override this
        # Visit counts per (state, action); only used by mc_control.
        self.N = defaultdict(lambda: np.zeros(self.n_actions))

        if self.mode == 'mc_control':
            self.epsilon = 1
            self.gamma = 0.9
            self.sample = list()  # [state, action, reward] triples of the running episode
            self.alpha = 0.1
            self.k = 1  # epsilon is set to 1/k after each finished episode

        if self.mode == 'q_learning':
            self.epsilon = 1
            self.gamma = 0.9
            self.alpha = 1

    def select_action(self, state):
        """
        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # Epsilon-greedy: explore uniformly with probability epsilon,
        # otherwise exploit the current value estimates.
        if np.random.random() < self.epsilon:
            return np.random.choice(self.n_actions)
        return np.argmax(self.Q[state])

    def step(self, state, action, reward, next_state, done):
        """
        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        if self.mode == 'mc_control':
            # Buffer the transition; learning happens only at episode end.
            self.sample.append([state, action, reward])
            if done:
                # Walk the episode backwards so the discounted return can be
                # built incrementally: G_t = R_{t+1} + gamma * G_{t+1}.
                # The return must be a single running value along the
                # trajectory, not one accumulator per (state, action) pair.
                G = 0.0
                for s, a, r in reversed(self.sample):
                    G = r + self.gamma * G
                    self.N[s][a] += 1
                    # Step size is alpha / N(s, a), i.e. the 1/N running-mean
                    # update additionally scaled by alpha.
                    self.Q[s][a] += self.alpha * ((1 / self.N[s][a]) * (G - self.Q[s][a]))
                # Decay exploration slowly: k grows by 0.001 per episode.
                self.epsilon = 1 / self.k
                self.k += 0.001
                self.sample = list()

        if self.mode == 'q_learning':
            # Q(S,A) <- Q(S,A) + alpha * [R + gamma * max_a Q(S',a) - Q(S,A)]
            td_target = reward + self.gamma * np.max(self.Q[next_state])
            self.Q[state][action] = self.Q[state][action] + self.alpha * (td_target - self.Q[state][action])
            # Exploration decays on every environment step, not per episode.
            self.epsilon *= 0.99



In [21]:

import gym
from collections import deque
import sys
from collections import defaultdict
import numpy as np
#from agent import Agent

# Build the Taxi-v3 environment shared by every routine in this cell.
env = gym.make('Taxi-v3')
action_size = env.action_space.n  # length of each Q-table row

# Report the sizes of the discrete action and state spaces.
print("Action Space", action_size)
print("State Space", env.observation_space.n)

def testing_without_learning():
    state = env.reset()
    total_rewards = 0

    def decode(i):
        out = []
        out.append(i % 4)
        i = i // 4
        out.append(i % 5)
        i = i // 5
        out.append(i % 5)
        i = i // 5
        out.append(i)
        return reversed(out)

    while True:
        env.render()
        print(list(decode(state)))
        print("0:down, 1:up, 2:right, 3:left, 4:pick, 5:dropoff")
        action = int(input("select action: "))
        while action not in [0,1,2,3,4,5]:
            action = int(input("select action: "))
        next_state, reward, done, _ = env.step(action)
        print("reward:", reward)
        total_rewards = total_rewards + reward
        if done:
            print("total reward:", total_rewards)
            break
        state = next_state



def model_free_RL(Q, mode, num_episodes=100000):
    """Train an Agent on the module-level ``env`` and report a rolling average.

    Params
    ======
    - Q: action-value table handed to the Agent; it is updated in place
    - mode: learning mode forwarded to Agent ("mc_control" or "q_learning")
    - num_episodes: number of training episodes to run (default 100000)

    From episode 100 onwards a progress line is rewritten in place with the
    mean reward of the most recent 100 episodes.
    """
    agent = Agent(Q, mode)
    last_100_episode_rewards = deque(maxlen=100)
    for i_episode in range(1, num_episodes + 1):

        state = env.reset()
        episode_rewards = 0

        while True:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            episode_rewards += reward
            if done:
                break

            state = next_state

        # Record each episode exactly once so the deque really holds the
        # last 100 episodes (a second append would halve the window).
        last_100_episode_rewards.append(episode_rewards)

        if i_episode >= 100:
            avg_reward = sum(last_100_episode_rewards) / len(last_100_episode_rewards)
            # NOTE: despite the label, this is the current rolling average,
            # not the best one seen so far.
            print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, avg_reward), end="")

    print()


def testing_after_learning(Q, mode):
    agent = Agent(Q, mode)
    n_tests = 100 
    total_test_rewards = []
    for episode in range(n_tests):
        state = env.reset()
        episode_reward = 0

        while True:
            action = agent.select_action(state)
            new_state, reward, done, _ = env.step(action)
            episode_reward += reward
    

            if done:
                total_test_rewards.append(episode_reward)
                break

            state = new_state
    print(total_test_rewards)
    print("avg: " + str(sum(total_test_rewards) / n_tests))


# Interactive driver: Q starts empty and is replaced by a fresh table each
# time a training option is chosen, so option 4 evaluates the most recent run.
Q = defaultdict(lambda: np.zeros(action_size))
while True:
    print()
    print("1. testing without learning")
    print("2. MC-control")
    print("3. q-learning")
    print("4. testing after learning")
    print("5. exit")
    try:
        menu = int(input("select: "))
    except ValueError:
        # Non-numeric input is treated like any other invalid menu choice.
        print("wrong input!")
        continue
    if menu == 1:
        testing_without_learning()
    elif menu == 2:
        Q = defaultdict(lambda: np.zeros(action_size))
        model_free_RL(Q, "mc_control")
    elif menu == 3:
        Q = defaultdict(lambda: np.zeros(action_size))
        model_free_RL(Q, "q_learning")
    elif menu == 4:
        testing_after_learning(Q, "test_mode")
    elif menu == 5:
        break
    else:
        print("wrong input!")

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


Action Space 6
State Space 500

1. testing without learning
2. MC-control
3. q-learning
4. testing after learning
5. exit
select: 2
Episode 100000/100000 || Best average reward 6.96

1. testing without learning
2. MC-control
3. q-learning
4. testing after learning
5. exit
select: 4
[8, 8, 7, 4, 4, 6, 7, 8, 7, 8, 5, 6, 9, 11, 5, 10, 9, 8, 12, 8, 8, 8, 7, 7, 9, 10, 5, 9, 5, 13, 8, 5, 7, 5, 4, 13, 5, 3, 8, 10, 3, 6, 10, 7, 1, 7, 8, 3, 4, 10, 11, 9, 10, 4, 14, 6, 5, 1, 9, 8, 4, 4, 7, 5, 9, 2, 8, 8, 9, 7, 12, 5, 6, 5, 0, 11, 6, 11, 10, 6, 7, 13, 7, 4, 7, 9, 6, 13, 8, 3, 4, 9, 5, 1, 6, -1, 1, 9, 9, 7]
avg: 6.97

1. testing without learning
2. MC-control
3. q-learning
4. testing after learning
5. exit
select: 3
Episode 100000/100000 || Best average reward 7.5

1. testing without learning
2. MC-control
3. q-learning
4. testing after learning
5. exit
select: 4
[5, 8, 4, 7, 12, 6, 5, 13, 3, 7, 10, 5, 9, 5, 11, 6, 9, 7, 7, 8, 8, 4, 6, 6, 10, 7, 7, 11, 11, 5, 12, 6, 7, 7, 11, 3, 11, 7, 3, 8, 4, 10