In [1]:
import pixiedust

Pixiedust database opened successfully


In [3]:
import gym
from gym import wrappers
import numpy as np
import pandas as pd
import time

In [1]:
# %%pixie_debugger

start_time = time.time()
env = gym.make("Taxi-v2")
env = wrappers.Monitor(env, "./results", force=True)

Q = np.zeros([env.observation_space.n, env.action_space.n])
n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

labels = ['episode', 'avg_total_score', 'delta', 'epsilon']
df = pd.DataFrame(columns=labels)
df[labels] = df[labels].astype(float)
print(df)

NUM_EPISODES = 10000000
epsilon = 0.2 #Epsilon controls the probability of taking a random action vs following policy
rewards_list = []

last_avg_total_score, avg_total_score = 0, 0

def report(avg_total_score, last_avg_total_score, rewards_list, episode_interval, epsilon, df, df_counter):
    last_avg_total_score = avg_total_score
    avg_total_score = sum(rewards_list) / (episode_interval)
    delta = avg_total_score - last_avg_total_score
    print(f"Episode: {i:>7} Average Total Score: {avg_total_score:>10.3f} delta: {delta:>10.3f} epsilon {epsilon:>10.3f} ")
    df.loc[df_counter] = [i, avg_total_score, delta, epsilon]
    df_counter += 1
    return avg_total_score, last_avg_total_score, delta, df_counter

def get_epsilon(i, episode_interval, num_episodes):
    if i > NUM_EPISODES * 0.95:
        return 0.0
    else:
        return 0.3 * (1 - (i / num_episodes)) + 0.2
    return epsilon

def main(NUM_EPISODES=10000, epsilon=0.2, rewards_list=[]):
    df_counter = 0
    for i in range(NUM_EPISODES + 1):

        state = env.reset()
        r_all = 0 # sum of all rewards
        done = False
        results_list = []
        results_sum = 0.0
        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state, :])
            new_state, reward, done, _ = env.step(action)
            results_list.append((state, action))
            results_sum += reward
            state = new_state
            r_all += reward
        rewards_list.append(r_all)

        for (state, action) in results_list:
            n_s_a[state, action] += 1.0
            alpha = 1.0 / n_s_a[state, action]
            Q[state, action] += alpha * (results_sum - Q[state, action])

        episode_interval = 500
        if i % episode_interval == 0 and i is not 0:
            epsilon = get_epsilon(i, episode_interval, NUM_EPISODES)

            last_avg_total_score = avg_total_score
            avg_total_score, last_avg_total_score, delta, df_counter = report(
                avg_total_score, last_avg_total_score, rewards_list, episode_interval, epsilon, df, df_counter) 
            rewards_list = []

    env.close()

pixiedust.display(df)
end_time = time.time()
print(f"Time taken: {end_time - start_time:10.2f} seconds")



NameError: name 'time' is not defined

In [6]:
import seaborn

pixiedust.display(df)

---

In [4]:
# %%pixie_debugger
# imports
import gym
from gym import wrappers
import numpy as np
import pandas as pd
import pprint
import time

# Helper Functions
def update_report(report):
    r = report
    r['last_avg_total_score'] = r['avg_total_score']
    r['avg_total_score'] = sum(r['rewards_list']) / (r['episode_interval'])
    
    # calculate delta
    delta = r['avg_total_score'] - r['last_avg_total_score']
    
    print(f"Episode: {r['i']:>7} Average Total Score: {r['avg_total_score']:>10.3f}"\
          f" delta: {delta:>10.3f} epsilon: {r['epsilon']:>10.3f} ")
    
    # add to dataframe
    r['df'].loc[r['df_index']] = [r['i'], r['avg_total_score'], delta, r['epsilon']]
    r['df_index'] += 1
    
    # re-initialize rewards_list - allows us to only capture last episode_interval (500) runs
    r['rewards_list'] = []


def get_epsilon(target_epsilon, i, episode_interval, num_episodes):
    if i > num_episodes * 0.95:
        return 0.0
    else:
        return 0.3 * (1 - (i / num_episodes)) + target_epsilon


# Main
def main(num_episodes=10000, target_epsilon=0.2):
    start_time = time.time()
    
    # environment
    env = gym.make("Taxi-v2")
    env = wrappers.Monitor(env, "./results", force=True)

    # initialize variables
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

    labels = ['episode', 'avg_total_score', 'delta', 'epsilon']
    report = {
        'rewards_list' : [],
        'avg_total_score' : 0,
        'last_avg_total_score' : 0,
        'df': pd.DataFrame(columns=labels),
        'df_index': 0,
        'episode_interval': 500,
    }
    
    # set df labels type
    report['df'][labels] = report['df'][labels].astype(float)
    print('Initial Dataframe:')
    print(report['df'])
    
    
    for i in range(num_episodes + 1):

        state = env.reset()
        r_all = 0 # sum of all rewards
        done = False
        results_list = []
        results_sum = 0.0
        
        while not done:
            if np.random.rand() < target_epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state, :])
            new_state, reward, done, _ = env.step(action)
            results_list.append((state, action))
            results_sum += reward
            state = new_state
            r_all += reward
        
        report['rewards_list'].append(r_all)

        for (state, action) in results_list:
            n_s_a[state, action] += 1.0
            alpha = 1.0 / n_s_a[state, action]
            Q[state, action] += alpha * (results_sum - Q[state, action])

        
        if i % report['episode_interval'] == 0 and i is not 0:
            report['i'] = i
            report['epsilon'] = get_epsilon(target_epsilon, i, report['episode_interval'], num_episodes)
            
            # update report
            update_report(report)

    env.close()
    
    end_time = time.time()
    run_time = end_time - start_time
    print(f"Time taken: {end_time - start_time:10.2f} seconds")
    return (report['df'], run_time)


### Episodes: 10000 Target_epsilon=0.2 

In [5]:
# Epsilon controls the probability of taking a random action vs following policy
num_episodes = 10000
target_epsilon = 0.2

df, run_time = main(num_episodes=num_episodes, target_epsilon=target_epsilon)
print(f'Data Frame Size: {df.shape}')

Initial Dataframe:
Empty DataFrame
Columns: [episode, avg_total_score, delta, epsilon]
Index: []
Episode:     500 Average Total Score:   -531.812 delta:   -531.812 epsilon:      0.485 
Episode:    1000 Average Total Score:   -345.806 delta:    186.006 epsilon:      0.470 
Episode:    1500 Average Total Score:   -331.538 delta:     14.268 epsilon:      0.455 
Episode:    2000 Average Total Score:   -320.710 delta:     10.828 epsilon:      0.440 
Episode:    2500 Average Total Score:   -321.302 delta:     -0.592 epsilon:      0.425 
Episode:    3000 Average Total Score:   -320.356 delta:      0.946 epsilon:      0.410 
Episode:    3500 Average Total Score:   -314.930 delta:      5.426 epsilon:      0.395 
Episode:    4000 Average Total Score:   -316.346 delta:     -1.416 epsilon:      0.380 
Episode:    4500 Average Total Score:   -314.218 delta:      2.128 epsilon:      0.365 
Episode:    5000 Average Total Score:   -314.496 delta:     -0.278 epsilon:      0.350 
Episode:    5500 Averag

In [6]:
pixiedust.display(df)

---

# Previous Attempts

### Monte Carlo GLIE: 2
Applied to Taxi-v2 environment

In [4]:
# import necessary libraries
import gym
from gym import wrappers
import numpy as np

env = gym.make("Taxi-v2")
env = wrappers.Monitor(env, "./results", force=True)

Q = np.zeros([env.observation_space.n, env.action_space.n])
n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

# Helper Functions
NUM_EPISODES = 2000
EPSILON = 0.2 # Epsilon controls the probability of taking a random action vs. following known policy
rewards_list = []
last_success_rate = 0
success_rate = 0

for i in range(NUM_EPISODES):
    state = env.reset()
    r_all = 0 # sum of all rewards
    done = False
    results_list = []
    result_sum = 0.0
    
    while not done:
        
        if np.random.rand() < EPSILON:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        #print(f"New State:{new_state}|Reward:{reward}|done?:{done}|_:{_}")
        results_list.append((state, action))
        result_sum += reward
        state = new_state
        r_all += reward
    rewards_list.append(r_all)
    
    for (state, action) in results_list:
        n_s_a[state, action] += 1.0
        alpha = 1.0 / n_s_a[state, action]
        Q[state, action] += alpha * (result_sum - Q[state, action])
    
    if i % 50 == 0 and i is not 0:
        last_success_rate = success_rate
        success_rate = sum(rewards_list)/i
        delta = success_rate - last_success_rate
        #print("{0}/{1} - Success rate: {2}".format(i, NUM_EPISODES, success_rate, delta))
        print(f"Episode: {i:>7}/{NUM_EPISODES} Success rate: {success_rate:2.6f} delta: {delta:2.6f}")
        
success_rate = sum(rewards_list) / NUM_EPISODES
print("Success rate: {0}".format(success_rate))

env.close()

Episode:      50/2000 Success rate: -392.460000 delta: -392.460000
Episode:     100/2000 Success rate: -510.260000 delta: -117.800000
Episode:     150/2000 Success rate: -562.540000 delta: -52.280000
Episode:     200/2000 Success rate: -556.140000 delta: 6.400000
Episode:     250/2000 Success rate: -586.936000 delta: -30.796000
Episode:     300/2000 Success rate: -582.536667 delta: 4.399333
Episode:     350/2000 Success rate: -563.857143 delta: 18.679524
Episode:     400/2000 Success rate: -548.120000 delta: 15.737143
Episode:     450/2000 Success rate: -538.680000 delta: 9.440000
Episode:     500/2000 Success rate: -523.586000 delta: 15.094000
Episode:     550/2000 Success rate: -513.609091 delta: 9.976909
Episode:     600/2000 Success rate: -500.810000 delta: 12.799091
Episode:     650/2000 Success rate: -490.326154 delta: 10.483846
Episode:     700/2000 Success rate: -480.774286 delta: 9.551868
Episode:     750/2000 Success rate: -472.376000 delta: 8.398286
Episode:     800/2000 Suc

### Monte Carlo GLIE [Original]
source: https://gist.github.com/Sathishruw/d609e358b61268cdf891cc93e15e5f63

Applied to FrozenLake-v0 environment

In [2]:
# import necessary libraries
import gym
from gym import wrappers
import numpy as np

In [20]:
env = gym.make("FrozenLake-v0")
env = wrappers.Monitor(env, "./results", force=True)

Q = np.zeros([env.observation_space.n, env.action_space.n])
n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

NUM_EPISODES = 100000
EPSILON = 0.2 # Epsilon controls the probability of taking a random action vs. following known policy
rewards_list = []
last_success_rate = 0
success_rate = 0

for i in range(NUM_EPISODES):
    state = env.reset()
    r_all = 0 # sum of all rewards
    done = False
    results_list = []
    result_sum = 0.0
    
    while not done:
        if np.random.rand() < EPSILON:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        results_list.append((state, action))
        result_sum += reward
        state = new_state
        r_all += reward
    rewards_list.append(r_all)
    
    for (state, action) in results_list:
        n_s_a[state, action] += 1.0
        alpha = 1.0 / n_s_a[state, action]
        Q[state, action] += alpha * (result_sum - Q[state, action])
    
    if i % 5000 == 0 and i is not 0:
        last_success_rate = success_rate
        success_rate = sum(rewards_list)/i
        delta = success_rate - last_success_rate
        #print("{0}/{1} - Success rate: {2}".format(i, NUM_EPISODES, success_rate, delta))
        print(f"Episode: {i:>7}/{NUM_EPISODES} Success rate: {success_rate:2.6f} delta: {delta:2.6f}")
        
success_rate = sum(rewards_list) / NUM_EPISODES
print("Success rate: {0}".format(success_rate))

env.close()

Episode:    5000/100000 Success rate: 0.045000 delta: 0.045000
Episode:   10000/100000 Success rate: 0.055700 delta: 0.010700
Episode:   15000/100000 Success rate: 0.061400 delta: 0.005700
Episode:   20000/100000 Success rate: 0.064450 delta: 0.003050
Episode:   25000/100000 Success rate: 0.068360 delta: 0.003910
Episode:   30000/100000 Success rate: 0.073333 delta: 0.004973
Episode:   35000/100000 Success rate: 0.078371 delta: 0.005038
Episode:   40000/100000 Success rate: 0.081475 delta: 0.003104
Episode:   45000/100000 Success rate: 0.085156 delta: 0.003681
Episode:   50000/100000 Success rate: 0.087700 delta: 0.002544
Episode:   55000/100000 Success rate: 0.090364 delta: 0.002664
Episode:   60000/100000 Success rate: 0.091917 delta: 0.001553
Episode:   65000/100000 Success rate: 0.092708 delta: 0.000791
Episode:   70000/100000 Success rate: 0.094300 delta: 0.001592
Episode:   75000/100000 Success rate: 0.095280 delta: 0.000980
Episode:   80000/100000 Success rate: 0.096062 delta: 0