# Homework Assignment Week 3 - Monte Carlo

This week's homework is to apply a Monte Carlo technique in a unique OpenAI Gym environment (not blackjack). The different Monte Carlo techniques are detailed [here](https://oneraynyday.github.io/ml/2018/05/24/Reinforcement-Learning-Monte-Carlo/#example-cliff-walking).

Write a Medium blog post about your experience implementing the algorithm and what you learned. Optionally, share your post on Twitter with the #move37 hashtag! — Siraj Raval

### Monte Carlo GLIE: 2
Applied to Taxi-v2 environment

In [5]:
# import necessary libraries
import gym
from gym import wrappers
import numpy as np
import pandas as pd
import time


# Helper Functions
def update_report(report):
    '''
    Calculates training metrics for the latest reporting window, prints
    them, and stores them as a new row in a pandas dataframe.

    report is a dict that is mutated in place. Keys read:
      'rewards_list'    - total reward of each episode since the last report
      'avg_total_score' - average computed by the previous call
      'k_episodes'      - number of episodes so far, in thousands
      'epsilon'         - exploration rate to log for this window
      'df', 'df_index'  - dataframe to append to and the next row label
    Keys written: 'last_avg_total_score', 'avg_total_score', 'df' (one new
    row), 'df_index' (incremented) and 'rewards_list' (emptied).
    '''
    r = report

    # remember the previous window's average before overwriting it, so
    # delta compares this window against the last one
    r['last_avg_total_score'] = r['avg_total_score']

    # average over the episodes actually collected in this window; an
    # empty window reports 0.0 rather than dividing by zero
    rewards = r['rewards_list']
    r['avg_total_score'] = sum(rewards) / len(rewards) if rewards else 0.0

    # calculate delta
    delta = r['avg_total_score'] - r['last_avg_total_score']

    print(
        f"K episodes: {r['k_episodes']:>8.3f} "
        f"Average Total Score: {r['avg_total_score']:>10.3f} "
        f"Delta: {delta:>10.3f} "
        f"Epsilon: {r['epsilon']:>10.3f} "
    )

    # add to dataframe
    r['df'].loc[r['df_index']] = [
        r['k_episodes'],
        r['avg_total_score'],
        delta,
        r['epsilon']
    ]

    r['df_index'] += 1

    # re-initialize rewards_list - allows us to only capture the
    # episodes of the next reporting window
    r['rewards_list'] = []
    

    
def get_epsilon(target_epsilon, i, episode_interval, num_episodes):
    '''
    Returns the epsilon value to use at episode i.

    Once i is past 94% of num_episodes the result is 0.0. Before that,
    epsilon falls linearly with i from target_epsilon + 0.3 towards
    target_epsilon. episode_interval is accepted but not used.
    '''
    # final stretch of the run: no exploration at all
    greedy_cutoff = num_episodes * 0.94
    if i > greedy_cutoff:
        return 0.0

    # fraction of the run already completed
    progress = i / num_episodes
    return 0.3 * (1 - progress) + target_epsilon



# Main
def main(num_episodes=10000, target_epsilon=0.2, episode_interval=1000):
    '''
    Trains a tabular action-value function on Taxi-v2 with an
    epsilon-greedy Monte Carlo method and reports progress periodically.

    num_episodes     - index of the last episode; episodes 0..num_episodes
                       are run (num_episodes + 1 in total)
    target_epsilon   - baseline exploration rate passed to get_epsilon
    episode_interval - a report row is produced every this many episodes

    Returns a tuple (dataframe of report rows, run time in seconds).

    NOTE(review): written against the classic Gym API, where reset()
    returns the state and step() returns a 4-tuple - confirm against the
    installed gym version.
    '''
    start_time = time.time()

    # create our environment
    env = gym.make("Taxi-v2")
    #env = wrappers.Monitor(env, "./results", force=True)

    # initialize the action value function Q(s, a),
    # and a counter function N(s, a)
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    n_s_a = np.zeros([env.observation_space.n, env.action_space.n])

    # initialize report
    labels = ['k episodes', 'Average Total Score', 'Delta', 'Epsilon']
    report = {
        'rewards_list': [],
        'avg_total_score': 0,
        'last_avg_total_score': 0,
        'df': pd.DataFrame(columns=labels),
        'df_index': 0,
        'episode_interval': episode_interval,
    }

    # set df labels type
    report['df'][labels] = report['df'][labels].astype(float)

    try:
        for i in range(num_episodes + 1):
            # exploration rate for this episode, following the schedule
            # in get_epsilon (it drops to 0.0 near the end of the run)
            epsilon = get_epsilon(target_epsilon,
                                  i,
                                  report['episode_interval'],
                                  num_episodes)

            state = env.reset()
            done = False
            visited = []          # (state, action) pairs of this episode
            episode_return = 0.0  # sum of all rewards of this episode

            while not done:
                # epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(Q[state, :])
                new_state, reward, done, _ = env.step(action)
                visited.append((state, action))
                episode_return += reward
                state = new_state

            report['rewards_list'].append(episode_return)

            # incremental mean update: every visited pair is moved towards
            # the total return of the whole episode, with step size 1/N(s, a)
            for (state, action) in visited:
                n_s_a[state, action] += 1.0
                alpha = 1.0 / n_s_a[state, action]
                Q[state, action] += alpha * (episode_return - Q[state, action])

            if i != 0 and i % report['episode_interval'] == 0:
                report['k_episodes'] = i / 1000.0
                report['epsilon'] = epsilon

                # update report
                update_report(report)
    finally:
        # close environment even if training is interrupted
        env.close()

    # calculate run time
    end_time = time.time()
    run_time = end_time - start_time
    print(f"Time taken: {run_time:10.2f} seconds")
    return report['df'], run_time

---

# Report

### Introduction
For assignment 3 of Move 37, we had to apply a Monte Carlo technique to a unique OpenAI Gym environment other than Blackjack.

For this assignment, I collaborated with [Andrew Key][ak github].

We started off with this [code][original code], which applies Monte Carlo GLIE to the FrozenLake-v0 environment.

### Monte Carlo GLIE


### Reward Evaluation

### Epsilon

[original code]: https://gist.github.com/Sathishruw/d609e358b61268cdf891cc93e15e5f63
[ak github]: https://github.com/redpanda-ai

---

---

# Project Extension Ideas

- Test with different parameters
- Have an agent interact in an environment with a trained Q-Function.