### Bonus Part 1: CartPole-v1

In [None]:
# Print the submission banner identifying the author of this notebook.
for banner_line in (
    'Submitted By',
    'UBITname      = karanman',
    'Person Number = 50290755',
):
    print(banner_line)

### imports

In [None]:
# To import the packages below, the following must be installed on your system:
# gym
# cmake
# openmpi
# zlib
# stable-baselines
# ffmpeg

#importing packages
import gym

from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

import numpy as np
import os
import matplotlib.pyplot as plt

### Callback

In [None]:
# State shared across callback invocations: the best mean episode reward seen
# so far and the number of times the callback has been called.
best_mean_reward, n_steps = -np.inf, 0

def callback(_locals, _globals):
    """
    Report training progress and checkpoint the best model seen so far.

    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2).
    Every 1000 calls it loads the Monitor results from ``log_dir``, averages the
    rewards of the last 100 episodes and, when that average beats the best value
    recorded so far, saves the model to ``best_model.pkl`` inside ``log_dir``.

    :param _locals: (dict) local variables of the training loop; ``_locals['self']`` is the model being trained
    :param _globals: (dict) global variables of the training loop (unused)
    :return: (bool) always True so that training continues
    """
    global n_steps, best_mean_reward
    # Print stats every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        # Evaluate policy performance from the episode log
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model: persist it so it can be reloaded later
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                print("Saving new best model")
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    n_steps += 1
    # stable-baselines aborts training when a callback returns False, so
    # return True to let learn() run for the full number of timesteps.
    return True

In [None]:
# Create log dir
# log_dir is passed to Monitor and plot_results, and read by callback() to
# load results and to save the best model. The trailing slash matters because
# callback() builds the checkpoint path by plain string concatenation.
log_dir = "/tmp/gym/"
# exist_ok=True makes re-running this cell safe when the directory already exists
os.makedirs(log_dir, exist_ok=True)

In [None]:
# Gym environment identifier passed to gym.make and used in the video file prefix
env_id = 'CartPole-v1'
# Folder handed to VecVideoRecorder for the recorded video
video_folder = '1_ipynb_animation/'
# Passed to VecVideoRecorder as video_length; also sets the length of the rollout loop
video_length = 100

In [None]:
# Build the environment as a chain of wrappers, one named stage per wrapper.
raw_env = gym.make(env_id)
# Monitor logs episode statistics into log_dir; these logs are what
# load_results() reads back for the callback and the learning-curve plot.
monitored_env = Monitor(raw_env, log_dir, allow_early_resets=True)
# DQN expects a vectorized environment, so wrap the single env in a DummyVecEnv.
vectorized_env = DummyVecEnv([lambda: monitored_env])
# Record one video of video_length steps, triggered at step 0.
env = VecVideoRecorder(
    vectorized_env,
    video_folder,
    record_video_trigger=lambda step: step == 0,
    video_length=video_length,
    name_prefix="random-agent-{}".format(env_id),
)

In [None]:
# Create a DQN agent with an MLP policy on the wrapped environment.
model = DQN(policy=MlpPolicy, env=env, verbose=1)

# Train for 25000 timesteps; the callback reports progress and saves the
# best-performing model along the way.
model.learn(callback=callback, total_timesteps=25000)

In [None]:
# Roll out the trained agent for video_length + 1 steps, rendering every frame.
obs = env.reset()
for _ in range(video_length + 1):
    # Ask the trained model for an action given the current observation
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

### Plotting helper functions

In [None]:
def movingAverage(values, window):
    """
    Compute the moving average of a sequence with a uniform window.

    :param values: (numpy array) the samples to smooth
    :param window: (int) number of consecutive samples averaged together
    :return: (numpy array) the smoothed values, computed only where the
        window and the input fully overlap
    """
    # Uniform kernel: every sample in the window gets the same weight 1/window
    kernel = np.ones(window) / window
    smoothed = np.convolve(values, kernel, mode='valid')
    return smoothed


def plot_results(log_folder, title='Learning Curve'):
    """
    Plot the smoothed learning curve stored in a Monitor log folder.

    Episode rewards are smoothed with a moving average of up to 50 episodes
    (fewer when the log holds fewer than 50 episodes) and plotted against the
    cumulative number of timesteps.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    :raises ValueError: if the log folder contains no completed episode
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    if len(y) == 0:
        raise ValueError("No completed episodes found in '{}'; nothing to plot".format(log_folder))

    # np.convolve(..., 'valid') swaps its operands when the window is longer
    # than the data, which yields a meaningless curve whose length no longer
    # matches x. Clamp the window to the number of available episodes.
    window = min(50, len(y))
    y = movingAverage(y, window=window)
    # Truncate x so it lines up with the (shorter) smoothed reward series
    x = x[len(x) - len(y):]

    plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()


In [None]:
# Plot the smoothed learning curve from the Monitor logs written to log_dir
plot_results(log_dir)