In [4]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline

In [5]:
env = gym.make('CartPole-v1')
env.seed(1);

In [6]:
#Hyperparameters
learning_rate = 0.01
gamma = 0.99

In [7]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.l1 = nn.Linear(self.state_space, 128, bias=False)
        self.l2 = nn.Linear(128, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.policy_history = Variable(torch.Tensor()) 
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.l1,
            nn.Dropout(p=0.5),
            nn.ReLU(),
            self.l2,
            nn.Softmax(dim=-1)
        )
        return model(x)

In [8]:
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [9]:
def select_action(state):
    #Select an action (0 or 1) by running policy model and choosing based on the probabilities in state
    state = torch.from_numpy(state).type(torch.FloatTensor)
    state = policy(Variable(state))
    c = Categorical(state)
    action = c.sample()

    # Add log probability of our chosen action to our history
    if policy.policy_history.dim() > 1:
        policy.policy_history = torch.cat([policy.policy_history, c.log_prob(action)])
    else:
        policy.policy_history = (c.log_prob(action))
    return action

In [10]:
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        rewards.insert(0,R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    
    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1))
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.item())
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

In [11]:
def main(episodes):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    running_reward = 10
    for episode in range(episodes):
        state = env.reset() # Reset environment and record the starting state
        done = False       
    
        for time in range(1000):
            action = select_action(state).item()
#             print(action)
            # Step through environment using chosen action
            state, reward, done, _ = env.step(action)

            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break
        
        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (time * 0.01)

        update_policy()

        if episode % 50 == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(episode, time, running_reward))

        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, time))
            break

In [12]:
episodes = 1000
main(episodes)

Episode 0	Last length:    17	Average length: 10.07
Episode 50	Last length:    38	Average length: 24.51
Episode 100	Last length:    32	Average length: 24.89
Episode 150	Last length:    28	Average length: 21.98
Episode 200	Last length:    29	Average length: 19.95
Episode 250	Last length:     8	Average length: 18.83
Episode 300	Last length:    15	Average length: 17.44
Episode 350	Last length:     9	Average length: 17.14
Episode 400	Last length:    20	Average length: 16.48
Episode 450	Last length:    11	Average length: 15.24
Episode 500	Last length:    10	Average length: 14.68
Episode 550	Last length:    14	Average length: 14.55
Episode 600	Last length:     9	Average length: 13.81
Episode 650	Last length:     7	Average length: 13.27
Episode 700	Last length:    12	Average length: 13.21
Episode 750	Last length:    12	Average length: 12.80
Episode 800	Last length:    11	Average length: 12.79
Episode 850	Last length:    12	Average length: 13.16
Episode 900	Last length:    13	Average length: 13

In [13]:
state = env.reset()
goal_steps = 200
for t in range(goal_steps):
    env.render()
    action = select_action(state).item()
    observation, reward, done, info = env.step(action)
    if done:
        break