In [17]:
import random
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

alpha = 0.001
gamma = 0.99

    
class PolicyNetwork(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fcA1 = torch.nn.Linear(8, 64)
        self.fcA2 = torch.nn.Linear(64, 64)
        self.fcA3 = torch.nn.Linear(64, 4)
        
    def forward(self, x):
        x = self.fcA1(x)
        x = torch.nn.functional.relu(x)
        x = self.fcA2(x)
        x = torch.nn.functional.relu(x)
        x = self.fcA3(x)  
        x = torch.nn.functional.softmax(x, dim=-1)
        return x
    
# network and optimizer
pi = PolicyNetwork().to(device)
pi_optimizer = torch.optim.Adam(pi.parameters(), lr=alpha)



def gen_episode():
    states = []
    actions = []
    rewards = []
    state = env.reset() 
    done = False
    score = 0
    while not done:
        probs = pi(torch.FloatTensor(state).to(device))
        action = torch.multinomial(probs, 1).item()
        next_state, reward, done, info = env.step(action)  # take a random action
        if done:
            env.close()
        score = score + reward
        
        states.append(state)
        actions.append(action)
        rewards.append(reward)    
        state = next_state
    return states, actions, rewards, score

def G(t):
    G = 0
    for tau in range(t, len(rewards)):
        G = G + gamma**(tau - t) * rewards[tau]
    return G
        
env = gym.make('LunarLander-v2')
env = gym.wrappers.TimeLimit(env, max_episode_steps = 300)
episode = 0
MAX_EPISODES = 10000
running_score = 0
while episode < MAX_EPISODES:  # episode loop
    states, actions, rewards, score = gen_episode()
    loss = 0
    for t in range(0, len(rewards)-1):
        s = states[t+1]
        a = actions[t+1]
        s=torch.FloatTensor(s).to(device)
        a=torch.tensor(a, dtype=torch.int8).to(device)
        loss = loss - G(t) * (gamma**t)* pi(s)[a].log()
   
    pi_optimizer.zero_grad()
    loss.backward()
    pi_optimizer.step()   
    running_score = running_score + score
    
    if episode % 20 == 0:
        print('Episode {} \t reward: {}'.format(episode, int(running_score)/20))
        running_score = 0

    
    episode = episode + 1
    



Episode 0 	 reward: -3.5
Episode 20 	 reward: -202.4
Episode 40 	 reward: -177.5
Episode 60 	 reward: -165.75
Episode 80 	 reward: -175.05
Episode 100 	 reward: -189.6
Episode 120 	 reward: -190.55
Episode 140 	 reward: -175.35
Episode 160 	 reward: -202.75
Episode 180 	 reward: -125.3
Episode 200 	 reward: -151.9
Episode 220 	 reward: -126.65
Episode 240 	 reward: -156.8
Episode 260 	 reward: -137.35
Episode 280 	 reward: -127.75
Episode 300 	 reward: -138.95
Episode 320 	 reward: -146.55
Episode 340 	 reward: -137.25
Episode 360 	 reward: -144.0
Episode 380 	 reward: -149.55
Episode 400 	 reward: -175.0
Episode 420 	 reward: -139.75
Episode 440 	 reward: -115.0
Episode 460 	 reward: -133.95
Episode 480 	 reward: -140.2
Episode 500 	 reward: -115.65
Episode 520 	 reward: -143.65
Episode 540 	 reward: -152.75
Episode 560 	 reward: -123.55
Episode 580 	 reward: -133.9
Episode 600 	 reward: -147.5
Episode 620 	 reward: -150.55
Episode 640 	 reward: -139.55
Episode 660 	 reward: -116.1
Ep

Episode 5680 	 reward: 92.1
Episode 5700 	 reward: 96.55
Episode 5720 	 reward: 104.0
Episode 5740 	 reward: 110.25
Episode 5760 	 reward: 131.25
Episode 5780 	 reward: 112.8
Episode 5800 	 reward: 97.3
Episode 5820 	 reward: 81.5
Episode 5840 	 reward: 85.45
Episode 5860 	 reward: 71.6
Episode 5880 	 reward: 61.3
Episode 5900 	 reward: 27.05
Episode 5920 	 reward: 64.0
Episode 5940 	 reward: 65.8
Episode 5960 	 reward: 58.8
Episode 5980 	 reward: 68.45
Episode 6000 	 reward: 71.6
Episode 6020 	 reward: 66.35
Episode 6040 	 reward: 58.0
Episode 6060 	 reward: 111.75
Episode 6080 	 reward: 106.15
Episode 6100 	 reward: 64.25
Episode 6120 	 reward: 79.3
Episode 6140 	 reward: 89.15
Episode 6160 	 reward: 91.15
Episode 6180 	 reward: 51.35
Episode 6200 	 reward: 85.8
Episode 6220 	 reward: 68.2
Episode 6240 	 reward: 77.6
Episode 6260 	 reward: 53.85
Episode 6280 	 reward: 41.0
Episode 6300 	 reward: 48.85
Episode 6320 	 reward: 58.6
Episode 6340 	 reward: 68.9
Episode 6360 	 reward: 72.4

In [18]:
# TEST   
episode = 0
state = env.reset()    
while episode < 5:  # episode loop
    env.render()
    probs = pi(torch.FloatTensor(state).to(device))     
    action = torch.multinomial(probs, 1).item()
    next_state, reward, done, info = env.step(action)  # take a random action
    state = next_state
    sleep(0.01)
    if done:
        env.close()
    score = score + reward
    if done:
        episode = episode + 1
        print('Episode: {} Score: {}'.format(episode, score))
        state = env.reset()
env.close()       


Episode: 1 Score: 186.15388134417833
Episode: 2 Score: 286.5491578693419
Episode: 3 Score: 393.12455465314804
Episode: 4 Score: 485.4652217974341
Episode: 5 Score: 575.2838679020795
