In [None]:
from unityagents import UnityEnvironment
# Relative path of the Banana environment executable to launch.
BANANA_ENV_FILE = "./Banana_Linux/Banana.x86_64"
env = UnityEnvironment(file_name=BANANA_ENV_FILE)

In [None]:
# Use the first brain the environment exposes as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep only the info that belongs to that brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the dimensions that the agent is built from further down.
action_size = brain.vector_action_space_size    # number of actions
state = env_info.vector_observations[0]         # observation of the first agent
state_size = len(state)                         # length of one observation vector

# Report what was found: agent count, action count, then the state space.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)


### Deep Q-Learning Implementation

In [None]:
from dqn_agent import Agent
from collections import deque
import torch
import numpy as np
 
# ---------------------------------------------------------------------------
# Agent hyperparameters (handed to Agent below, in this order)
# ---------------------------------------------------------------------------
BUFFER_SIZE = 10_000    # replay buffer size
BATCH_SIZE = 64         # minibatch size
LR = 0.0001             # learning rate
GAMMA = 0.99            # discount factor
TAU = 0.001             # for soft update of target parameters
UPDATE_EVERY = 1        # how often to update the network
seed = 0                # seed value passed to the agent

# ---------------------------------------------------------------------------
# Episode limits and exploration schedule
# ---------------------------------------------------------------------------
num_episodes = 2000     # upper bound on the number of training episodes
T = 1000                # maximum number of time steps per episode
eps = 1.0               # starting value of epsilon
eps_end = 0.01          # epsilon is never decayed below this value
eps_decay = 0.995       # factor applied to epsilon after each episode

# ---------------------------------------------------------------------------
# Score bookkeeping
# ---------------------------------------------------------------------------
scores = []                         # score of every episode
mean_scores = []                    # windowed mean score after every episode
scores_window = deque(maxlen=100)   # the most recent 100 scores

# Build the agent from the state/action sizes and the settings above.
agent = Agent(state_size, action_size, BUFFER_SIZE, BATCH_SIZE, LR, GAMMA, TAU, UPDATE_EVERY, seed)

# Train for up to `num_episodes` episodes, stopping early once the mean score
# over a FULL window of recent episodes reaches the target.
SOLVED_SCORE = 13.0  # mean score over the window that counts as "solved"

for i_episode in range(1, num_episodes+1):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]
    score = 0
    for t in range(T):
        # Select an action with the agent's epsilon-greedy policy
        action = agent.act(state, eps)

        # Execute the action and observe the reward and the next state
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        # Hand the transition to the agent (it is expected to store the
        # experience and learn once enough samples are available)
        agent.step(state, action, reward, next_state, done)

        state = next_state
        score += reward
        if done:
            break

    # Record the episode score and the mean over the current window.
    # The mean is computed once here and reused below.
    scores_window.append(score)
    scores.append(score)
    window_mean = np.mean(scores_window)
    mean_scores.append(window_mean)

    # Decay epsilon, never going below eps_end
    eps = max(eps_end, eps_decay*eps)

    # '\r' keeps rewriting the same output line; every 10th episode the line
    # is printed again with a newline so that it stays in the log.
    print(f'\rEpisode {i_episode}\tAverage Score: {window_mean:.2f}', end="")
    if i_episode % 10 == 0:
        print(f'\rEpisode {i_episode}\tAverage Score: {window_mean:.2f}')

    # Only declare the environment solved when the window is full: a mean
    # over fewer episodes could cross the threshold by chance and end
    # training (and save a checkpoint) prematurely.
    window_is_full = len(scores_window) == scores_window.maxlen
    if window_is_full and window_mean >= SOLVED_SCORE:
        print(f'\nEnvironment solved in {i_episode} episodes!\tAverage Score: {window_mean:.2f}')
        torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        break


In [None]:
# Close the Unity environment that was opened in the first cell.
env.close()

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
# Draw the learning curves in one figure: per-episode scores on top,
# mean scores underneath.
fig = plt.figure()

# Upper panel (2 rows x 1 column, slot 1): score of every episode.
ax = fig.add_subplot(211)
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')

# Lower panel (slot 2): the values collected in mean_scores.
mean_ax = fig.add_subplot(212)
mean_ax.plot(np.arange(len(mean_scores)), mean_scores)
mean_ax.set_ylabel('Mean Score')
mean_ax.set_xlabel('Episode #')

plt.show()

In [None]:
# Save the scores and the training parameters to a JSON file.
# `json` is imported here too, so this cell does not depend on the plotting
# cell having been run first.
import json

data = {
    # Hyperparameters handed to the agent
    'BUFFER_SIZE': BUFFER_SIZE,
    'BATCH_SIZE': BATCH_SIZE,
    'GAMMA': GAMMA,
    'TAU': TAU,
    'LR': LR,
    'UPDATE_EVERY': UPDATE_EVERY,
    'seed': seed,
    # Exploration schedule (floor and per-episode decay factor of epsilon)
    'eps_end': eps_end,
    'eps_decay': eps_decay,
    # Results: per-episode scores and the windowed mean after each episode
    'scores': scores,
    'mean_scores': mean_scores
}

with open('training_scores.json', 'w') as f:
    json.dump(data, f)