In [1]:
import gym
from torch import nn, optim
import numpy as np
import datetime as dt

In [2]:
from observers import (
    WindowMetricLogger,
    WindowStepMetricLogger,
    StateAnalysisLogger,
    TensorboardScalarLogger
)
from agents import (
    DQNAgent,
    EpsilonDecreasingStrategy
)
from training import (
    QLearningTrainer,
    QLearningContext,
    episode_value_accessor
)
from common import (
    Discretizer,
    Tensorboard
)

In [3]:
%load_ext tensorboard

In [4]:
env = gym.make('CartPole-v1')

  result = entry_point.load(False)


In [5]:
# Passed to DQNAgent below as its `batch_size`.
BATCH_SIZE=128

# Fully connected network: one input per observation component, one output
# per discrete action of `env`. It is handed to DQNAgent as its `model`.
model = nn.Sequential(
    nn.Linear(env.observation_space.shape[0], 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, env.action_space.n)
)

# The optimizer is bound to the parameters of the same `model` instance
# that is passed to the agent.
agent = DQNAgent(
    env=env, 
    strategy=EpsilonDecreasingStrategy(
        initial_epsilon=1.0,
        min_epsilon=0.001,
        decay=0.02
    ),
    model=model, 
    optimizer=optim.Adam(model.parameters(), lr=0.001),
    loss=nn.MSELoss(), 
    discount=0.95,
    memory_size=20000,
    batch_size=BATCH_SIZE
)

In [None]:
class SimpleWindowLogger():
    """Observer that prints a single named metric when an episode finishes.

    ``apply`` is a callable taking the training context; whatever it returns
    is printed next to ``name``. All other observer hooks do nothing.
    """

    def __init__(self, name, apply):
        self.name, self.apply = name, apply

    def on_train_start(self, context):
        """Hook called when training starts; nothing to do."""
        return None

    def on_step_end(self, context):
        """Hook called after each step; nothing to do."""
        return None

    def on_episode_start(self, context):
        """Hook called before each episode; nothing to do."""
        return None

    def on_episode_end(self, context):
        """Print ``Epoch <epoch> | <name>=<value>`` for the finished episode."""
        epoch = context.get_episode_value('epoch')
        name = self.name
        value = self.apply(context)
        line = "Epoch {epoch} | {name}={value}".format(epoch=epoch, name=name, value=value)
        print(line)

In [None]:
class TensorboardScalarLogger():
    """Observer that writes one scalar per episode through ``tb.log_scalar``.

    ``apply`` is called with the training context at the end of each episode;
    a ``None`` result means "nothing to log" and is skipped.

    NOTE(review): a class with this name is also imported from ``observers``
    at the top of the notebook; this definition replaces it once the cell runs.
    """

    def __init__(self, tb, name, apply):
        self.tb, self.name, self.apply = tb, name, apply

    def on_train_start(self, context):
        """Hook called when training starts; nothing to do."""
        return None

    def on_step_end(self, context):
        """Hook called after each step; nothing to do."""
        return None

    def on_episode_start(self, context):
        """Hook called before each episode; nothing to do."""
        return None

    def on_episode_end(self, context):
        """Log ``apply(context)`` under ``name``, using the episode's 'epoch' value as the step."""
        scalar = self.apply(context)
        if scalar is None:
            return
        step = context.get_episode_value('epoch')
        self.tb.log_scalar(self.name, scalar, step)

In [None]:
env = gym.make('CartPole-v1')

In [None]:
# NOTE(review): this cell repeats the BATCH_SIZE / model setup of the earlier
# executed cell and rebinds both names.
BATCH_SIZE=128
# HIDDEN_DIM=24

model = nn.Sequential(
    nn.Linear(env.observation_space.shape[0], 512),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, env.action_space.n)
)

# NOTE(review): `DQNSolver` is not imported by any cell above. The only
# definition visible in this notebook is the Keras-based class further down,
# whose constructor takes (observation_space, action_space), so this call
# cannot work on a fresh kernel. It presumably targets an older project API
# (compare the DQNAgent construction in the earlier cell) - confirm.
solver = DQNSolver(
    env=env, 
    model=model, 
    optimizer=optim.Adam(model.parameters(), lr=0.001),
    loss=nn.MSELoss(), 
    discount=0.95,
    memory_size=20000,
    batch_size=BATCH_SIZE
)
# Hyper-parameters copied from the Keras DQNAgent cell at the end of the notebook:
#         self.gamma = 0.95    # discount rate
#         self.epsilon = 1.0  # exploration rate
#         self.epsilon_min = 0.0001
#         self.epsilon_decay = 0.999
#         self.batch_size = 128
In [None]:
TENSORBOARD_LOGDIR = "./logs/cartpole-v0/2"

In [None]:
def episode_value(key):
    """Return an accessor that reads ``key`` from a context via ``get_episode_value``."""
    def read(c):
        return c.get_episode_value(key)
    return read

def episode_value_mean(key):
    """Return an accessor giving the mean of the episode value stored under ``key``.

    The accessor takes a context, reads ``key`` through ``get_episode_value``
    exactly once, and returns ``np.mean`` of it, or ``None`` when the context
    has no value for that key.
    """
    def mean_of(c):
        values = c.get_episode_value(key)  # single lookup instead of two
        return None if values is None else np.mean(values)
    return mean_of

def episode_value_count(key):
    """Return an accessor giving the number of entries stored under ``key``.

    The accessor takes a context, reads ``key`` through ``get_episode_value``
    exactly once, and returns ``len`` of it, or ``None`` when the context has
    no value for that key.
    """
    def count_of(c):
        values = c.get_episode_value(key)  # single lookup instead of two
        return None if values is None else len(values)
    return count_of

def episode_value_sum(key):
    """Return an accessor giving the sum of the episode value stored under ``key``.

    The accessor takes a context, reads ``key`` through ``get_episode_value``
    exactly once, and returns ``np.sum`` of it, or ``None`` when the context
    has no value for that key.
    """
    def sum_of(c):
        values = c.get_episode_value(key)  # single lookup instead of two
        return None if values is None else np.sum(values)
    return sum_of

def train_observers():
    """Build the list of TensorBoard scalar observers for one training run.

    A new ``Tensorboard`` writer is created under ``TENSORBOARD_LOGDIR`` in a
    sub-directory named after the current timestamp, and every logger in the
    returned list shares that writer.
    """
    run_dir = TENSORBOARD_LOGDIR + '/' + dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    tb = Tensorboard(run_dir)

    # (scalar name, accessor applied to the context at the end of each episode)
    scalar_specs = (
        ('loss', episode_value_mean('loss')),
        ('target', episode_value_mean('target')),
        ('reward', episode_value_sum('reward')),
        ('epsilon', episode_value('epsilon')),
        ('action', episode_value_mean('action')),
    )
    return [
        TensorboardScalarLogger(tb=tb, name=scalar_name, apply=accessor)
        for scalar_name, accessor in scalar_specs
    ]

In [None]:
# 4. Put it all together
# Wires the environment, the `solver` built in the cell above and an
# epsilon-decreasing exploration strategy into the trainer.
# NOTE(review): depends on `solver`, whose construction relies on a
# `DQNSolver` that is not imported in the visible notebook - confirm this
# cell still matches the current `QLearningTrainer` signature.
trainer = QLearningTrainer(
    env=env, 
    solver=solver,
    strategy=EpsilonDecreasingStrategy(
        initial_epsilon=1.0,
        min_epsilon=0.001,
        decay=0.02
    )
)

In [None]:
%tensorboard --logdir {TENSORBOARD_LOGDIR}

In [None]:
trainer.train(
    epochs=1000,
    observers=train_observers()
)

In [None]:
state = env.reset()

In [None]:
env.observation_space.low

In [None]:
print(state)

In [None]:
solver.forward(state)

In [None]:
solver.predict(state + [2.9, 0, 0, 0]).argmax()

In [None]:
plt.hist([s[0][0] for s in solver.memory])

In [None]:
plt.hist([s[0][2] for s in solver.memory])

In [None]:
plt.hist([s[0][1] for s in solver.memory])

In [None]:
plt.hist([s[0][3] for s in solver.memory])

In [None]:
state = env.reset()
env.render()

In [None]:
target = copy.deepcopy(predictions.detach())

In [None]:
for i, item in enumerate(batch):
    future_reward = solver.model.forward(torch.Tensor(item[3])).detach().max().item()
    target[i][item[1]] = item[2] + solver.discount * future_reward

In [None]:
# print(target, predictions)

In [None]:
model.forward(torch.Tensor([[1, 1, 1, 1], [2, 1, 1, 1]]))

In [None]:
state = env.reset()

In [None]:
print(state)

In [None]:
solver.predict(state)

In [None]:
state = np.reshape(state, [1, self.env.observation_space.shape[0]])

In [None]:
print(state)

In [None]:
solver.predict(state)

In [None]:
trainer.render()

In [None]:
def best_action(trainer, state):
    """Return the index of the highest-scoring action for ``state``.

    Args:
        trainer: any object exposing a ``model`` attribute that maps a state
            tensor to per-action scores (the notebook passes the solver).
        state: observation convertible by ``torch.Tensor``.

    Returns:
        The arg-max index over the model output as a plain Python ``int``.
    """
    # Imported locally because the notebook only imports torch in a later cell.
    import torch

    # Call the module itself rather than ``.forward`` so registered hooks run,
    # and skip autograd bookkeeping: this is pure inference.
    with torch.no_grad():
        scores = trainer.model(torch.Tensor(state))
    return scores.argmax().item()

In [None]:
state = env.reset()

In [None]:
# Choose an action
action = best_action(solver, state)
print(action)

In [None]:
# Perform update step, don't forget to discretize
next_state, reward, done, _ = env.step(action)
print(next_state, reward, done)

In [None]:
solver.optimizer.zero_grad()

In [None]:
out = solver.model.forward(torch.Tensor(state))
print(out)

In [None]:
future_reward = solver.model.forward(torch.Tensor(next_state)).detach().max().item()
print(future_reward)

In [None]:
target = copy.deepcopy(out.detach())
print(target)

In [None]:
target[action] = reward + solver.discount * future_reward
print(target)

In [None]:
loss = solver.loss(out, target.detach())
print(loss)

In [None]:
loss.backward()
solver.optimizer.step()

In [None]:
solver.model.forward(torch.Tensor(state))

In [None]:
# Target is the current Q-Value with new reward for action taken
# NOTE(review): this cell looks pasted from inside a method. `self` is not
# defined at notebook top level, `copy` is never imported in the visible
# notebook, and `torch` is only imported in a later cell, so it fails when
# run as-is. The cells above perform the same steps through `solver`.

future_reward = self.model.forward(torch.Tensor(next_state)).detach().max().item()
target = copy.deepcopy(out.detach())
target[action] = reward + self.discount * future_reward

# Move towards target with one backward pass
loss = self.loss(out, target.detach())
print(loss)
loss.backward()
self.optimizer.step()

In [None]:
import torch

In [None]:
trainer.render()

In [None]:
np.argmax(model.forward(torch.Tensor(env.reset())).detach()).item()

In [None]:
import numpy as np

In [None]:
import torch

In [None]:
model.forward(torch.Tensor(env.reset())).detach().max().item()

In [None]:
np.max(model.forward(torch.Tensor(env.reset())).detach())

In [None]:
model.forward(torch.Tensor(env.reset())).detach()[1]

In [None]:
state = env.reset()

In [None]:
actions = model.forward(torch.Tensor(state))

In [None]:
actions

In [None]:
model.forward(torch.Tensor(env.reset())).detach()

In [None]:
np.argmax(torch.Tensor([1,2])).item()

In [None]:
import tensorflow as tf
import datetime, os
import tensorboard

In [None]:
def create_model():
    """Build an (uncompiled) Keras classifier: 28x28 input, 512-unit hidden layer, 10 softmax outputs."""
    layers = tf.keras.layers
    stack = [
        layers.Flatten(input_shape=(28, 28)),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.models.Sequential(stack)

def train_model():
    """Compile the model from ``create_model`` and fit it for 5 epochs with a TensorBoard callback.

    Logs go to a timestamped sub-directory of ``logs_base_dir``.

    NOTE(review): ``logs_base_dir``, ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` are read as globals and are not defined in the visible part of
    this notebook - confirm where they come from.
    """
    classifier = create_model()
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    run_logdir = os.path.join(
        logs_base_dir,
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    )
    tb_callback = tf.keras.callbacks.TensorBoard(run_logdir, histogram_freq=1)

    classifier.fit(
        x=x_train,
        y=y_train,
        epochs=5,
        validation_data=(x_test, y_test),
        callbacks=[tb_callback],
    )

train_model()

In [None]:
from common import Tensorboard
import tensorflow as tf

In [None]:
# NOTE(review): the only assignment to `tb` is the commented-out line below,
# so the call fails with NameError on a fresh kernel unless it is restored.
# tb = Tensorboard('./logs')
tb.log_scalar('loss', 0.92, 5)

In [None]:
tf.compat.v1.summary.FileWriter('./logs')

In [None]:
# NOTE(review): `tf.contrib` is presumably unavailable in the TensorFlow
# version used here (the next cell uses the TF2-style
# `tf.summary.create_file_writer`) - confirm and drop this experiment.
tf.contrib.summary.FileWriter('./logs')

In [None]:
writer = tf.summary.create_file_writer('./logs')


In [None]:
from statistics import mean
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from collections import deque
import os
import csv
import numpy as np

# Per-run scores: CSV appended to by ScoreLogger.add_score, and the plot rendered from it.
SCORES_CSV_PATH = "./scores/scores.csv"
SCORES_PNG_PATH = "./scores/scores.png"
# Written only once the solve criterion below is met.
SOLVED_CSV_PATH = "./scores/solved.csv"
SOLVED_PNG_PATH = "./scores/solved.png"
# Solve criterion used by ScoreLogger: the mean over the last
# CONSECUTIVE_RUNS_TO_SOLVE scores must reach AVERAGE_SCORE_TO_SOLVE.
AVERAGE_SCORE_TO_SOLVE = 195
CONSECUTIVE_RUNS_TO_SOLVE = 100


class ScoreLogger:
    """Records per-run scores to CSV, plots them, and detects when the task is solved.

    Scores are appended to ``SCORES_CSV_PATH`` and re-plotted to
    ``SCORES_PNG_PATH`` on every call to ``add_score``. The task counts as
    solved once the rolling window holds ``CONSECUTIVE_RUNS_TO_SOLVE`` scores
    whose mean is at least ``AVERAGE_SCORE_TO_SOLVE``.
    """

    def __init__(self, env_name):
        """Create an empty score window and remove the previous scores CSV/PNG.

        The "solved" CSV/PNG are not removed here.
        """
        self.scores = deque(maxlen=CONSECUTIVE_RUNS_TO_SOLVE)
        self.env_name = env_name

        for stale_path in (SCORES_PNG_PATH, SCORES_CSV_PATH):
            if os.path.exists(stale_path):
                os.remove(stale_path)

    def add_score(self, score, run):
        """Persist ``score`` for run number ``run``, refresh the plot and check the solve criterion.

        When the criterion is met, the solve record is written and the process
        is terminated via ``exit()``.
        """
        self._save_csv(SCORES_CSV_PATH, score)
        self._save_png(input_path=SCORES_CSV_PATH,
                       output_path=SCORES_PNG_PATH,
                       x_label="runs",
                       y_label="scores",
                       average_of_n_last=CONSECUTIVE_RUNS_TO_SOLVE,
                       show_goal=True,
                       show_trend=True,
                       show_legend=True)
        self.scores.append(score)
        mean_score = mean(self.scores)
        print("Scores: (min: " + str(min(self.scores)) + ", avg: " + str(mean_score) + ", max: " + str(max(self.scores)) + ")\n")
        if mean_score >= AVERAGE_SCORE_TO_SOLVE and len(self.scores) >= CONSECUTIVE_RUNS_TO_SOLVE:
            solve_score = run-CONSECUTIVE_RUNS_TO_SOLVE
            print("Solved in " + str(solve_score) + " runs, " + str(run) + " total runs.")
            self._save_csv(SOLVED_CSV_PATH, solve_score)
            self._save_png(input_path=SOLVED_CSV_PATH,
                           output_path=SOLVED_PNG_PATH,
                           x_label="trials",
                           y_label="steps before solve",
                           average_of_n_last=None,
                           show_goal=False,
                           show_trend=False,
                           show_legend=False)
            # NOTE(review): this terminates the interpreter; inside a notebook
            # it takes the kernel down. Kept because the `cartpole()` training
            # loop in this notebook has no other stop condition.
            exit()

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it does not exist yet."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _save_png(self, input_path, output_path, x_label, y_label, average_of_n_last, show_goal, show_trend, show_legend):
        """Plot the single-column CSV at ``input_path`` and save the figure to ``output_path``.

        ``average_of_n_last`` selects how many trailing points the dashed
        average line covers (``None`` means all points); the ``show_*`` flags
        toggle the goal line, the linear trend line and the legend.
        """
        # newline="" is what the csv module expects; blank rows (written by
        # csv on some platforms when the file was opened without it) are skipped.
        with open(input_path, "r", newline="") as scores:
            y = [int(row[0]) for row in csv.reader(scores) if row]
        x = list(range(len(y)))

        plt.subplots()
        plt.plot(x, y, label="score per run")

        average_range = average_of_n_last if average_of_n_last is not None else len(x)
        plt.plot(x[-average_range:], [np.mean(y[-average_range:])] * len(y[-average_range:]), linestyle="--", label="last " + str(average_range) + " runs average")

        if show_goal:
            plt.plot(x, [AVERAGE_SCORE_TO_SOLVE] * len(x), linestyle=":", label=str(AVERAGE_SCORE_TO_SOLVE) + " score average goal")

        if show_trend and len(x) > 1:
            # The first point is excluded from the linear fit, as in the original code.
            trend_x = x[1:]
            z = np.polyfit(np.array(trend_x), np.array(y[1:]), 1)
            p = np.poly1d(z)
            plt.plot(trend_x, p(trend_x), linestyle="-.",  label="trend")

        plt.title(self.env_name)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        if show_legend:
            plt.legend(loc="upper left")

        self._ensure_parent_dir(output_path)
        plt.savefig(output_path, bbox_inches="tight")
        plt.close()

    def _save_csv(self, path, score):
        """Append ``score`` as a one-column row to the CSV at ``path``, creating file and directory as needed."""
        self._ensure_parent_dir(path)
        # Append mode creates the file when missing; newline="" avoids the
        # blank interleaved rows csv.writer otherwise produces on Windows.
        with open(path, "a", newline="") as scores_file:
            csv.writer(scores_file).writerow([score])

In [None]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# from scores.score_logger import ScoreLogger

# Gym environment id passed to gym.make and ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"

# Discount applied to the best next-state Q-value in DQNSolver.experience_replay.
GAMMA = 0.95
# Learning rate of the Adam optimizer the DQNSolver model is compiled with.
LEARNING_RATE = 0.001

# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000
# Number of transitions sampled per experience_replay call.
BATCH_SIZE = 20

# Exploration rate: starts at EXPLORATION_MAX, is multiplied by
# EXPLORATION_DECAY after every replay, and is floored at EXPLORATION_MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995


class DQNSolver:
    """Keras-based DQN with a replay memory and a multiplicatively decaying exploration rate.

    Hyper-parameters come from the module-level constants (``GAMMA``,
    ``LEARNING_RATE``, ``MEMORY_SIZE``, ``BATCH_SIZE``, ``EXPLORATION_*``).
    """

    def __init__(self, observation_space, action_space):
        """Build the network.

        Args:
            observation_space: number of components in one observation.
            action_space: number of discrete actions (size of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick a random action with probability ``exploration_rate``, else the arg-max of the model output."""
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the model on ``BATCH_SIZE`` sampled transitions, one at a time, then decay exploration.

        Does nothing until the memory holds at least ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions use the raw reward; otherwise bootstrap
            # from the best predicted value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


def cartpole():
    """Train a ``DQNSolver`` on ``ENV_NAME`` indefinitely, logging each run's score.

    One run is one environment episode; the score is the number of steps
    taken. The outer loop has no exit condition of its own: it ends only when
    ``ScoreLogger.add_score`` terminates the process after the solve
    criterion is met.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        # The solver's model is fed one observation as a (1, observation_space) batch.
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            # The step that ends the episode gets its reward negated.
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()


cartpole()

In [None]:
# NOTE(review): `score_logger` is a local variable of cartpole() and is never
# bound at notebook level in the visible cells, so this raises NameError.
score_logger

In [None]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Reshape, Dropout
from tensorflow.keras.optimizers import Adam


# Neural Network model for Deep Q Learning
def OurModel(input_shape, action_space):
    """Build and compile the Keras network used by the Keras ``DQNAgent``.

    Architecture: three ReLU hidden layers (512, 256, 64 units), each followed
    by ``Dropout(0.5)``, then a linear output layer with one unit per action.

    Args:
        input_shape: shape tuple of a single state, e.g. ``(state_size,)``.
        action_space: number of discrete actions.

    Returns:
        A model compiled with MSE loss and a default ``Adam`` optimizer.
    """
    X_input = Input(input_shape)
    X = X_input

    # The Input tensor already fixes the input shape, so the first Dense
    # layer no longer repeats it through a redundant `input_shape` argument.
    for units in (512, 256, 64):
        X = Dense(units, activation="relu")(X)
        X = Dropout(0.5)(X)

    # Output layer: one linear unit per action
    X = Dense(action_space, activation="linear")(X)

    model = Model(inputs=X_input, outputs=X, name='CartPoleModel')
    model.compile(loss='mse', optimizer=Adam())

    return model

class DQNAgent:
    """Self-contained Keras DQN agent for CartPole-v1 (environment, memory, model, train/test loops).

    NOTE(review): this class reuses the name ``DQNAgent`` that the notebook
    imports from ``agents`` at the top; running this cell replaces that import.
    """

    def __init__(self):
        """Create the environment, the replay memory, the hyper-parameters and the network."""
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)

        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.0001
        self.epsilon_decay = 0.999
        self.batch_size = 128

        self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action with probability ``epsilon``, else the arg-max of the model output."""
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state))

    def replay(self):
        """Run one training step on a random minibatch and decay ``epsilon``.

        Up to ``batch_size`` transitions are sampled. For each, the target is
        the model's current prediction for the state with the taken action's
        entry replaced by ``reward`` (terminal) or
        ``reward + gamma * max(Q(next_state))``. Does nothing when the memory
        is empty.
        """
        if not self.memory:
            return

        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))

        # States are stored as (1, state_size) arrays; stack them so the model
        # is queried twice per batch instead of twice per transition.
        x_batch = np.vstack([transition[0] for transition in minibatch])
        next_batch = np.vstack([transition[3] for transition in minibatch])
        y_batch = np.array(self.model.predict(x_batch))
        q_next = self.model.predict(next_batch)

        for row, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                # terminal transition: the target is the reward itself
                y_batch[row][action] = reward
            else:
                # bootstrap from the best predicted value of the next state
                y_batch[row][action] = reward + self.gamma * np.max(q_next[row])

        # Train the Neural Network with the whole minibatch at once
        self.model.fit(x_batch, y_batch, batch_size=len(x_batch), verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Replace the model with the one stored at ``name``."""
        self.model = load_model(name)

    def save(self, name):
        """Save the current model to ``name``."""
        self.model.save(name)

    def run(self):
        """Train for ``EPISODES`` episodes, rendering every step and replaying after each non-terminal step.

        The model is saved to ``cartpole-dqn.h5`` whenever an episode lasts
        exactly 500 steps.
        """
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                if done:
                    # the step that ends the episode is penalised
                    reward = -10
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
                    if i == 500:
                        print("Saving trained model as cartpole-dqn.h5")
                        self.save("cartpole-dqn.h5")
                    break
                self.replay()

    def test(self):
        """Load ``cartpole-dqn.h5`` and play ``EPISODES`` greedy episodes, printing each score."""
        self.load("cartpole-dqn.h5")
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

# if __name__ == "__main__":
# NOTE(review): rebinds `agent`, which an earlier cell bound to the
# project-level DQNAgent instance, to the Keras agent defined in this cell,
# and starts its full 1000-episode training loop immediately.
agent = DQNAgent()
agent.run()
    #agent.test()
