# Imports

In [1]:
import gym
import wandb
import numpy as np
import pandas as pd

In [3]:
from simplenn.optim.qlearning import ReplayMemory
from simplenn.optim.qlearning import QLearning
from simplenn.optim.qlearning import DoubleQLearning
from simplenn.optim.qlearning import FixedTargetQLearning
from simplenn.optim.qlearning import FixedTargetDoubleQLearningSymmetric
from simplenn.optim.qlearning import FixedTargetDoubleQLearningAsymmetric
from simplenn.optim.qlearning import BaseSimulation

from simplenn.structures.qfunction import QTable
from simplenn.structures.qfunction import DiscretizedQTable
from simplenn.structures.qfunction import AutoDiscretizingQTable
from simplenn.structures.qfunction import AggregateQFunction

In [4]:
# Environment used by the whole notebook.
# Other ids that were tried: 'MountainCar-v0', 'Acrobot-v1', 'CartPole-v0'
envName = 'CartPole-v1'
env = gym.make(envName)

In [5]:
%matplotlib tk

# Performing experiment

In [15]:
# Config dict: every hyper-parameter of the experiment, also sent to wandb.
config = {
    # Basic parameters
    "gamma": 0.95,
    "defaultVal": 0.0,

    # Learning rate related parameters
    "alphaRate": 0.75,
    "alphaCutoff": 0.0,

    # Exploration related parameters
    "epsInit": 1.0,
    "epsFinal": 0.2,
    "epsRampLen": 10,

    # Auto-discretization related parameters
    "maxObsPerLeaf": 16,
    "maxLeafNodes": 30 * 10**3,
    "splitResetMode": AutoDiscretizingQTable.MODE_0,

    # Experience replay related parameters
    # (sizes are expressed in multiples of the environment's episode step limit)
    "replayMemMaxSize": 100 * env.env.spec.max_episode_steps,
    "replayMemMinSize": 100 * env.env.spec.max_episode_steps,
    "replayMemMode": ReplayMemory.MODE_CYCLING,

    # Fixed Q-target related parameters
    "targetQUpdateFreq": 25 * env.env.spec.max_episode_steps,

    # Simulation duration
    "nEpisodes": 50000,

    # Evaluation related parameters
    "nTest": 250,
    "verboseFreq": 5000,
    "recordFreq": 5000,

    # Wandb only parameters
    "envId": env.env.spec.id,
}

In [16]:
# Number of bins requested for each of the four quantized dimensions.
nCartPos, nCartV, nPoleAngle, nPoleV = 50, 100, 100, 100

obs_space = env.observation_space

# Hand-picked symmetric bounds used for the two velocity dimensions in place
# of the limits reported by the observation space.
artMaxCartV, artMaxPoleV = 5.0, 10.0
artMinCartV, artMinPoleV = -artMaxCartV, -artMaxPoleV

# Bin width ("quantum") of each dimension = covered range / number of bins.
# Dimensions 0 and 2 take their range from the observation space itself.
qCartPos = (obs_space.high[0] - obs_space.low[0]) / nCartPos
qPoleAngle = (obs_space.high[2] - obs_space.low[2]) / nPoleAngle
qCartV = (artMaxCartV - artMinCartV) / nCartV
qPoleV = (artMaxPoleV - artMinPoleV) / nPoleV

config["quantums"] = [qCartPos, qCartV, qPoleAngle, qPoleV]

In [17]:
def doLearning(config):
    """Run one complete training session and return what it produced.

    Builds a DiscretizedQTable and a ReplayMemory from ``config``, hands both
    to a DoubleQLearning instance, opens a wandb run in the "simple_rl"
    project and calls ``algo.learn`` on the notebook-level ``env``.

    Side effect: ``config`` is modified in place. The class names of the
    Q-function and of the algorithm are stored under "qfunctionClass" and
    "algoClass" before the dict is passed to ``wandb.init``.

    Parameters
    ----------
    config : dict
        Experiment settings; the keys read are listed in the body below.

    Returns
    -------
    tuple
        ``(Q, algo)``: the Q-function and the learning algorithm objects.
    """
    # Q-function. To try AutoDiscretizingQTable instead, keep the eight
    # leading arguments and replace config["quantums"] with
    # config["maxObsPerLeaf"], config["maxLeafNodes"], config["splitResetMode"].
    sharedKeys = (
        "gamma",
        "alphaRate",
        "alphaCutoff",
        "epsInit",
        "epsFinal",
        "epsRampLen",
        "defaultVal",
    )
    Q = DiscretizedQTable(
        range(env.action_space.n),
        *[config[key] for key in sharedKeys],
        config["quantums"]
    )
    config["qfunctionClass"] = type(Q).__name__

    # Experience replay buffer.
    memMaxSize = config["replayMemMaxSize"]
    memMinSize = config["replayMemMinSize"]
    memMode = config["replayMemMode"]
    replayMem = ReplayMemory(memMaxSize, memMinSize, memMode)

    # Learning algorithm. Other variants imported by this notebook:
    #   QLearning(Q, replayMem)
    #   FixedTargetQLearning(Q, replayMem, config["targetQUpdateFreq"])
    #   FixedTargetDoubleQLearningSymmetric(Q, replayMem, config["targetQUpdateFreq"])
    #   FixedTargetDoubleQLearningAsymmetric(Q, replayMem, config["targetQUpdateFreq"])
    algo = DoubleQLearning(Q, replayMem)
    config["algoClass"] = type(algo).__name__

    # The run is opened only once config carries the two class names above.
    wandb.init(project="simple_rl", config=config)

    # The wandb module itself is the last argument given to learn().
    algo.learn(
        env,
        config["nEpisodes"],
        config["nTest"],
        config["verboseFreq"],
        config["recordFreq"],
        wandb
    )

    return (Q, algo)

In [None]:
# Repeat the whole experiment nRuns times. Q and algo are overwritten on each
# pass, so the cells below analyze the objects returned by the last run only.
nRuns = 3

for i in range(nRuns):
    print(f"{i+1}/{nRuns}")
    (Q, algo) = doLearning(config)

# Analyzing the results

### Animation

In [10]:
# Play one rendered episode with the current algo and report its score.
episodeScore = algo.performEpisode(env, render=True)
print(f"Score: {episodeScore}")

Score: 381.0


### Score distribution

In [11]:
# Summary statistics of the scores obtained over config["nTest"] episodes.
testScores = [algo.performEpisode(env) for _ in range(config["nTest"])]
pd.Series(testScores).describe()

count    250.000000
mean     401.300000
std       36.084267
min      280.000000
25%      376.250000
50%      396.500000
75%      422.000000
max      500.000000
dtype: float64

### Score and nStates evolution

In [12]:
def getActualQ(Q):
    """Return the Q-function to inspect.

    An AggregateQFunction is replaced by its first member (``Q.Qs[0]``);
    any other object is returned unchanged.
    """
    if isinstance(Q, AggregateQFunction):
        return Q.Qs[0]
    return Q

# Mean score over config["nTest"] episodes for every snapshot in algo.qs.
# A new BaseSimulation is created for each episode.
scores = []
for q in algo.qs:
    episodeScores = [
        BaseSimulation(q).performEpisode(env)
        for _ in range(config["nTest"])
    ]
    scores.append(np.mean(episodeScores))

# Number of entries in each snapshot's data, measured after all the
# evaluation episodes above have been played.
nStates = []
for q in algo.qs:
    nStates.append(len(getActualQ(q).data))

df = pd.DataFrame({
    "Score": pd.Series(scores),
    "nStates": pd.Series(nStates),
})

# Both curves share one figure; nStates is drawn against a secondary y axis.
ax = df.plot(y="Score")
df.plot(y="nStates", secondary_y=True, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7aa7c11090>

### Q-Values distribution (Simple Q-Learning only)

In [13]:
# Gather the value stored at index Q.Q_IDX of every (state, action) entry.
qTable = getActualQ(algo.Q)
qs = [
    aData[Q.Q_IDX]
    for sData in qTable.data.values()
    for aData in sData.values()
]

pd.Series(qs).hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7af442b590>

### Actions visit count distribution (Simple Q-Learning only)

In [14]:
# Gather the value stored at index Q.N_IDX of every (state, action) entry.
qTable = getActualQ(algo.Q)
ns = [
    aData[Q.N_IDX]
    for sData in qTable.data.values()
    for aData in sData.values()
]

pd.Series(ns).hist(bins=250)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7af4080710>

### Discrepancies between Qa and Qb (Double Q-Learning only)

In [74]:
# Does not work on auto-discretized QTables.

# The two Q-functions held by the last recorded snapshot.
Qa = algo.qs[-1].Qs[0]
Qb = algo.qs[-1].Qs[1]

# Every (state, action) pair present in at least one of the two tables.
actions = set()
for table in (Qa, Qb):
    for s, sData in table.data.items():
        for a in sData:
            actions.add((s, a))

nDiff = []  # relative deviation of the visit counts from their midpoint
qDiff = []  # relative deviation of the Q-values from their midpoint

for s, a in actions:

    dataA = Qa.getData(s, a)
    dataB = Qb.getData(s, a)

    na = dataA.get(Qa.N_IDX, 0)
    nb = dataB.get(Qb.N_IDX, 0)
    nMid = (na + nb) / 2.0

    qa = dataA.get(Qa.Q_IDX, 0.0)
    qb = dataB.get(Qb.Q_IDX, 0.0)
    qMid = (qa + qb) / 2.0

    # Both counts are > 5 here, so nMid is strictly positive.
    if na > 5 and nb > 5:
        nDiff.append(abs(nb - nMid) / nMid)

    # Normalize by |qMid| so that negative Q-values still yield a
    # non-negative ratio, and skip a zero midpoint, for which the relative
    # difference is undefined (it used to raise ZeroDivisionError).
    if na > 0 and nb > 0 and qMid != 0:
        qDiff.append(abs(qb - qMid) / abs(qMid))

pd.Series(nDiff).hist(bins=100, density=True, cumulative=True)
#pd.Series(qDiff).hist(bins=100, density=True, cumulative=True)

TypeError: 'int' object is not subscriptable

# Administrative tasks

In [15]:
# Handle on the runs stored under the given "entity/project" path on wandb.
projectPath = "jrmsayag/simple_rl"
api = wandb.Api()
runs = api.runs(projectPath)

In [17]:
# CAUTION: rewrites EVERY run returned above. The three config entries are
# overwritten, all tags are removed, and the change is pushed with update().
configOverrides = {
    "alphaCutoff": 0.0,
    "algoClass": QLearning.__name__,
    "qfunctionClass": DiscretizedQTable.__name__,
}

for run in runs:
    for key, value in configOverrides.items():
        run.config[key] = value
    run.tags.clear()
    run.update()