# Cart Pendulum environment

In [66]:
from cartPendulum import cartPendulum
import numpy as np
# Instantiate the cart-pendulum simulation and override three of its attributes.
env = cartPendulum()
env.m_c = 10  # presumably the cart mass -- confirm against cartPendulum
env.m_p = 1  # presumably the pendulum mass -- confirm against cartPendulum
env.step_size = 0.05  # presumably the simulation time step -- confirm units

def states():
    """Lazily enumerate every point of the discretised 4-D state grid.

    Each axis is sampled with ``np.arange`` at a step of 0.1: the first
    component over [-1, 1), the second over [-7, 7), and the third and
    fourth over [-2, 2). Points are produced with the last component varying
    fastest.

    Yields:
        4-tuples ``(x1, x2, x3, x4)`` of NumPy floats.
    """
    first_axis = np.arange(-1, 1, 0.1)
    second_axis = np.arange(-7, 7, 0.1)
    # The third and fourth components share the same sampling.
    inner_axis = np.arange(-2, 2, 0.1)
    yield from (
        (a, b, c, d)
        for a in first_axis
        for b in second_axis
        for c in inner_axis
        for d in inner_axis
    )
                    
def actions():
    """Return the available control inputs: -10 to 10 in steps of 5.

    A fresh list is built on every call.
    """
    return list(range(-10, 11, 5))

def state(self):
    """Build a zero-argument reader for the discretised state of ``self``.

    The returned callable looks up ``self.x`` each time it is invoked, scales
    every component by 100 and truncates it towards zero with ``int``.

    Args:
        self: Object exposing an iterable attribute ``x``.

    Returns:
        A callable that returns a list of ints, one per component of
        ``self.x``.
    """
    def read_state():
        scaled = []
        for component in self.x:
            scaled.append(int(component * 100))
        return scaled

    return read_state
                    
# Attach the helpers defined above to the environment instance.
env.states = states
env.actions = actions
# state(env) returns a closure over env, so env.state() takes no arguments.
env.state = state(env)

# Pendulum environment

In [7]:
from pendulum import pendulum
import numpy as np
# Instantiate the pendulum simulation and configure it for tabular learning.
env = pendulum()
env.step_size = 0.1  # presumably the simulation time step -- confirm units

# Shift theta by a multiple of 2*pi so that the result lies in [-pi, pi).
r = lambda theta : ((theta/np.pi - 1) % 2)*np.pi - np.pi
# Three available control inputs.
env.actions = lambda: [0, -4, 4]
# Reward is 10 - 100*|x[0]|, clipped below at 0, so it is positive only for
# |x[0]| < 0.1.
# NOTE(review): this uses the raw env.x[0], whereas env.state wraps the angle
# with r() first -- confirm that the reward should not be wrapped as well.
env.reward = lambda:  np.max([10 - (100*np.abs(env.x[0])), 0])
# Discrete state: wrapped x[0] scaled by 100 and x[1] scaled by 10, each
# rounded to the nearest integer.
env.state = lambda: [int(np.round(r(env.x[0])*100)), int(np.round(env.x[1]*10))]

# Smoke test: apply one step with input 100, then display state and reward.
env.step(100)
env.state()
env.reward()

0.0

In [1]:
import copy
import numpy as np
def value_iteration(mdp, epsilon, gamma, resolution=10):
    """Compute state utilities of a deterministic MDP by value iteration.

    Each sweep applies ``V[s] = reward(s) + gamma * max_a V_prev[next(s, a)]``
    to every state. The successor returned by ``mdp.transition`` is snapped
    to the nearest grid cell so it can be looked up among the enumerated
    states.

    Args:
        mdp: Object exposing ``states()`` (iterable of state tuples),
            ``actions()`` (iterable of actions), ``transition(s, a)``
            (successor state) and ``reward(s)``.
        epsilon: Accuracy target. Sweeps stop once the largest utility change
            falls below ``epsilon * (1 - gamma) / gamma``.
        gamma: Discount factor, expected in ``[0, 1)``. With ``gamma == 0``
            a single sweep is performed, since the utilities then equal the
            rewards. With ``gamma >= 1`` the stopping threshold is not
            positive and the loop does not terminate.
        resolution: Number of grid cells per unit along every state axis
            (10 corresponds to a grid step of 0.1).

    Returns:
        dict mapping every state produced by ``mdp.states()`` to its utility.
    """
    def grid_cell(s):
        # Round rather than floor so that grid points carrying floating-point
        # error (e.g. 0.30000000000000004) still land in their own cell.
        return tuple(int(np.round(x * resolution)) for x in s)

    # Initialize utilities to zero.
    V = {s: 0.0 for s in mdp.states()}
    # Successors are continuous; this maps their grid cell back to the exact
    # state tuple used as a key in V.
    cell_to_state = {grid_cell(s): s for s in V}
    actions = list(mdp.actions())

    if gamma == 0:
        # The usual threshold divides by gamma; without discounting one sweep
        # is already exact.
        threshold = float("inf")
    else:
        threshold = epsilon * (1 - gamma) / gamma

    while True:
        # Values are plain numbers, so a shallow copy is a full snapshot.
        V_prev = dict(V)
        delta = 0.0
        for s, previous in V_prev.items():
            successor_values = []
            for a in actions:
                s_next = cell_to_state.get(grid_cell(mdp.transition(s, a)))
                # Successors that leave the grid keep the initial utility 0.
                successor_values.append(V_prev.get(s_next, 0.0))
            V[s] = mdp.reward(s) + gamma * max(successor_values, default=0.0)
            delta = max(delta, abs(V[s] - previous))
        print(delta)
        if delta < threshold:
            return V

In [117]:
#V = value_iteration(env, 0.1, 0.1)
#for s in env.states():
#    V.update({s: 0.0})
#for i in env.states()

TypeError: 'generator' object is not subscriptable

In [15]:
import numpy as np

# Value assumed for (state, action) pairs that are not yet in the Q-table.
q_default = 0


def Q_Learning(env, gamma, alpha, epsilon, Q=None):
    """Run one episode of tabular Q-learning and return the updated table.

    The episode starts from ``env.init([theta, 0])`` with ``theta`` drawn
    uniformly from [-0.01, 0.01) and continues while ``|s[0]| < 700`` and
    ``|s[1]| < 70`` in discretised state units.

    Args:
        env: Environment exposing ``init(x)`` (returns the initial discrete
            state), ``actions()``, ``step(a)``, ``reward()`` and ``state()``.
        gamma: Discount factor applied to the best next-state value.
        alpha: Learning rate of the Q update.
        epsilon: Probability of taking the *greedy* action; a uniformly
            random action is taken with probability ``1 - epsilon``. Note
            that this is the opposite of the common epsilon-greedy
            convention.
        Q: Table mapping ``(*state, action)`` to a value. It is updated in
            place. When omitted, a new empty table is created for this call.

    Returns:
        The Q-table (the same object as ``Q`` when one was passed in).
    """
    # A fresh dict per call: a mutable default would silently be shared by
    # every call that omits Q.
    if Q is None:
        Q = {}

    # Assumes the action set does not depend on the current state, so it is
    # queried once per episode.
    actions = env.actions()

    # Initialize environment
    theta = (2*np.random.rand() - 1)*0.01
    s = env.init([theta, 0])
    while np.abs(s[0]) < 700 and np.abs(s[1]) < 70:
        # Select action: greedy with probability epsilon, random otherwise.
        if np.random.rand() < epsilon:
            a = actions[np.argmax([Q.get((*s, action), q_default) for action in actions])]
        else:
            a = np.random.choice(actions)

        # Perform action and observe reward
        env.step(a)
        r = env.reward()
        s_next = env.state()

        # Q update: move the old estimate towards the bootstrapped target.
        q_old = Q.get((*s, a), q_default)
        q_next_best = np.max([Q.get((*s_next, action), q_default) for action in actions])
        Q[(*s, a)] = q_old + alpha*(r + gamma*q_next_best - q_old)

        s = s_next
    return Q

In [16]:
# Train by running NUM_EPISODES Q-learning episodes on the current `env`.
# The table initialisation below is commented out, so `Q` must already exist
# in the session before this cell runs.
#Q = dict()
NUM_EPISODES = 10000
for e in range(NUM_EPISODES):
    # Alternative schedule (disabled): epsilon grows linearly with the episode.
    #epsilon = 0.95*e/NUM_EPISODES #Decaying random choice of action
    # In Q_Learning, epsilon is the probability of the greedy action, so 0.99
    # leaves a 1% chance of a random action per step.
    epsilon = 0.99
    # The returned table is fed back in so learning accumulates over episodes.
    Q = Q_Learning(env , gamma = 0.9 , alpha = 0.5, epsilon = epsilon, Q = Q)
    print("Episode: {}, Epsilon: {:1.4f}, Q size: {}".format(e, epsilon, len(Q)))

Episode: 0, Epsilon: 0.9900, Q size: 225481
Episode: 1, Epsilon: 0.9900, Q size: 225482
Episode: 2, Epsilon: 0.9900, Q size: 225482
Episode: 3, Epsilon: 0.9900, Q size: 225482
Episode: 4, Epsilon: 0.9900, Q size: 225482
Episode: 5, Epsilon: 0.9900, Q size: 225482
Episode: 6, Epsilon: 0.9900, Q size: 225482
Episode: 7, Epsilon: 0.9900, Q size: 225482
Episode: 8, Epsilon: 0.9900, Q size: 225482


KeyboardInterrupt: 

In [26]:
from matplotlib import pyplot as plt
import numpy as np
%matplotlib auto

# Roll out the greedy policy from the learned Q-table and record the result.
env.render()


# Start from x = [pi, 0] and allocate one row per recorded step.
env.init([np.pi, 0])
sampels = 100
y = np.zeros((sampels, 2))
q = np.zeros((sampels, 1))
# NOTE(review): this row is overwritten by the first loop iteration (t == 0),
# and it stores the raw env.x while the loop stores the discretised
# env.state() -- confirm which of the two is intended.
y[0, :] = env.x


for t in range(sampels):
    s = env.state()
    # Greedy action; unseen (state, action) pairs are scored as -100.
    u = env.actions()[np.argmax([Q.get((*s, action), -100) for action in env.actions()])]
    env.step(u)
    y[t, :] = env.state()
    # NOTE(review): the default here is -1, not the -100 used for selection.
    q[t] = Q.get((*s, u), -1)
    plt.pause(0.01)
    #if t%10 == 0:
    #    env.render()
    
# Rebind t (the loop index above) to the time axis used for plotting.
t = np.linspace(0, env.step_size*(sampels-1), sampels)

Using matplotlib backend: Qt5Agg




In [27]:
# Display the number of (state, action) entries currently in the Q-table.
len(Q)

225261

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
# Plot every column of the recorded trajectory y against the time axis t.
plt.plot(t,y)
# Optional overlay (disabled): the recorded Q-values, scaled by 10.
#plt.plot(t, q*10)
plt.xlabel('Time [sec]')
# Legend (disabled): it lists four names, but y has two columns.
#plt.legend(['x_1','x_2', 'x_3', 'x_4'])
plt.show()

In [None]:
# Find the Q-table key with the largest value. Only strictly positive values
# can replace the initial 0, so m_k stays 0 when no entry exceeds 0.
m = 0
m_k = 0
for k in Q:
    if m < Q[k]:
        m, m_k = Q[k], k
m_k

In [2]:
# Manual probe: reset to x = [0.1, 0], apply a single input of -5 and display
# the resulting discrete state.
env.init([0.1, 0])
env.step(-5)
#env.step(-2)
#env.step(2)

env.state()
#env.reward()

NameError: name 'env' is not defined

In [19]:
import pickle

# To save the in-memory table instead, run:
#     with open("q_vals.p", "wb") as f:
#         pickle.dump(Q, f)
# Load the Q-table saved by an earlier session. The context manager closes
# the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load a q_vals.p file that
# was produced by this notebook.
with open("q_vals.p", "rb") as f:
    Q = pickle.load(f)