In [0]:
import gym
import numpy as np
from gym.envs.registration import register
import math
import tensorflow as tf

In [0]:
# Turn on TensorFlow eager execution for the whole session. The cells below
# rely on it: they call .numpy() on op results and use tf.GradientTape.
tf.enable_eager_execution()

## 카트폴 게임

In [0]:
# Install the system packages and Python packages used by this notebook
# (gym itself, plus xvfb / OpenGL / ffmpeg / pyvirtualdisplay, presumably
# for off-screen rendering on a headless machine).
# NOTE(review): `gym` is already imported in the first cell, so on a fresh
# kernel this cell has to run before that import can succeed.
# NOTE(review): the pip installs are unpinned, so results may differ between runs.
!apt-get -qq -y install libnvtoolsext1 > /dev/null
!ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so
!apt-get -qq -y install xvfb freeglut3-dev ffmpeg> /dev/null
!pip -q install gym
!pip -q install pyglet
!pip -q install pyopengl
!pip -q install pyvirtualdisplay

In [0]:
# NOTE(review): duplicate of the `import gym` in the first cell.
import gym
# The single CartPole-v0 environment shared by the training loop and the
# recorded rollout further down.
env = gym.make("CartPole-v0")

In [0]:
# Start a hidden (visible=0) 1024x768 virtual display.
# Presumably required so env.render() works without a physical screen — confirm.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1024, 768))
display.start()

In [0]:
# Never summarise arrays with "..." when printing: always show every element.
np.set_printoptions(threshold=np.inf)

## weight 모델 생성

In [0]:
# First weight matrix (4 x 64): multiplied with the [1, 4] observation in the
# loops below. Entries start as uniform random values between 0 and 0.1.
w1 = tf.Variable(
    initial_value=tf.random_uniform(shape=[4, 64], minval=0, maxval=0.1)
)
w1

In [0]:
# Second weight matrix (64 x 32): applied to the output of the w1 product.
# Entries start as uniform random values between 0 and 0.1.
w2 = tf.Variable(
    initial_value=tf.random_uniform(shape=[64, 32], minval=0, maxval=0.1)
)
w2

In [0]:
# Output weight matrix (32 x 2): its product yields one Q-value per action.
# Entries start as uniform random values between 0 and 0.1.
w3 = tf.Variable(
    initial_value=tf.random_uniform(shape=[32, 2], minval=0, maxval=0.1)
)
w3

## NN을 활용한 카트폴 게임 학습


In [0]:
# Online Q-learning on CartPole: after every environment step the network
# (three chained matmuls through w1, w2, w3, with no activations) is updated
# towards a one-step TD target for the action that was taken.
dis = 1   # discount factor; kept at 1 (no discounting) because in CartPole a longer episode is always better
optimizer = tf.train.AdamOptimizer(0.001)
frameList = []   # one [episode, index of last frame] entry per finished episode
for episode in range(5000):
    observation = env.reset()
    total_frame = 0

    # Epsilon for epsilon-greedy exploration: 1, 1/2, 1/3, ... stepping down
    # once every 100 episodes.
    e = 1.0 / ((episode // 100) + 1)
    for frame in range(200):
        observation = np.array(np.reshape(observation, [1, 4]), dtype="float32")

        with tf.GradientTape() as tape:
            # Q-values predicted for the current state.
            hypo1 = tf.matmul(observation, w1)
            hypo2 = tf.matmul(hypo1, w2)
            predQ = tf.matmul(hypo2, w3)

            # Epsilon-greedy action selection.
            if np.random.rand(1) < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(predQ)

            next_observation, reward, done, info = env.step(action)
            next_observation = np.array(np.reshape(next_observation, [1, 4]), dtype="float32")

            # gym's TimeLimit wrapper marks episodes that ended only because the
            # step limit was hit. Those are successes, not failures, so they must
            # not receive the terminal penalty. If the key is absent (older gym)
            # this is False and every `done` is treated as a failure, as before.
            timed_out = bool(info.get("TimeLimit.truncated", False))

            # The target starts as a detached copy of the prediction, so only the
            # entry of the action actually taken contributes to the loss.
            Qs = predQ.numpy()
            if done and not timed_out:
                # The pole fell / cart left the track: fixed penalty.
                Qs[0, action] = -100
            else:
                # One-step TD target bootstrapped from the next state's best Q-value.
                next_hypo1 = tf.matmul(next_observation, w1)
                next_hypo2 = tf.matmul(next_hypo1, w2)
                nextQ = tf.matmul(next_hypo2, w3)

                Qs[0, action] = reward + dis*np.max(nextQ)

            Qs = tf.convert_to_tensor(Qs, np.float32)

            cost = tf.reduce_sum(tf.square(Qs - predQ))

        # Differentiate and update outside the tape context so that only the
        # forward pass and the loss are recorded.
        grads = tape.gradient(cost, [w1, w2, w3])
        optimizer.apply_gradients(grads_and_vars = zip(grads, [w1, w2, w3]))

        observation = next_observation
        total_frame = frame   # 0-based index of the last frame played so far
        if done:
            frameList.append([episode, frame])
            break

    print(f"episode : {episode}, total_frame : {total_frame}")

## 학습이 잘 되지 않는다. 그 이유는 입력값들이 서로 독립인 변수의 형태가 아니기 때문이다.
## 이 게임에서 좋은 성능을 내기 위해서는 더 깊은 모델이 필요하다.

In [0]:
# Play one episode with the current weights and record every rendered frame
# in `animationFrame` so it can be turned into an animation further down.
observation = env.reset()
animationFrame = []
for frame in range(200):
    # Grab the picture before acting so the initial state is part of the clip.
    animationFrame.append(
        env.render(mode = 'rgb_array')
    )

    observation = np.array(np.reshape(observation, [1, 4]), dtype="float32")

    # Same forward pass as in training: three chained matmuls.
    hypo1 = tf.matmul(observation, w1)
    hypo2 = tf.matmul(hypo1, w2)
    predQ = tf.matmul(hypo2, w3)

    # Always act greedily here. The rollout is meant to show the learned
    # policy, so it must not take random exploratory actions, and it must not
    # depend on the exploration rate `e` left behind by the training cell.
    action = np.argmax(predQ)

    # The observation is reshaped/cast at the top of the next iteration.
    observation, reward, done, info = env.step(action)

    if done:
        break


In [0]:
import matplotlib.pyplot as plt
import matplotlib.animation
import numpy as np
from IPython.display import HTML

## 카트폴 게임 이미지를 저장하고, 보여주기

In [0]:
def animate(index):
    """Display recorded frame number ``index`` in the image artist ``patch``.

    Reads the notebook-level ``animationFrame`` list and updates the
    notebook-level ``patch`` in place; returns nothing.
    """
    current_image = animationFrame[index]
    patch.set_data(current_image)

In [0]:
# How many frames the rollout recorded (one per step taken, at most 200).
len(animationFrame)

In [0]:
# Turn the recorded frames into an interactive player embedded in the notebook.
plt.figure(figsize=(8, 5))

# Image artist showing the first frame; animate() swaps its data each tick.
patch = plt.imshow(animationFrame[0])

# One animation step per recorded frame, 50 ms apart, on the figure just created.
ani = matplotlib.animation.FuncAnimation(
    fig=plt.gcf(),
    func=animate,
    frames=len(animationFrame),
    interval=50,
)

HTML(ani.to_jshtml())