In [0]:
import gym
from gym.envs.registration import register
import numpy as np
import random as pr

## 랜덤 프로즌레이크 게임

In [0]:
# Register a FrozenLake variant under the id "FrozenLake-v1": the 4x4 map with
# slipping disabled, backed by gym's FrozenLakeEnv entry point.
register(
    id="FrozenLake-v1",
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs=dict(map_name="4x4", is_slippery=False),
)

In [0]:
# Instantiate the environment registered under the id "FrozenLake-v1" and
# render its current state.
env = gym.make("FrozenLake-v1")
env.render()

In [0]:
import tensorflow as tf
# Turn on eager execution (TF 1.x API); the training cell reads tensor values
# directly via `.numpy()`, which requires eager mode.
tf.enable_eager_execution()

## NN을 위한 weight 값 설정

In [0]:
# Trainable 16x4 weight matrix, initialised uniformly in [0, 0.1). It is
# multiplied by a 1x16 one-hot state vector to produce four Q-values.
w = tf.Variable(tf.random_uniform(shape=[16, 4], minval=0, maxval=0.1))
w

In [0]:
# Adam optimizer used to update the weight matrix during training.
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

In [0]:
def one_hot(num, size=16):
    """Encode an integer index as a one-hot row vector.

    Args:
        num: Index to encode; must satisfy ``0 <= num < size``.
        size: Length of the encoding. Defaults to 16, the width used by the
            rest of this notebook.

    Returns:
        A float32 NumPy array of shape ``(1, size)`` holding 1.0 at column
        ``num`` and 0.0 elsewhere.

    Raises:
        ValueError: If ``num`` lies outside ``[0, size)``. (Slicing an identity
            matrix, as done previously, silently produced an empty array for
            such inputs.)
    """
    if not 0 <= num < size:
        raise ValueError(f"index {num} is outside the valid range [0, {size})")
    # Build the single row directly instead of allocating a size x size
    # identity matrix on every call.
    encoded = np.zeros((1, size), dtype="float32")
    encoded[0, num] = 1.0
    return encoded

## 텐서플로우와 Q 함수를 활용한 랜덤 Frozen Lake 학습

In [0]:
# Train the single-layer Q-network with epsilon-greedy Q-learning.
# `successList` collects [episode index, final reward] for every finished episode.
discount = 0.99
successList = []
for index in range(2000):
    position = env.reset()
    # Exploration rate shrinks stepwise: 1, 1/2, 1/3, ... every 100 episodes.
    e = 1.0 / ((index // 100) + 1)
    done = False
    while not done:
        # Only the forward pass and the loss need to be recorded on the tape;
        # the gradient computation and weight update happen after it closes.
        with tf.GradientTape() as tape:
            predQ = tf.matmul(one_hot(position), w)

            # Epsilon-greedy action selection.
            if np.random.rand(1) < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(predQ)

            new_position, reward, done, info = env.step(action)

            # Start the regression target from the current prediction so only
            # the taken action contributes to the loss. The explicit copy
            # matters: in some TF 1.x releases `.numpy()` may return a buffer
            # shared with the tensor, and editing it in place would then also
            # change `predQ` and zero out the loss.
            target = predQ.numpy().copy()
            if done:
                # Terminal transition: no bootstrapped future value.
                target[0, action] = reward
                successList.append([index, reward])
            else:
                nextQ = tf.matmul(one_hot(new_position), w)
                target[0, action] = reward + discount * np.max(nextQ)

            # Keep the name `Qs`: a later cell displays the last target tensor.
            Qs = tf.convert_to_tensor(target, np.float32)
            cost = tf.reduce_sum(tf.square(Qs - predQ))

        grads = tape.gradient(cost, [w])
        optimizer.apply_gradients(grads_and_vars=zip(grads, [w]))
        position = new_position


In [0]:
# Display the target tensor left over from the final training step.
Qs

## 학습 중 목적지에 도착한 횟수 출력 및 에피소드별 보상 시각화

In [0]:
import matplotlib.pyplot as plt

# Transpose [[episode, reward], ...] into (episodes, rewards).
result_list = list(zip(*successList))
episode_rewards = result_list[1]

# Sum of the final rewards over all finished episodes.
print(sum(episode_rewards))

# One bar per finished episode, bar height = that episode's final reward.
plt.bar(range(len(successList)), episode_rewards, color="blue")

## 학습된 모델을 가지고 게임 재수행

In [0]:
# Replay one episode by always taking the highest-valued action of the
# trained network (no exploration).
#
# The environment registered in this notebook is non-slippery and has no step
# limit, so a greedy policy that never reaches a terminal state would repeat
# the same moves forever. Cap the number of steps to guarantee termination.
max_steps = 100

position = env.reset()
done = False
for _ in range(max_steps):
    env.render()

    predQ = tf.matmul(one_hot(position), w)
    action = np.argmax(predQ)

    new_position, reward, done, info = env.step(action)
    position = new_position

    if done:
        break
else:
    print(f"Episode did not finish within {max_steps} steps; stopping replay.")

# Show the board after the last move; the render at the top of the loop only
# ever shows the state before each step.
env.render()
        