In [0]:
import gym
from gym.envs.registration import register
import numpy as np
import random as pr

## 랜덤 프로즌레이크 게임

In [0]:
# Register the 4x4 FrozenLake map with slippery movement turned on under the
# id "FrozenLake-v1", so that gym.make() can build it by that id.
register(
    id="FrozenLake-v1",
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs=dict(map_name="4x4", is_slippery=True),
)

In [0]:
# Build the environment by its id; presumably this resolves to the
# registration made in the previous cell — confirm for the installed gym.
env = gym.make("FrozenLake-v1")
# Show the environment as it stands right after construction
# (no reset() call precedes this within the cell).
env.render()

In [0]:
# Start a fresh episode and keep what reset() returns. The training loops
# below use this value as the row index into Q, so it is presumably the
# integer id of the starting tile — confirm for the installed gym version.
position= env.reset()
# Bare expression: the notebook shows the value as the cell output.
position

## Q 함수를 활용한 랜덤 Frozen Lake 학습

In [0]:
# Tabular Q-learning with epsilon-greedy exploration and no learning rate:
# every visited entry is overwritten with the latest one-step target.
Q = np.zeros([16, 4])  # indexed as Q[position, action]
discount = 0.99
total_reward_list = []  # one [index, total_reward] pair per episode

for index in range(2000):
    print("index : ", index)
    position = env.reset()
    total_reward = 0

    # The exploration probability depends only on the episode number:
    # 1.0 for episodes 0-99, then 1/2, 1/3, ... for each further block of 100.
    e = 1.0 / (index // 100 + 1)

    done = False
    while not done:
        env.render()
        print()

        # Explore with probability e, otherwise take the greedy action.
        explore = np.random.rand(1) < e
        action = pr.choice([0, 1, 2, 3]) if explore else np.argmax(Q[position, :])

        new_position, reward, done, info = env.step(action)

        # One-step target: immediate reward plus the discounted best value
        # of the state we landed in.
        target = reward + discount * np.max(Q[new_position, :])
        Q[position, action] = target

        total_reward += reward
        position = new_position

    total_reward_list.append([index, total_reward])
    print(f"Q: {Q}")
    print("=" * 100)
    print(f"total_reward = {total_reward}")

## 랜덤한 상황에서는 학습이 제대로 진행이 안되는 것을 확인할 수 있음
## 성공 비율이 24/1000

In [0]:
import matplotlib.pyplot as plt
# Transpose [[index, total_reward], ...] into (indices, rewards).
result_list = list(zip(*total_reward_list))
episode_rewards = result_list[1]

# Sum of the per-episode rewards collected by the training loop.
print(sum(episode_rewards))

# One bar per episode; the bar height is that episode's total reward.
episode_axis = range(len(total_reward_list))
plt.bar(episode_axis, episode_rewards, color="blue")

## 랜덤으로 이동하는 경우가 발생하기 때문에 최단경로는 의미가 사라짐
## 그래서 Goal로 가는 확률이 높은 것(Goal과 가까운 위치로 향하는 것)을 Q 함수 값으로 설정한다

## 기존 Q 값과 이번 행동으로 새로 얻은 추정값(보상 + 할인된 다음 상태의 최대 Q 값)에 learning rate로 가중치를 주어 학습시킨다

In [0]:
# Tabular Q-learning with a learning rate: each update blends the stored
# value with the new one-step estimate instead of overwriting it.
Q = np.zeros([16, 4])  # indexed as Q[position, action]
weight = 0.99          # multiplier on the next state's best Q value
learning_rate = 0.85   # share of the new estimate in each update
total_reward_list = []  # one [index, total_reward] pair per episode

for index in range(2000):
    print("index : ", index)
    position = env.reset()
    total_reward = 0

    done = False
    while not done:
        # Exploration by noise: Gaussian noise is added to the Q row and
        # shrinks as 1 / (index + 1) as the episodes go on.
        rand_arr = np.random.randn(1, 4)
        noise = rand_arr / (index + 1)
        action = np.argmax(Q[position, :] + noise)

        new_position, reward, done, info = env.step(action)

        # Weighted average of the old value and the new one-step estimate.
        old_estimate = Q[position, action]
        new_estimate = reward + weight * np.max(Q[new_position, :])
        Q[position, action] = ((1 - learning_rate) * old_estimate) + (learning_rate * new_estimate)

        total_reward += reward
        position = new_position

    total_reward_list.append([index, total_reward])
    print(f"Q: {Q}")
    print("=" * 100)
    print(f"total_reward = {total_reward}")

In [0]:
import matplotlib.pyplot as plt
# Transpose [[index, total_reward], ...] into (indices, rewards).
result_list = list(zip(*total_reward_list))
episode_rewards = result_list[1]

# Sum of the per-episode rewards collected by the training loop.
reward_sum = sum(episode_rewards)
print(reward_sum)

# One bar per episode; the bar height is that episode's total reward.
plt.bar(range(len(total_reward_list)), episode_rewards, color="blue")

# Mean reward per episode.
print(f"score : {reward_sum / len(episode_rewards)}")