In [12]:

# frozen lake - v3 4x4

import gym
import numpy as np

# FrozenLake 환경 생성
env = gym.make('FrozenLake-v3', render_mode="human")

# Q-테이블 초기화
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

# 학습 파라미터 설정
num_episodes = 2000
max_steps_per_episode = 100
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# 총 reward 값을 저장할 변수
total_rewards = 0

# Q-학습 알고리즘 실행
for episode in range(num_episodes):
    state = env.reset()
    state = state[0]
    done = False
    episode_reward = 0  # 에피소드의 총 보상
    
    for step in range(max_steps_per_episode):
        # 무작위로 탐색하거나, 학습된 Q-값에 따라 행동 선택
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(Q[state, :])
        else:
            action = env.action_space.sample()
        
        # 선택한 행동 수행 후 다음 상태와 보상 얻기
        new_state, reward, done, _, info = env.step(action)
        
        # Q-값 업데이트
        Q[state, action] = (1 - learning_rate) * Q[state, action] + learning_rate * (reward + discount_factor * np.max(Q[new_state, :]))
        
        state = new_state
        episode_reward += reward
        
        if done:
            break
    
    # 탐색률 감소
    exploration_rate = min_exploration_rate + (1 - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    
    # 에피소드의 총 보상 저장
    total_rewards += episode_reward

    # 매 1000 에피소드마다 총 reward 출력
    if episode % 100 == 0:
        print(f"Episode = {episode}, Episode Reward = {episode_reward}, Total Reward = {total_rewards}")

# 학습된 Q-테이블 출력
print(Q)
print("exploretion_rate", exploration_rate)
# 총 reward 값의 평균 출력
print("Average Total Reward:", np.sum(total_rewards))


Episode = 0, Episode Reward = 2.0, Total Reward = 2.0
Episode = 100, Episode Reward = 0.0, Total Reward = 10.0
Episode = 200, Episode Reward = 0.0, Total Reward = 22.0
Episode = 300, Episode Reward = 0.0, Total Reward = 46.0
Episode = 400, Episode Reward = 0.0, Total Reward = 64.0
Episode = 500, Episode Reward = 0.0, Total Reward = 92.0
Episode = 600, Episode Reward = 0.0, Total Reward = 158.0
Episode = 700, Episode Reward = 0.0, Total Reward = 236.0
Episode = 800, Episode Reward = 2.0, Total Reward = 316.0
Episode = 900, Episode Reward = 2.0, Total Reward = 422.0
Episode = 1000, Episode Reward = 0.0, Total Reward = 512.0
Episode = 1100, Episode Reward = 2.0, Total Reward = 614.0
Episode = 1200, Episode Reward = 0.0, Total Reward = 744.0
Episode = 1300, Episode Reward = 2.0, Total Reward = 882.0
Episode = 1400, Episode Reward = 2.0, Total Reward = 1004.0
Episode = 1500, Episode Reward = 2.0, Total Reward = 1138.0
Episode = 1600, Episode Reward = 0.0, Total Reward = 1290.0
Episode = 170

In [61]:
#Q 가져오기  4x4

import gym
import numpy as np

env = gym.make('FrozenLake-v3', render_mode="human")

# Q-테이블 초기화
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.array([[1.8829603,  1.86413069, 1.9019801,  1.8829603 ],
 [1.8829603,  0.       ,  1.92119202, 1.9019801 ],
 [1.9019801,  1.940598,   1.90198008, 1.92119202],
 [1.92119202, 0.      ,   1.70017954, 1.67209124],
 [1.8189107,  1.28173076, 0.      ,   1.8829603 ],
 [0.   ,      0.      ,   0.   ,      0.        ],
 [0.     ,    1.9602 ,    0.     ,    1.92119201],
 [0.      ,   0.     ,    0.      ,   0.        ],
 [0.66179537, 0.     ,    0.44207516, 1.75504762],
 [1.06261958, 1.96019933, 1.81209797, 0.        ],
 [1.94059086, 1.98 ,      0.  ,       1.94059705],
 [0.     ,    0.     ,    0.     ,    0.        ],
 [0.      ,   0.     ,    0.   ,      0.        ],
 [0.      ,   1.78163296, 1.98    ,   1.80069827],
 [1.96019826, 1.97999884, 2.    ,     1.96019982],
 [0.      ,   0.     ,    0.       ,  0.        ]])

# 학습 파라미터 설정
num_episodes = 2100
max_steps_per_episode = 100
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 0.14411597934795192
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# 총 reward 값을 저장할 변수
total_rewards = 0

# Q-학습 알고리즘 실행
for episode in range(2000,num_episodes):
    state = env.reset()
    state = state[0]
    done = False
    episode_reward = 0  # 에피소드의 총 보상
    
    for step in range(max_steps_per_episode):
        # 무작위로 탐색하거나, 학습된 Q-값에 따라 행동 선택
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(Q[state, :])
        else:
            action = env.action_space.sample()
        
        # 선택한 행동 수행 후 다음 상태와 보상 얻기
        new_state, reward, done, _, info = env.step(action)
        
        # Q-값 업데이트
        Q[state, action] = (1 - learning_rate) * Q[state, action] + learning_rate * (reward + discount_factor * np.max(Q[new_state, :]))
        
        state = new_state
        episode_reward += reward
        
        if done:
            break
    
    # 탐색률 감소
    exploration_rate = min_exploration_rate + (1 - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    
    # 에피소드의 총 보상 저장
    total_rewards += episode_reward

    # 매 1000 에피소드마다 총 reward 출력
    if episode % 10 == 0:
        print(f"Episode = {episode}, Episode Reward = {episode_reward}, Total Reward = {total_rewards}")

# 학습된 Q-테이블 출력
print(Q)

# 총 reward 값의 평균 출력
print("Average Total Reward:", np.sum(total_rewards))


Episode = 2000, Episode Reward = 2.0, Total Reward = 2.0
Episode = 2010, Episode Reward = 2.0, Total Reward = 20.0
Episode = 2020, Episode Reward = 2.0, Total Reward = 38.0
Episode = 2030, Episode Reward = 2.0, Total Reward = 54.0
Episode = 2040, Episode Reward = 2.0, Total Reward = 74.0
Episode = 2050, Episode Reward = 2.0, Total Reward = 92.0
Episode = 2060, Episode Reward = 2.0, Total Reward = 110.0
Episode = 2070, Episode Reward = 2.0, Total Reward = 128.0
Episode = 2080, Episode Reward = 2.0, Total Reward = 144.0
Episode = 2090, Episode Reward = 2.0, Total Reward = 160.0
[[1.8829603  1.86413069 1.9019801  1.8829603 ]
 [1.8829603  0.         1.92119202 1.9019801 ]
 [1.9019801  1.940598   1.90198009 1.92119202]
 [1.92119202 0.         1.70017954 1.67209124]
 [1.8234327  1.28173076 0.         1.8829603 ]
 [0.         0.         0.         0.        ]
 [0.         1.9602     0.         1.92119201]
 [0.         0.         0.         0.        ]
 [0.66179537 0.         0.44207516 1.7550

In [67]:
env.env

SyntaxError: invalid syntax (2951356808.py, line 1)

## 예외

In [2]:

# reward 오류
import numpy as np
import gym
import time

total_episodes = 50
learning_rate = 0.8
discount_factor = 0.95
epsilon = 1

env=gym.make("FrozenLake-v1", render_mode="human")
Q = np.zeros([env.observation_space.n, env.action_space.n])

for i in range(total_episodes):
    state, _ = env.reset()
    # state = state[0]
    done = False
    env.render()
    print(f'실행 횟수 = {i}')

    while not done:
        t = 0
        if np.random.uniform(0, 1) < epsilon:
            if epsilon > .05:
                epsilon -= .001
            action = env.action_space.sample()  # 무작위 행동 선택
        else:
            action = np.argmax(Q[state,:])  # 탐욕적 행동 선택
        
        next_state, reward, done, _, _ = env.step(action)
        
        # Q-Learning 업데이트
        Q[state, action] = (1 - learning_rate) * Q[state, action] + \
                           learning_rate * (reward + discount_factor * np.max(Q[next_state, :]))
        
        state = next_state
        t += 1

        if done and reward == 0.0:
            reward -= 1
        env.render()
        print('state: {} and action {} and reward {}'.format(state,action, reward))
        # print(Q)
        time.sleep(0.001)
        if done:
            break
    env.P[0][1]
    
env.close()

실행 횟수 = 0
state: 0 and action 1 and reward 0.0
state: 1 and action 3 and reward 0.0
state: 2 and action 3 and reward 0.0
state: 2 and action 3 and reward 0.0
state: 3 and action 3 and reward 0.0
state: 3 and action 3 and reward 0.0
state: 7 and action 1 and reward -1.0
실행 횟수 = 1
state: 4 and action 1 and reward 0.0
state: 4 and action 0 and reward 0.0
state: 0 and action 3 and reward 0.0
state: 1 and action 3 and reward 0.0
state: 2 and action 3 and reward 0.0
state: 1 and action 0 and reward 0.0
state: 0 and action 0 and reward 0.0
state: 0 and action 0 and reward 0.0
state: 0 and action 3 and reward 0.0
state: 1 and action 2 and reward 0.0
state: 1 and action 3 and reward 0.0
state: 5 and action 0 and reward -1.0
실행 횟수 = 2
state: 0 and action 3 and reward 0.0
state: 0 and action 0 and reward 0.0
state: 0 and action 1 and reward 0.0
state: 0 and action 3 and reward 0.0
state: 1 and action 3 and reward 0.0
state: 2 and action 1 and reward 0.0
state: 1 and action 3 and reward 0.0
state:

In [36]:
import gym
import numpy as np
import time

# Q-Learning 알고리즘을 위한 하이퍼파라미터 설정
total_episodes = 50000
learning_rate = 0.8
discount_factor = 0.95
epsilon = 0.1

# FrozenLake 환경 생성
env = gym.make("FrozenLake-v1")

# Q 테이블 초기화
Q = np.zeros([env.observation_space.n, env.action_space.n])

# 학습 과정
for episode in range(10000):
    state = env.reset()
    state = state[0]
    done = False
    
    while not done:
        # epsilon-greedy 정책에 따라 행동 선택
        if np.random.uniform(0, 1) > epsilon:
            action = env.action_space.sample()  # 무작위 행동 선택
            # print("if=>", action)
        else:
            action = np.argmax(Q[state,:])  # 탐욕적 행동 선택
            # print("else=>", action)
            
        print("=>", action)
        next_state, reward, done, _,_ = env.step(action)
        
        # Q-Learning 업데이트
        Q[state, action] = (1 - learning_rate) * Q[state, action] + \
                           learning_rate * (reward + discount_factor * np.max(Q[next_state, :]))
        
        state = next_state
        print('Iteration: and action {}'.format(action))
        time.sleep(0.001)
    
    if episode % 1000 == 0:
        print("Episode:", episode)

# 학습된 Q 테이블 출력
print("Learned Q table:")
print(Q)

env.close()


=> 2
Iteration: and action 2
=> 3
Iteration: and action 3
Episode: 0
=> 2
Iteration: and action 2
=> 3
Iteration: and action 3
=> 0
Iteration: and action 0
=> 0
Iteration: and action 0
=> 0
Iteration: and action 0
=> 3
Iteration: and action 3
=> 2
Iteration: and action 2
=> 3
Iteration: and action 3
=> 2
Iteration: and action 2
=> 0
Iteration: and action 0
=> 3
Iteration: and action 3
=> 3
Iteration: and action 3
=> 0
Iteration: and action 0
=> 0
Iteration: and action 0
=> 3
Iteration: and action 3
=> 3
Iteration: and action 3
=> 1
Iteration: and action 1
=> 1
Iteration: and action 1
=> 0
Iteration: and action 0
=> 1
Iteration: and action 1
=> 2
Iteration: and action 2
=> 2
Iteration: and action 2
=> 0
Iteration: and action 0
=> 2
Iteration: and action 2
=> 0
Iteration: and action 0
=> 0
Iteration: and action 0
=> 0
Iteration: and action 0
=> 3
Iteration: and action 3
=> 0
Iteration: and action 0
=> 3
Iteration: and action 3
=> 3
Iteration: and action 3
=> 1
Iteration: and action 1
=> 

KeyboardInterrupt: 

In [7]:
import numpy as np
import gym
import random

env = gym.make('FrozenLake-v0')
action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size, action_size))

total_episodes = 1000
learning_rate = 0.8
max_steps = 99
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01

rewards = []
for episode in range(total_episodes):
    state = env.reset()
    state = state[0]
    total_rewards = 0

    for step in range(max_steps):
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state])
        else:
            action = env.action_space.sample()

        new_state, reward, done,_, info = env.step(action)
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action])

        state = new_state
        total_rewards += reward
        if done: break

    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1))
    rewards.append(total_rewards)

    print('[*] episode {}, total reward {}, average score {}'.format(episode, total_rewards, sum(rewards)/(episode+1)))

print(qtable)

# Play the game

for episode in range(1):
    state = env.reset()
    state = state[0]
    print('*'*20)
    print('EPISODE ', episode)

    for step in range(max_steps):
        env.render()
        action = np.argmax(qtable[state])
        input()
        state, reward, done, _,info = env.step(action)
        if done: break

env.close()


  logger.warn(


DeprecatedEnv: Environment version v0 for `FrozenLake` is deprecated. Please use `FrozenLake-v3` instead.

In [37]:

p = min(1, 0.8)
board = np.random.choice(["F", "H"], (8, 8), p=[p, 1 - p])
# print(board[0][0])

n = 0
hole = []
for i in board:
    for j in range(len(i)):
        if i[j] == "H":
            hole.append([n,j])
            # print(i)
            # print()
    n +=1

In [38]:
print(hole)

[[0, 2], [1, 4], [1, 5], [2, 0], [4, 3], [5, 4], [5, 5], [6, 0], [6, 2], [6, 4], [7, 1], [7, 5]]
