In [1]:
import gym
import random
import numpy as np
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

2023-01-31 15:01:33.594464: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
env = gym.make('CartPole-v0')
states = env.observation_space.shape[0]
actions = env.action_space.n

## before training

In [4]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    # env.reset(): Step을 실행하다가 epsiode가 끝나서 이를 초기화해서 재시작해야할 때, 초기 State를 반환
    # 새로운 에피소드(initial environment)를 불러온다.(reset)
    done = False
    score = 0
    
    while not done:
        env.render()
        # env.render(): Graphic User Interface (GUI)로 현재 진행상황을 출력하는 함수
        # 행동(action)을 취하기 이전에 환경에 대해 얻은 관찰값(observation)적용하여 그린다.
        action = random.choice([0, 1])
        # random.choice([0, 1]): 0과 1중 하나를 랜덤으로 뽑아준다.
        n_state, reward, done, info = env.step(action)
        # env.step(): 행동(action)을 취하기 이후에 환경에 대해 얻은 관찰값(observation)적용하여 제어
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:18.0
Episode:2 Score:15.0
Episode:3 Score:19.0
Episode:4 Score:41.0
Episode:5 Score:50.0
Episode:6 Score:13.0
Episode:7 Score:35.0
Episode:8 Score:27.0
Episode:9 Score:21.0
Episode:10 Score:15.0


### build model

In [6]:
def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape = (1, states)))
    model.add(Dense(24, activation = 'relu'))
    # 출력 뉴런수 24개, 활성화 함수 'relu'
    model.add(Dense(24, activation = 'relu'))
    model.add(Dense(actions, activation = 'linear'))
    
    return model

In [7]:
model = build_model(states, actions)

### build agent

In [9]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit = 10000, window_length = 1)
    dqn = DQNAgent(model = model, memory = memory, policy = policy, 
                   nb_actions = actions, nb_steps_warmup = 10, target_model_update = 1e-2)
    
    return dqn


### training model

In [11]:
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 22.814 seconds


<keras.callbacks.History at 0x7f7d608d7310>

### test model

In [13]:
scores = dqn.test(env, nb_episodes = 10, visualize = True)

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
