## 一个简单的强化学习案例：猜数字是小于50还是大于等于50

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random

# Environment: draws a random integer from 0 to 100; the agent guesses
# whether it is below 50 or at least 50.
class NumberGuessEnv:
    """One-step guessing environment.

    ``reset()`` draws a hidden integer in the inclusive range 0..100 and
    returns it as the observation. ``step(action)`` scores a single guess and
    always ends the episode.
    """

    def __init__(self):
        self.number = None  # current number; stays None until reset() is called
        self.action_space = [0, 1]  # 0 means "< 50", 1 means ">= 50"

    def reset(self):
        """Draw a new number, store it on the environment and return it."""
        drawn = random.randint(0, 100)  # both endpoints are included
        self.number = drawn
        return drawn

    def step(self, action):
        """Score ``action`` against the current number.

        Returns:
            A ``(reward, done)`` tuple. ``reward`` is +1 for a correct guess
            and -1 otherwise; ``done`` is always True.
        """
        if self.number < 50:
            guessed_right = action == 0
        else:
            guessed_right = self.number >= 50 and action == 1
        reward = 1 if guessed_right else -1
        return reward, True

# DQN agent
class DQNAgent:
    """Minimal DQN agent: epsilon-greedy policy plus experience replay.

    The Q-network maps a state of ``state_size`` features to one Q-value per
    action. Transitions are stored with ``remember`` and learned from in
    minibatches with ``replay``.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size  # number of input features per state
        self.action_size = action_size  # number of discrete actions
        self.memory = []  # replay buffer of (state, action, reward, next_state, done)
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile a small fully connected Q-network."""
        model = tf.keras.Sequential()
        model.add(layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        # Linear output: one unbounded Q-value per action.
        model.add(layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def _as_batch(self, states):
        """Stack ``states`` into a float32 array of shape (n, state_size)."""
        return np.asarray(states, dtype=np.float32).reshape(-1, self.state_size)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Returns:
            A plain ``int`` action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every call.
        q_values = self.model.predict(self._as_batch([state]), verbose=0)
        return int(np.argmax(q_values[0]))

    def replay(self, batch_size):
        """Train the Q-network on a random minibatch from the replay buffer.

        The whole minibatch is handled with batched ``predict`` calls and one
        ``fit`` call, i.e. a single gradient step per call, instead of separate
        Keras calls for every sampled transition. Epsilon is decayed once per
        successful call.

        Does nothing when ``batch_size`` is not positive or the buffer holds
        fewer than ``batch_size`` transitions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = self._as_batch(states)
        actions = np.asarray(actions, dtype=np.int64)
        td_targets = np.array(rewards, dtype=np.float32)
        not_done = ~np.asarray(dones, dtype=bool)

        # Bootstrap from the next state only for non-terminal transitions.
        if not_done.any():
            next_q = self.model.predict(
                self._as_batch(next_states)[not_done], verbose=0
            )
            td_targets[not_done] += self.gamma * np.max(next_q, axis=1)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = np.array(self.model.predict(states, verbose=0))
        targets[np.arange(batch_size), actions] = td_targets
        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [5]:
# Training setup: the agent observes a single number and picks one of two actions.
env = NumberGuessEnv()
state_size = 1  # the input is a single number
action_size = 2  # actions: 0 means "< 50", 1 means ">= 50"
agent = DQNAgent(state_size, action_size)
episodes = 1000
batch_size = 32

for e in range(episodes):
    # Each episode is exactly one guess: env.step() always returns done=True,
    # so no inner step loop is needed.
    state = env.reset()
    action = agent.act(state)  # choose an action with the current policy
    reward, done = env.step(action)  # apply it and collect the reward
    # Successor state stored only to complete the transition tuple; because
    # done is always True, the replay target never bootstraps from it.
    next_state = env.reset()
    agent.remember(state, action, reward, next_state, done)

    # Start training as soon as the buffer can supply one full minibatch.
    if len(agent.memory) >= batch_size:
        agent.replay(batch_size)

    if e % 100 == 0:
        print(f"episode: {e}/{episodes}, epsilon: {agent.epsilon:.2}")

episode: 0/1000, epsilon: 1.0


KeyboardInterrupt: 