# FROM CHATGPT

# In [None]:
import random
import numpy as np
import tensorflow as tf

class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    The online network (``self.model``) is trained on mini-batches sampled from
    ``self.replay_memory``; the target network (``self.target_model``) supplies
    the bootstrap values and is only refreshed when ``update_target_model`` is
    called.
    """

    def __init__(self, state_size, action_size, gamma=0.95, learning_rate=0.001, batch_size=32, replay_memory_size=10000):
        """Store the hyper-parameters and build the online and target networks.

        Args:
            state_size: Number of features in one (flat) state vector.
            action_size: Number of discrete actions.
            gamma: Discount factor applied to the bootstrapped next-state value.
            learning_rate: Learning rate handed to the Adam optimizer.
            batch_size: Number of transitions sampled per ``learn`` call.
            replay_memory_size: Maximum number of transitions kept in memory.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.replay_memory = []
        self.model = self.build_model()
        self.target_model = self.build_model()
        # Start with both networks holding identical weights.
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network: two 32-unit ReLU layers and a linear head.

        Returns:
            A compiled Keras model mapping ``(batch, state_size)`` inputs to
            ``(batch, action_size)`` Q-value estimates, trained with MSE loss.
        """
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear'),
        ])
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(
            loss='mse',
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
        )
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest once the buffer is full.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward received for the action.
            next_state: Observation after the action.
            done: Truthy when ``next_state`` is terminal.
        """
        self.replay_memory.append((state, action, reward, next_state, done))
        if len(self.replay_memory) > self.replay_memory_size:
            self.replay_memory.pop(0)

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single state, either flat ``(state_size,)`` or already
                batched as ``(1, state_size)``.
            epsilon: Probability threshold for picking a uniformly random action.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= epsilon:
            return random.randrange(self.action_size)
        # The network expects a batch axis; accept flat states as well.
        batched_state = np.asarray(state).reshape(1, self.state_size)
        q_values = self.model.predict(batched_state, verbose=0)
        return int(np.argmax(q_values[0]))

    def learn(self):
        """Run one training step on a random mini-batch from the replay memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        For each sampled transition the regression target of the taken action
        is ``reward`` when the transition is terminal, and
        ``reward + gamma * max_a Q_target(next_state, a)`` otherwise; the
        targets of all other actions are left at the online network's current
        predictions so they contribute no error.
        """
        if len(self.replay_memory) < self.batch_size:
            return
        batch = random.sample(self.replay_memory, self.batch_size)

        # States may have been stored flat or as (1, state_size) rows; either
        # way the network needs a (batch, state_size) matrix.
        states = np.array([transition[0] for transition in batch]).reshape(-1, self.state_size)
        actions = np.array([transition[1] for transition in batch], dtype=int)
        rewards = np.array([transition[2] for transition in batch], dtype=float)
        next_states = np.array([transition[3] for transition in batch]).reshape(-1, self.state_size)
        # Force a boolean mask so 0/1 flags are not mistaken for row indices.
        dones = np.array([transition[4] for transition in batch], dtype=bool)

        q_values = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        max_next_q_values = np.max(next_q_values, axis=1)

        # Terminal transitions must not bootstrap from the next state.
        targets = np.where(dones, rewards, rewards + self.gamma * max_next_q_values)

        indices = np.arange(len(batch))
        q_values[indices, actions] = targets
        self.model.fit(states, q_values, batch_size=self.batch_size, verbose=0)