In [30]:
import gymnasium as gym
import numpy as np
import random
import tensorflow as tf
from collections import deque
import os
from keras.models import load_model

In [35]:
class agent1():
    def create_model(self):
        neural_n= tf.keras.Sequential([
            tf.keras.layers.Dense(512, input_shape=(4,), activation="relu"),
            tf.keras.layers.Dense(512, activation="relu"),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(2, activation="linear")
        ])

        neural_n.compile(optimizer="adam", loss="mse", metrics=["accuracy"])

        return neural_n

    def __init__(self):
        self.env = gym.make("CartPole-v1")
        self.main_nn = self.create_model()
        self.discount_rate=0.9
        self.e=1
        self.decay=0.001
        self.eps= 1000
        self.train_start = 1000
        self.batch_size=64
        self.memory = deque(maxlen=2000)


    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory)> self.train_start:
            self.e-=self.decay

    def act(self, state):
        if random.uniform(0,1)< self.e:
            action = env.action_space.sample()
        else:
            action = np.argmax(self.main_nn.predict(state, verbose=0))

        return action
    
    def untuple(self, state):
        if isinstance(state, tuple):
            state=state[0]

        return state
    
    def replay_memory(self):
        if len(self.memory) < self.train_start:
            return
        
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))

        states = np.zeros((len(minibatch), 4))
        next_states = np.zeros((len(minibatch), 4))
        actions, rewards, dones = [], [], []

        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            states[i] = self.untuple(state)
            next_states[i]= self.untuple(next_state)
            actions.append(int(action))
            rewards.append(reward)
            dones.append(done)

        targets=self.main_nn.predict(states, verbose=0)
        targets_next = self.main_nn.predict(next_states, verbose=0)

        for i in range(self.batch_size):
            if dones[i]:
                targets[i][actions[i]]=rewards[i]
            else:
                targets[i][actions[i]]=rewards[i]+self.discount_rate*np.max(targets_next[i])

        self.main_nn.fit(states, targets, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        self.model = load_model(name)

    def save(self, name):
        self.main_nn.save(name)

    def run(self):
        training=True
        for episode in range(self.eps):
            state = self.env.reset()
            state = self.untuple(state)
            state = np.array(state).reshape((1,4))
            done=False
            i=0
            if not training:
                break
            while not done:
                i+=1
                action= self.act(state)
                next_state, reward, done, info, _ = self.env.step(action)
                if (done and i!=self.env._max_episode_steps-1)or abs(state[0][0])>1:
                    reward = -100
                    done=True
                
                next_state=self.untuple(next_state)
                next_state= np.array(next_state).reshape((1,4))
                self.remember(state, action, reward, next_state, done)
                state = next_state

                if done:
                    print("episode: {}/{}, score: {}".format(episode, self.eps, i))

                    
                    if i>=500:
                        training= False
                        # self.save("cartpole.h5")
                        break

                self.replay_memory()


    def test(self):
        env = gym.make("Cartpole-v1", render_mode="human")
        self.load("cartpole.h5")
        for episode in range(10):
            state = env.reset()
            state = self.untuple(state)
            state = np.array(state).reshape((1,4))
            done = False
            while not done:
                action = np.argmax(self.main_nn.predict(state, verbose=0))
                next_state, action, reward, done, info = env.step(action)
                state = np.array(self.untuple(next_state)).reshape((1,4))

In [36]:
cart_pole = agent1()
cart_pole.run()

episode: 0/1000, score: 12
episode: 1/1000, score: 19
episode: 2/1000, score: 18
episode: 3/1000, score: 15
episode: 4/1000, score: 12
episode: 5/1000, score: 30
episode: 6/1000, score: 23
episode: 7/1000, score: 11
episode: 8/1000, score: 23
episode: 9/1000, score: 30
episode: 10/1000, score: 11
episode: 11/1000, score: 38
episode: 12/1000, score: 44
episode: 13/1000, score: 14
episode: 14/1000, score: 11
episode: 15/1000, score: 26
episode: 16/1000, score: 27
episode: 17/1000, score: 19
episode: 18/1000, score: 21
episode: 19/1000, score: 28
episode: 20/1000, score: 16
episode: 21/1000, score: 22
episode: 22/1000, score: 15
episode: 23/1000, score: 11
episode: 24/1000, score: 12
episode: 25/1000, score: 21
episode: 26/1000, score: 33
episode: 27/1000, score: 13
episode: 28/1000, score: 21
episode: 29/1000, score: 9
episode: 30/1000, score: 16
episode: 31/1000, score: 62
episode: 32/1000, score: 12
episode: 33/1000, score: 24
episode: 34/1000, score: 37
episode: 35/1000, score: 8
epis

KeyboardInterrupt: 

In [None]:
cart_pole.save("final.keras")

In [37]:


env = gym.make("CartPole-v1", render_mode="human")
for episode in range(10):
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    state= np.array(state).reshape((1,4))
    for step in range(1000):
        action = np.argmax(cart_pole.main_nn.predict(state, verbose=0))
        next_state, reward, action, done, _ = env.step(action)
        if isinstance(next_state, tuple):
            next_state = next_state[0]
        state= np.array(next_state).reshape((1,4))

        if done or abs(state[0][0])>1:
            print("episode:{}, score:{}".format(episode, step))
            break

episode:0, score:83


  logger.warn(


episode:1, score:111
episode:2, score:111
episode:3, score:113
episode:4, score:72
episode:5, score:76
episode:6, score:74
episode:7, score:115
episode:8, score:115
episode:9, score:114
