## Installing libraries 

In [1]:
!pip install tensorflow
!pip install gym
!pip install keras
!pip install keras-rl2

Collecting tensorflow
  Downloading tensorflow-2.13.0rc0-cp311-cp311-macosx_12_0_arm64.whl (2.0 kB)
Collecting tensorflow-macos==2.13.0-rc0 (from tensorflow)
  Downloading tensorflow_macos-2.13.0rc0-cp311-cp311-macosx_12_0_arm64.whl (189.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 189.3/189.3 MB 1.4 MB/s eta 0:00:00
Collecting absl-py>=1.0.0 (from tensorflow-macos==2.13.0-rc0->tensorflow)
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Collecting astunparse>=1.6.0 (from tensorflow-macos==2.13.0-rc0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.1.21 (from tensorflow-macos==2.13.0-rc0->tensorflow)
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-macos==2.13.0-rc0->tensorflow)
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-macos==2.13.0-rc0->te

## Importing the simulation library (Gym)

In [22]:
import gym
import random

## Creating FrozenLake environment 

In [24]:
# Build the FrozenLake-v1 environment with on-screen ("human") rendering,
# reset it so it is in a valid starting state, and request a render of
# that initial state.
env = gym.make(
    "FrozenLake-v1",
    render_mode="human",
)
env.reset()
env.render()

## Importing libraries for Neural Network 

In [25]:
import numpy as np
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.optimizers import Adam

- `discount_factor`: The discount factor (γ) used when computing the target value for updating the neural network. The best predicted value of the next state is multiplied by this factor before being added to the immediate reward, so values closer to 1 give more weight to future rewards.
- `eps`: The exploration rate (epsilon). It is the probability that the agent takes a random action instead of the action the network currently rates highest. A higher `eps` increases the likelihood of exploration rather than exploitation.
- `eps_decay_factor`: The decay factor applied to the exploration rate: `eps` is multiplied by it once per episode. This reduces the exploration rate over time, allowing the agent to gradually shift from exploration to exploitation. A value less than 1 (but close to 1) is typically used.
- `num_episodes`: The number of episodes for which the agent interacts with the environment and learns from it. Each episode consists of multiple steps, and the agent updates its neural network after every step based on the observed reward and transition.

In [26]:
# Q-learning hyperparameters.
num_episodes = 100        # how many episodes the agent plays
discount_factor = 0.95    # weight given to the next state's estimated value
eps_decay_factor = 0.999  # eps is multiplied by this once per episode
eps = 0.5                 # initial probability of choosing a random action

## Creating model

In [27]:
# Q-network: takes a single row of length `env.observation_space.n`
# (batch size fixed at 1), passes it through one 20-unit ReLU hidden layer,
# and emits one linear output per action in `env.action_space`.
model = Sequential([
    InputLayer(batch_input_shape=(1, env.observation_space.n)),
    Dense(20, activation='relu'),
    Dense(env.action_space.n, activation='linear'),
])

# Mean-squared-error regression on the target vector, optimised with Adam;
# mean absolute error is tracked as an extra metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [28]:
# Train the Q-network with one-step Q-learning and an epsilon-greedy policy.
#
# Reads `env`, `model`, `num_episodes`, `discount_factor`, `eps_decay_factor`
# and reads/updates the global `eps` (it is decayed once per episode, so
# re-running this cell continues from the already-decayed value).

# Row s of the identity matrix is the one-hot input for state s. Built once
# instead of on every predict/fit call.
state_encodings = np.identity(env.observation_space.n)

for episode in range(num_episodes):
    state = env.reset()[0]
    eps *= eps_decay_factor
    done = False
    while not done:
        state_input = state_encodings[state:state + 1]

        # Q-values of the current state. The weights do not change until the
        # fit() below, so this single prediction serves both the greedy
        # action choice and the training target vector. verbose=0 keeps
        # Keras from printing a progress bar for every call.
        q_values = model.predict(state_input, verbose=0)[0]

        if np.random.random() < eps:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_values)

        new_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the environment reaches a terminal state
        # OR when its step limit cuts the episode short.
        done = terminated or truncated

        if terminated:
            # A terminal state has no future return, so do not bootstrap
            # from the network's estimate of it.
            target = reward
        else:
            next_q_values = model.predict(
                state_encodings[new_state:new_state + 1], verbose=0
            )
            target = reward + discount_factor * np.max(next_q_values)

        # Only the taken action's value moves toward the target; the other
        # outputs keep their current predictions, so their error is zero.
        target_vector = q_values.copy()
        target_vector[action] = target
        model.fit(
            state_input,
            target_vector.reshape(-1, env.action_space.n),
            epochs=1,
            verbose=0,
        )
        state = new_state

env.close()

