In [None]:
### Step 1: Setup and Dependencies

# Install dependencies if needed (uncomment the next line)
# !pip install gym torch numpy matplotlib

import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import deque



In [None]:
### Step 2: Define the Neural Network

class DQN(nn.Module):
    """Fully connected Q-network: one Q-value per discrete action.

    Layout is ``state_dim -> hidden_dim -> hidden_dim -> action_dim`` with a
    ReLU after each of the two hidden layers and a linear output layer.

    Args:
        state_dim: Number of features in the input state vector.
        action_dim: Number of outputs (one per action).
        hidden_dim: Width of both hidden layers. Defaults to 24, the value
            that used to be hard-coded, so existing ``DQN(state_dim,
            action_dim)`` calls build the same architecture.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=24):
        super().__init__()
        # Attribute names are kept as fc1/fc2/fc3 so state_dict keys are unchanged.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return the network output for ``x``.

        Args:
            x: Tensor whose last dimension has size ``state_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            size ``action_dim``.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output layer: the values are left unbounded.
        return self.fc3(x)



In [None]:
### Step 3: Initialize the Environment

# --- Hyperparameters -------------------------------------------------------
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995

# --- Environment -----------------------------------------------------------
env = gym.make('CartPole-v1')
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]

# --- Networks --------------------------------------------------------------
# The target network starts as an exact copy of the policy network and is put
# in eval mode; only the policy network's parameters are handed to the optimizer.
policy_net = DQN(state_dim, action_dim)
target_net = DQN(state_dim, action_dim)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

criterion = nn.MSELoss()
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)

In [None]:
### Step 4: Preprocess the Input

def preprocess_state(state):
    """Turn one observation into a float32 tensor with a leading batch axis.

    Args:
        state: Anything ``torch.tensor`` accepts (list, NumPy array, scalar).

    Returns:
        A new float32 tensor equal to ``state`` with an extra dimension of
        size 1 inserted at position 0.
    """
    as_float = torch.tensor(state, dtype=torch.float32)
    return torch.unsqueeze(as_float, dim=0)

In [None]:
### Step 5: Experience Replay Buffer

# Bounded store of transitions: once it holds 2000 entries, every append
# drops the oldest one.
replay_buffer = deque(maxlen=2000)


def add_experience(state, action, reward, next_state, done):
    """Append one ``(state, action, reward, next_state, done)`` tuple to ``replay_buffer``."""
    transition = (state, action, reward, next_state, done)
    replay_buffer.append(transition)


def sample_experiences(batch_size=32):
    """Pick ``batch_size`` stored transitions at random, without replacement.

    Returns:
        A list of transition tuples taken from ``replay_buffer``.

    Raises:
        ValueError: If ``replay_buffer`` holds fewer than ``batch_size`` items.
    """
    return random.sample(replay_buffer, k=batch_size)


In [None]:

### Step 6: Train the DQN

def train(batch_size=32):
    """Take one optimizer step on a random minibatch from the replay buffer.

    Does nothing until ``replay_buffer`` holds at least ``batch_size``
    transitions. Otherwise it samples a minibatch, regresses the policy
    network's Q-value for each taken action onto
    ``reward + gamma * max_a target_net(next_state)[a] * (1 - done)``
    using ``criterion``, and applies the resulting gradients with ``optimizer``.

    Args:
        batch_size: Number of transitions per minibatch.
    """
    # Too few stored transitions to fill a minibatch yet.
    if len(replay_buffer) < batch_size:
        return

    minibatch = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Stack per-transition rows into batches; scalars become (batch, 1) columns.
    state_batch = torch.cat([preprocess_state(s) for s in states])
    next_state_batch = torch.cat([preprocess_state(s) for s in next_states])
    action_col = torch.tensor(actions).unsqueeze(1)
    reward_col = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
    done_col = torch.tensor(dones, dtype=torch.float32).unsqueeze(1)

    # The bootstrap value comes from the target network and carries no gradient.
    with torch.no_grad():
        best_next_q = target_net(next_state_batch).max(dim=1, keepdim=True).values
    td_target = reward_col + gamma * best_next_q * (1 - done_col)

    # Q-value the policy network assigns to the action that was actually taken.
    predicted_q = policy_net(state_batch).gather(1, action_col)

    loss = criterion(predicted_q, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
### Step 7: Train the Agent

num_episodes = 500
batch_size = 32
sync_target_steps = 10  # Target-network sync interval, counted in episodes
episode_rewards = []

for episode in range(num_episodes):
    # reset() returns (observation, info) in newer gym releases and a bare
    # observation in older ones; accept both.
    state = env.reset()
    state = state[0] if isinstance(state, tuple) else state
    total_reward = 0
    done = False

    while not done:
        # Epsilon-greedy action selection.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = torch.argmax(policy_net(preprocess_state(state))).item()

        # step() returns 5 values (obs, reward, terminated, truncated, info) in
        # newer gym releases and 4 values (obs, reward, done, info) in older ones.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            next_state, reward, terminated, _ = step_result
            truncated = False

        # The episode ends on either signal. Ignoring `truncated` would let an
        # episode run past the environment's time limit for as long as the pole
        # stays up.
        done = terminated or truncated

        # Only `terminated` is stored, so the bootstrap term in train() is
        # zeroed for real terminal states but not for time-limit cut-offs.
        add_experience(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        train(batch_size)

    # Decay exploration once per episode, never below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    episode_rewards.append(total_reward)

    if episode % sync_target_steps == 0:
        target_net.load_state_dict(policy_net.state_dict())

    if episode % 50 == 0:
        print(f"Episode {episode}: Reward = {total_reward}")


In [None]:
### Step 8: Visualize the Results

# Per-episode total reward over the course of training.
plt.plot(episode_rewards)
plt.gca().set(
    xlabel='Episode',
    ylabel='Total Reward',
    title='Training Progress',
)
plt.show()

### Step 9: Play the Game with the Trained Model

def play():
    """Run one greedy episode with ``policy_net`` and print its total reward.

    Every step takes the argmax action of ``policy_net`` (no exploration).
    The episode stops when the environment reports termination or, on the
    5-value step API, truncation. The module-level ``env`` is closed when the
    episode finishes, including when it raises.
    """
    state = env.reset()
    state = state[0] if isinstance(state, tuple) else state
    done = False
    total_reward = 0

    try:
        while not done:
            # NOTE(review): on newer gym releases rendering is configured via
            # `render_mode` in gym.make(); confirm this call displays anything.
            env.render()
            with torch.no_grad():
                action = torch.argmax(policy_net(preprocess_state(state))).item()

            # Accept both the 5-value and the older 4-value step() result.
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                # Stop on the time limit too, not only when the pole falls.
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            total_reward += reward
    finally:
        env.close()

    print(f'Total Reward: {total_reward}')

# Run one demonstration episode with the trained policy network.
play()
