In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

class FlappyBirdMultiAgentRL:
    """Ensemble of Q-networks that jointly control a single Flappy Bird game.

    Every agent is a small fully connected network mapping an observation to
    one value per action.  At each step every agent samples an action from a
    softmax over its outputs, the most frequent action is executed, and all
    agents are then trained on that same transition with a one-step
    Q-learning target.

    NOTE(review): ``gym.make('FlappyBird-v0')`` presumably requires the package
    that registers this environment to be imported somewhere first -- confirm.
    """

    # Reward shaping added to the environment reward on every step.
    SURVIVAL_BONUS = 5
    CRASH_PENALTY = -100000

    def __init__(self, num_agents=3, learning_rate=0.001, discount_factor=0.99):
        """Create the environment plus one network/optimizer pair per agent.

        Args:
            num_agents: Number of independent Q-networks that vote on actions.
            learning_rate: Adam learning rate used for every agent.
            discount_factor: Discount applied to the bootstrapped next-state value.
        """
        self.env = gym.make('FlappyBird-v0')
        self.num_agents = num_agents
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        # Size the networks from the environment instead of hard-coding them,
        # so a different observation layout does not break the first forward
        # pass.  Observations are flattened before being fed to a network.
        self.state_dim = int(np.prod(self.env.observation_space.shape))
        self.num_actions = int(self.env.action_space.n)

        # Initialize neural networks for each agent
        self.agents = [self._build_network() for _ in range(num_agents)]
        self.optimizers = [
            optim.Adam(agent.parameters(), lr=learning_rate) for agent in self.agents
        ]

    def _build_network(self):
        """Return a fresh MLP mapping ``state_dim`` features to ``num_actions`` values."""
        return nn.Sequential(
            nn.Linear(self.state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, self.num_actions),
        )

    @staticmethod
    def _to_tensor(state):
        """Convert an observation to a flat float32 tensor."""
        return torch.as_tensor(np.asarray(state), dtype=torch.float32).flatten()

    def choose_action(self, state):
        """Return the action chosen by majority vote across all agents.

        Each agent samples one action from the softmax of its outputs.  When
        several actions receive the same number of votes, the lowest action
        index wins.

        Args:
            state: Current observation (any array-like of numbers).

        Returns:
            The winning action as a plain ``int``.
        """
        state_tensor = self._to_tensor(state)
        votes = []
        with torch.no_grad():
            for agent in self.agents:
                action_probs = torch.softmax(agent(state_tensor), dim=-1)
                votes.append(torch.multinomial(action_probs, 1).item())

        # bincount + argmax counts votes once and resolves ties to the lowest index.
        return int(np.bincount(votes).argmax())

    def train(self, episodes=1000):
        """Run ``episodes`` training episodes, updating every agent after each step.

        Args:
            episodes: Number of episodes to play.

        Returns:
            A list with the shaped total reward of each episode, in order.
        """
        episode_rewards = []
        for episode in range(episodes):
            state, _ = self.env.reset()
            episode_over = False
            total_reward = 0

            while not episode_over:
                action = self.choose_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                # An episode ends on either flag; ignoring ``truncated`` would
                # keep stepping an environment that has already finished.
                episode_over = terminated or truncated

                # Cooperative reward adjustment: only a real terminal state is
                # penalised, a truncation still counts as having survived.
                shaping = self.CRASH_PENALTY if terminated else self.SURVIVAL_BONUS
                cooperative_reward = reward + shaping

                # Update each agent.  ``terminated`` (not ``episode_over``) is
                # passed on so that a truncated transition still bootstraps
                # from the next state.
                for agent, optimizer in zip(self.agents, self.optimizers):
                    self._update_agent(
                        agent, optimizer, state, action,
                        cooperative_reward, next_state, terminated,
                    )

                state = next_state
                total_reward += cooperative_reward

            episode_rewards.append(total_reward)
            if episode % 50 == 0:
                print(f"Episode {episode}, Total Reward: {total_reward}")

        return episode_rewards

    def _update_agent(self, agent, optimizer, state, action, reward, next_state, done):
        """Apply one Q-learning gradient step to ``agent`` for a single transition.

        The target is ``reward + discount_factor * max_a Q(next_state, a)``,
        with the bootstrap term dropped when ``done`` is true.  The target is
        treated as a constant: no gradient flows through it.

        Args:
            agent: Network to update.
            optimizer: Optimizer bound to ``agent``'s parameters.
            state: Observation before the action.
            action: Index of the action that was executed.
            reward: Reward received for the transition.
            next_state: Observation after the action.
            done: Whether ``next_state`` is terminal (no bootstrapping).
        """
        current_q = agent(self._to_tensor(state))[action]

        # Build the target without autograd: this avoids constructing a graph
        # that is never used and avoids re-wrapping a tensor in torch.tensor().
        with torch.no_grad():
            if done:
                next_max_q = torch.zeros(())
            else:
                next_max_q = agent(self._to_tensor(next_state)).max()
            target_q = (float(reward) + self.discount_factor * next_max_q).to(current_q.dtype)

        loss = nn.functional.mse_loss(current_q, target_q)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def main():
    """Seed NumPy and PyTorch, train a three-agent ensemble, then close the env."""
    np.random.seed(42)
    torch.manual_seed(42)

    multi_agent_trainer = FlappyBirdMultiAgentRL(
        num_agents=3,
        learning_rate=0.001,
        discount_factor=0.99,
    )

    try:
        multi_agent_trainer.train(episodes=1000)
    finally:
        # Release the environment's resources even if training raises or is
        # interrupted.
        multi_agent_trainer.env.close()

# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()