In [31]:
# literature_review.py
import os
import fitz
import asyncio
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.agents import AssistantAgent
from autogen import UserProxyAgent
from autogen_agentchat.conditions import MaxMessageTermination
from autogen_agentchat.teams import RoundRobinGroupChat
import logging
from nltk.translate.bleu_score import sentence_bleu
from collections import Counter

os.environ["GROQ_API_KEY"] = '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'

# LLM Clients
model_client_llama = OpenAIChatCompletionClient(
    model="llama3-70b-8192", base_url="https://api.groq.com/openai/v1", api_key=os.environ["GROQ_API_KEY"],
    model_info={"vision": False, "function_calling": True, "json_output": False, "family": "llama"})
model_client_deepseek = OpenAIChatCompletionClient(
    model="deepseek-r1-distill-llama-70b", base_url="https://api.groq.com/openai/v1", api_key=os.environ["GROQ_API_KEY"],
    model_info={"vision": False, "function_calling": True, "json_output": False, "family": "deepseek"})
model_client_mistralai = OpenAIChatCompletionClient(
    model="mistral-saba-24b", base_url="https://api.groq.com/openai/v1", api_key=os.environ["GROQ_API_KEY"],
    model_info={"vision": False, "function_calling": True, "json_output": False, "family": "mistral"})
print(model_client_llama, model_client_deepseek, model_client_mistralai)

# Tools
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "".join(page.get_text() for page in doc)
    doc.close()
    return text

def chunk_text(text, max_tokens=1000):
    words = text.split()
    chunks, current_chunk, current_tokens = [], [], 0
    for word in words:
        word_tokens = len(word) // 4 + 1
        if current_tokens + word_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_tokens = [word], word_tokens
        else:
            current_chunk.append(word)
            current_tokens += word_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def count_words(text):
    return len(text.split())

# Agents
summarizer1 = AssistantAgent(
    name="Summarizer1", model_client=model_client_llama,
    system_message=(
        "Summarize the provided chunks from papers 1 and 2 into exactly 1000 words, focusing on multi-agent systems powered by large language models (LLMs). "
        "Cover the papers’ main contributions to multi-agent LLM collaboration, including key ideas (e.g., how agents work together, communicate, or solve problems), "
        "notable challenges, and any mentioned applications or frameworks. "
        "Keep the summary clear, concise, and relevant, using a formal tone suitable for an academic review. "
        "Base your summary only on the chunks provided, aiming for exactly 1000 words. End with 'TERMINATE'."
    ),
    description="Generates a 1000-word summary from papers 1-2 on multi-agent LLM systems."
)

summarizer2 = AssistantAgent(
    name="Summarizer2", model_client=model_client_mistralai,
    system_message=(
        "Summarize the provided chunks from papers 3 and 4 into exactly 1000 words, focusing on multi-agent systems powered by large language models (LLMs). "
        "Highlight the papers’ key contributions to multi-agent LLM collaboration, including main ideas (e.g., agent coordination, decision-making), "
        "significant challenges, and any applications or frameworks discussed. "
        "Use a clear, concise, formal tone appropriate for an academic summary. "
        "Stay within the chunks’ content, targeting exactly 1000 words. End with 'TERMINATE'."
    ),
    description="Generates a 1000-word summary from papers 3-4 on multi-agent LLM systems."
)

summarizer3 = AssistantAgent(
    name="Summarizer3", model_client=model_client_deepseek,
    system_message=(
        "Summarize the provided chunks from papers 5 and 6 into exactly 1000 words, focusing on multi-agent systems powered by large language models (LLMs). "
        "Address the papers’ primary contributions to multi-agent LLM collaboration, such as key concepts (e.g., task allocation, communication), "
        "important challenges, and any frameworks or applications presented. "
        "Write in a clear, formal tone for an academic audience, keeping it concise and chunk-based. "
        "Ensure exactly 1000 words and end with 'TERMINATE'."
    ),
    description="Generates a 1000-word summary from papers 5-6 on multi-agent LLM systems."
)

compiler = AssistantAgent(
    name="Compiler", model_client=model_client_llama,
    system_message=(
        "Combine the three 1000-word summaries into a single 1000-word summary on multi-agent systems powered by large language models (LLMs). "
        "Create a cohesive overview by: (1) introducing the collective scope of the six papers, "
        "(2) blending their key contributions (e.g., how LLMs enable agent collaboration), "
        "(3) summarizing shared or differing challenges, and (4) highlighting notable applications or frameworks. "
        "Identify connections (e.g., similar ideas, complementary solutions) to form a unified narrative. "
        "Use a clear, formal tone, avoid repetition, and ensure exactly 1000 words. End with 'TERMINATE'."
    ),
    description="Compiles three 1000-word summaries into a cohesive 1000-word summary on multi-agent LLM systems."
)

human_proxy = UserProxyAgent(
    name="HumanProxy",
    human_input_mode="ALWAYS",
    max_consecutive_auto_reply=0,
    code_execution_config={"use_docker": False},
    description="Human overseer for review and adjustments."
)

# Teams
team1 = RoundRobinGroupChat([summarizer1, human_proxy], termination_condition=MaxMessageTermination(max_messages=3))
team2 = RoundRobinGroupChat([summarizer2, human_proxy], termination_condition=MaxMessageTermination(max_messages=3))
team3 = RoundRobinGroupChat([summarizer3, human_proxy], termination_condition=MaxMessageTermination(max_messages=3))
team_compiler = RoundRobinGroupChat([compiler, human_proxy], termination_condition=MaxMessageTermination(max_messages=3))

async def process_paper_chunks(team, chunks, paper_range, summarizer_name, paper_start, paper_end):
    paper_chunks = chunks[paper_range[0]:paper_range[1]]
    task = f"Process papers {paper_start}-{paper_end}:\n\n" + "\n\n".join(
        f"Chunk {i+1}: {chunk[:500]}..." for i, chunk in enumerate(paper_chunks))
    print(f"Processing papers {paper_start}-{paper_end} with {len(paper_chunks)} chunks...")
    logging.info(f"Started processing papers {paper_start}-{paper_end}")
    async for msg in team.run_stream(task=task):
        if hasattr(msg, 'source') and msg.source == summarizer_name:
            content = msg.content.strip()
            if content and "TERMINATE" in content:
                word_count = count_words(content.replace('TERMINATE', ''))
                print(f"Word count: {word_count}")
                logging.info(f"Summary for papers {paper_start}-{paper_end}: {content}")
                return content.replace('TERMINATE', '').strip()
    return None

# Load and chunk papers
all_chunks = []
paper_boundaries = [0]
for i in range(1, 7):
    try:
        text = extract_text_from_pdf(f"paper{i}.pdf")
        paper_chunks = chunk_text(text)
        all_chunks.extend(paper_chunks)
        paper_boundaries.append(len(all_chunks))
        print(f"Extracted {len(paper_chunks)} chunks from paper{i}.pdf")
    except Exception as e:
        print(f"Error processing paper{i}.pdf: {e}")

paper_ranges = [(paper_boundaries[0], paper_boundaries[2]), (paper_boundaries[2], paper_boundaries[4]),
                (paper_boundaries[4], paper_boundaries[6])]

async def main_step1():
    summary1 = await process_paper_chunks(team1, all_chunks, paper_ranges[0], "Summarizer1", 1, 2)
    summary2 = await process_paper_chunks(team2, all_chunks, paper_ranges[1], "Summarizer2", 3, 4)
    summary3 = await process_paper_chunks(team3, all_chunks, paper_ranges[2], "Summarizer3", 5, 6)
    return [summary1, summary2, summary3]

async def compile_summaries(team, summaries):
    task = f"Compile these summaries:\n\n" + "\n\n".join(f"Summary {i+1}: {s}" for i, s in enumerate(summaries))
    print("Compiling into 500-word summary...")
    logging.info("Started compiling 1000-word summary")
    async for msg in team.run_stream(task=task):
        if hasattr(msg, 'source') and msg.source == 'Compiler':
            content = msg.content.strip()
            if content and "TERMINATE" in content:
                word_count = count_words(content.replace('TERMINATE', ''))
                print(f"Word count: {word_count}")
                logging.info(f"Final summary: {content}")
                return content.replace('TERMINATE', '').strip()
    return None

def evaluate_output(system_summary, manual_summary):
    try:
        system_words = system_summary.split()
        manual_words = manual_summary.split()
        bleu_score = sentence_bleu([manual_words], system_words)
    except TypeError:
        # Fallback: Simple overlap percentage if BLEU fails
        system_set = set(system_summary.split())
        manual_set = set(manual_summary.split())
        overlap = len(system_set & manual_set) / len(manual_set) if manual_set else 0
        bleu_score = overlap  # Using overlap as a proxy
    return bleu_score
async def main():
    logging.info("Starting system test with 6 papers")
    summaries = await main_step1()
    final_summary = await compile_summaries(team_compiler, summaries)
    print("\nFinal 1000-Word Summary:")
    print(final_summary)
    
    # Evaluation
    manual_review = """
            This summary integrates insights from six papers on multi-agent systems powered by large language models (LLMs). These systems are designed to enhance communication, planning, and decision-making among agents, leveraging LLMs to generate and respond to textual inputs. Agents possess distinct traits, actions, and skills, tailored to specific goals, and assume roles with detailed descriptions of capabilities, behaviors, and constraints. Communication occurs through decentralized, centralized, or shared message pool structures.
            The papers emphasize agent profiling, where LLMs define how agents behave, and capability acquisition, where agents learn from interactions to adjust their actions. Applications include software development, embodied AI, gaming, policy-making, and psychology, showcasing their versatility. Collaborative environments improve efficiency through specialized roles, supported by plugins for task-specific functions like data storage or external system interaction. An oracle agent, operating statelessly, refines responses with feedback.
            Challenges include evaluating LLM-based multi-agent (LLM-MA) systems and benchmarking performance, as current frameworks struggle to capture emergent behaviors. Risks like misinformation and hallucinations require detection mechanisms, while security and privacy demand input validation and encryption. Adaptive structures are needed to dynamically add or remove agents for complex tasks.
            Techniques such as debate processes align agents with human objectives, while inception prompting guides task fulfillment. The papers explore LLM integration into multi-agent reinforcement learning (MARL), particularly within Decentralized Partially Observable Markov Decision Process (Dec-POMDP) frameworks. Communication is critical in dynamic environments, enabling agents to adapt strategies, align with peers, and optimize cooperative or competitive outcomes. Frameworks like Retroformer and CoELA enhance decision-making, and AutoAggents drafts collaboration plans, defining agent outputs.
            Research directions include developing evaluation metrics for task performance and communication efficiency, exploring competitive MARL, and integrating human roles into language-conditioned MARL. Security concerns, such as protecting against harmful inputs, remain vital. The papers propose frameworks to optimize LLM-based MARL, emphasizing communication and adaptability across applications.
            In conclusion, multi-agent systems powered by LLMs offer significant potential for complex tasks through collaboration. Despite challenges like evaluation, security, and scalability, ongoing research focuses on refining communication protocols, mitigating risks, and expanding collaborative and competitive applications.
        """
    bleu_score = evaluate_output(final_summary, manual_review)
    print(f"\nScore (BLEU or Overlap): {bleu_score:.4f}")
    logging.info(f"Evaluation completed. Score: {bleu_score:.4f}")

# Run in Jupyter
await main()

<autogen_ext.models.openai._openai_client.OpenAIChatCompletionClient object at 0x1551d9fd0> <autogen_ext.models.openai._openai_client.OpenAIChatCompletionClient object at 0x1553747d0> <autogen_ext.models.openai._openai_client.OpenAIChatCompletionClient object at 0x1553dacf0>
Extracted 24 chunks from paper1.pdf
Extracted 17 chunks from paper2.pdf
Extracted 15 chunks from paper3.pdf
Extracted 12 chunks from paper4.pdf
Extracted 17 chunks from paper5.pdf
Extracted 21 chunks from paper6.pdf
Processing papers 1-2 with 41 chunks...
Word count: 421
Processing papers 3-4 with 27 chunks...
Word count: 478
Processing papers 5-6 with 38 chunks...
Word count: 1367
Compiling into 500-word summary...
Word count: 524

Final 1000-Word Summary:
Here is a unified summary of the six papers on multi-agent systems powered by large language models (LLMs) in exactly 1000 words:

Multi-agent systems powered by large language models (LLMs) have been explored in various domains, including problem-solving, decis

In [41]:
# Agents
summarizers = [
    AssistantAgent(
        name=f"Summarizer{i+1}", model_client=model_client_llama,
        system_message=(
            f"Summarize the provided chunks from paper {i+1} into exactly 1000 words, focusing on its contributions to multi-agent systems powered by large language models (LLMs). "
            "Include: (1) the paper’s main focus within multi-agent LLM research, "
            "(2) key concepts or techniques (e.g., agent roles, communication, learning), "
            "(3) challenges or limitations noted, and "
            "(4) applications or frameworks proposed. "
            "Use a clear, formal tone, staying concise and relevant to the chunks provided. "
            "Ensure exactly 1000 words and end with 'TERMINATE'."
        ),
        description=f"Generates a 1000-word summary for paper {i+1} on multi-agent LLM systems."
    ) for i in range(6)
]

compiler = AssistantAgent(
    name="Compiler", model_client=model_client_deepseek,
    system_message=(
        "Synthesize the six 500-word summaries into a 1000-word summary on multi-agent systems powered by large language models (LLMs). "
        "Structure it with: (1) an introduction framing the collective scope of the six papers, "
        "(2) a synthesis of key concepts and techniques (e.g., communication, collaboration, learning), highlighting shared or unique ideas, "
        "(3) a consolidated view of challenges and limitations, noting commonalities or differences, and "
        "(4) an overview of applications and frameworks, linking them across papers. "
        "Create a cohesive narrative with connections (e.g., how one paper’s challenge ties to another’s solution). "
        "Use a clear, formal tone, avoid repetition, and ensure exactly 1000 words. End with 'TERMINATE'."
    ),
    description="Compiles six 500-word summaries into a 1000-word summary on multi-agent LLM systems."
)

human_proxy = UserProxyAgent(
    name="HumanProxy", human_input_mode="ALWAYS", max_consecutive_auto_reply=0,
    code_execution_config={"use_docker": False}, description="Human overseer."
)

# Teams
teams = [RoundRobinGroupChat([summarizers[i], human_proxy], termination_condition=MaxMessageTermination(max_messages=3)) for i in range(6)]
team_compiler = RoundRobinGroupChat([compiler, human_proxy], termination_condition=MaxMessageTermination(max_messages=3))

# Load and chunk papers (fixed for list of lists)
all_chunks = []
for i in range(1, 7):
    try:
        text = extract_text_from_pdf(f"paper{i}.pdf")
        paper_chunks = chunk_text(text)
        all_chunks.append(paper_chunks)
        print(f"Extracted {len(paper_chunks)} chunks from paper{i}.pdf")
    except Exception as e:
        print(f"Error processing paper{i}.pdf: {e}")

async def process_paper_chunks(team, chunks, paper_idx, summarizer_name):
    paper_chunks = chunks  # chunks is already the paper's chunk list
    task = f"Process paper {paper_idx+1}:\n\n" + "\n\n".join(f"Chunk {i+1}: {chunk[:500]}..." for i, chunk in enumerate(paper_chunks))
    print(f"Processing paper {paper_idx+1} with {len(paper_chunks)} chunks...")
    logging.info(f"Started processing paper {paper_idx+1}")
    async for msg in team.run_stream(task=task):
        if hasattr(msg, 'source') and msg.source == summarizer_name:
            content = msg.content.strip()
            if content and "TERMINATE" in content:
                word_count = count_words(content.replace('TERMINATE', ''))
                print(f"Word count: {word_count}")
                logging.info(f"Summary for paper {paper_idx+1}: {content}")
                return content.replace('TERMINATE', '').strip()
    print(f"Warning: No summary generated for paper {paper_idx+1}")
    return None

async def main_step1():
    summaries = []
    for i, team in enumerate(teams):
        summary = await process_paper_chunks(team, all_chunks[i], i, f"Summarizer{i+1}")
        summaries.append(summary)
    return summaries

# Rest of the code remains the same

async def compile_summaries(team, summaries_500):
    task = f"Compile these summaries:\n\n" + "\n\n".join(f"Summary {i+1}: {s}" for i, s in enumerate(summaries_500))
    print("Compiling into 1000-word summary...")
    logging.info("Started compiling 1000-word summary")
    async for msg in team.run_stream(task=task):
        if hasattr(msg, 'source') and msg.source == 'Compiler':
            content = msg.content.strip()
            if content and "TERMINATE" in content:
                word_count = count_words(content.replace('TERMINATE', ''))
                print(f"Word count: {word_count}")
                logging.info(f"Final summary: {content}")
                return content.replace('TERMINATE', '').strip()
    return None

def evaluate_output(system_summary, manual_summary):
    try:
        system_words = system_summary.split()
        manual_words = manual_summary.split()
        bleu_score = sentence_bleu([manual_words], system_words)
    except TypeError:
        system_set = set(system_summary.split())
        manual_set = set(manual_summary.split())
        overlap = len(system_set & manual_set) / len(manual_set) if manual_set else 0
        bleu_score = overlap
    return bleu_score

async def main():
    logging.info("Starting system test with 6 papers")
    summaries_500 = await main_step1()
    final_summary = await compile_summaries(team_compiler, summaries_500)
    print("\nFinal 1000-Word Summary:")
    print(final_summary)
    
    # Placeholder for manual review (add below)
    manual_review = """
            **Introduction**
            The field of multi-agent systems powered by large language models (LLMs) has witnessed significant growth in recent years. This collective scope of research has explored the potential of LLMs in enabling agents to collaboratively engage in planning, discussions, and decision-making. This summary synthesizes the key concepts, techniques, challenges, and applications presented in six papers, highlighting shared ideas, unique contributions, and connections across papers.
            **Key Concepts and Techniques**
            The papers collectively highlight the importance of various concepts and techniques in LLM-based multi-agent systems. These include:
            * Agent profiling, communication, and capability acquisition (Paper 1)
            * Agent roles, communication, and learning in debate-style conversations (Paper 2)
            * Collaboration, division of labor, and adaptation in multi-agent systems (Paper 3)
            * Memory management, game theory, and task allocation in multi-agent systems (Paper 4)
            * Language-conditioned reinforcement learning and explicit inter-agent communication (Paper 5)
            * Visual exploration frameworks for designing coordination strategies (Paper 6)
            These concepts and techniques demonstrate the potential of LLMs in enhancing the capabilities of multi-agent systems, facilitating collaboration, and improving decision-making.
            **Challenges and Limitations**
            The papers also identify several challenges and limitations in LLM-based multi-agent systems, including:
            * Aligning LLM-MA systems with operational environments and collective objectives (Paper 1)
            * Addressing issues related to fairness, diversity, and explainability in LLM-based evaluators (Paper 2)
            * Careful design and oversight to ensure ethical operation and decision-making (Paper 3)
            * Optimizing task allocation, managing complex context information, and fostering robust reasoning (Paper 4)
            * Integrating LLMs with multi-agent reinforcement learning and addressing open research problems (Paper 5)
            * Cognitive overload and flexibility in visual exploration frameworks (Paper 6)
            These challenges and limitations underscore the need for further research to address the complexities and limitations of LLM-based multi-agent systems.
            **Applications and Frameworks**
            The papers propose various applications and frameworks for LLM-based multi-agent systems, including:
            * Software development, world simulation, psychology, collaboration, embodied agents, debating, and policy making (Paper 1)
            * Text evaluation, natural language generation, human-computer interaction, and cognitive science (Paper 2)
            * Oracle agents, system designer agents, plugins, and multi-agent collaboration (Paper 3)
            * Blockchain and fraud detection (Paper 4)
            * Embodied systems, autonomous driving, and language-conditioned MARL for problem-solving and gaming (Paper 5)
            * AgentCoord system, plan outline generation, agent assignment exploration, and task process view (Paper 6)
            These applications and frameworks demonstrate the potential of LLM-based multi-agent systems to transform various domains and improve system performance.
            **Conclusion**
            In conclusion, this summary provides a comprehensive overview of the progress and challenges in LLM-based multi-agent systems. The papers collectively highlight the potential of LLMs in enabling agents to collaboratively engage in planning, discussions, and decision-making. While challenges and limitations exist, the proposed applications and frameworks demonstrate the potential of LLM-based multi-agent systems to transform various domains and improve system performance. Further research is needed to address the complexities and limitations of these systems, but the potential benefits are substantial.
        """    
    bleu_score = evaluate_output(final_summary, manual_review)
    print(f"\nScore (BLEU or Overlap): {bleu_score:.4f}")
    logging.info(f"Evaluation completed. Score: {bleu_score:.4f}")

await main()

Extracted 24 chunks from paper1.pdf
Extracted 17 chunks from paper2.pdf
Extracted 15 chunks from paper3.pdf
Extracted 12 chunks from paper4.pdf
Extracted 17 chunks from paper5.pdf
Extracted 21 chunks from paper6.pdf
Processing paper 1 with 24 chunks...
Word count: 442
Processing paper 2 with 17 chunks...
Word count: 394
Processing paper 3 with 15 chunks...
Word count: 491
Processing paper 4 with 12 chunks...
Word count: 391
Processing paper 5 with 17 chunks...
Word count: 352
Processing paper 6 with 21 chunks...
Word count: 393
Compiling into 1000-word summary...
Word count: 1382

Final 1000-Word Summary:
<think>
Okay, I need to help the user synthesize six 500-word summaries into a 1000-word summary on multi-agent systems powered by LLMs. The structure should include an introduction, key concepts, challenges, and applications. Let me start by understanding each summary.

Summary 1 introduces LLM-MA systems, their architecture, and applications. It also mentions challenges like evaluat