## AutoGen Tutorial
Notebook written by John Adeojo
Founder and Chief Data Scientist at [Data-centric Solutions](https://www.data-centric-solutions.com/)


In [48]:
import autogen
import yaml
import openai 
import os

# Local locations used by the notebook. Each one can be overridden with an
# environment variable so the notebook runs on another machine without
# editing the code; the fallbacks are the original author's paths.
script_dir = os.environ.get(
    "AUTOGEN_TUTORIAL_KEYS_DIR",
    "C:/Users/johna/OneDrive/Documents/api_keys/",
)
# os.path.join(..., "") guarantees a trailing separator, because other cells
# build file names by appending directly to index_path.
index_path = os.path.join(
    os.environ.get(
        "AUTOGEN_TUTORIAL_INDEX_DIR",
        "G:/My Drive/Data-Centric Solutions/07. Blog Posts/AutoGen/autogen_tutorial/indexes/",
    ),
    "",
)
configurations_path = os.environ.get(
    "AUTOGEN_TUTORIAL_CONFIG_DIR",
    "G:/My Drive/Data-Centric Solutions/07. Blog Posts/AutoGen/autogen_tutorial/",
)

# Load the model configurations from configurations.json in
# configurations_path, keeping only the entries for gpt-3.5-turbo-16k.
config_list = autogen.config_list_from_json(
    env_or_file="configurations.json",
    file_location=configurations_path,
    filter_dict={
        "model": ["gpt-3.5-turbo-16k"],
    },
)

def get_apikey(script_dir=script_dir):
    """Read the OpenAI API key from ``apikeys.yml`` inside ``script_dir``.

    The YAML document is indexed as ``['openai']['api_key']``.

    Args:
        script_dir: Directory that contains ``apikeys.yml``. Defaults to the
            module-level ``script_dir`` as it was when this function was
            defined.

    Returns:
        The value stored under ``openai`` -> ``api_key``.

    Raises:
        FileNotFoundError: If ``apikeys.yml`` does not exist in ``script_dir``.
        KeyError: If the ``openai`` or ``api_key`` entry is missing.
    """
    file_path = os.path.join(script_dir, "apikeys.yml")

    # Explicit encoding so the read does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as yamlfile:
        loaded_yamlfile = yaml.safe_load(yamlfile)

    # The lookup happens after the file is closed; only the parsed data is needed.
    return loaded_yamlfile["openai"]["api_key"]

# Give the openai module the key read from apikeys.yml (see get_apikey).
openai.api_key = get_apikey()

In [49]:
from typing import Any, List

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
import wikipedia

class WikipediaReader(BaseReader):
    """Wikipedia reader that reads a page and includes metadata."""

    def load_data(self, pages: List[str], lang: str = "en", **load_kwargs: Any) -> List[Document]:
        """Load data from Wikipedia.

        Args:
            pages (List[str]): List of pages to read.
            lang (str): Language of Wikipedia texts (default is English).
            **load_kwargs: Extra keyword arguments forwarded unchanged to
                ``wikipedia.page`` for every requested page.

        Returns:
            List[Document]: One document per requested page, in request order.
            Each holds the page content as text and the page URL under the
            ``source_url`` metadata key.
        """
        # The language is the same for every page, so set it once up front
        # instead of once per iteration.
        wikipedia.set_lang(lang)

        results = []
        for page in pages:
            wiki_page = wikipedia.page(page, **load_kwargs)
            # Keep the URL with the text so retrieved chunks can cite their source.
            results.append(
                Document(text=wiki_page.content, metadata={'source_url': wiki_page.url})
            )
        return results


In [50]:
# Tool 1: query-based search for Wikipedia pages
import wikipedia
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
from llama_index import StorageContext
from llama_index import load_index_from_storage
import json

def load_index(filepath: str):
    """Load a previously persisted index from disk.

    Args:
        filepath: Directory the index was persisted to.

    Returns:
        The index returned by ``load_index_from_storage``.
    """
    # Rebuild the storage context from the directory the caller asked for.
    # (Previously the argument was ignored and the global index_path was
    # always used.)
    storage_context = StorageContext.from_defaults(persist_dir=filepath)
    return load_index_from_storage(storage_context)

def read_json_file(file_path: str) -> dict:
    """Parse the JSON file at ``file_path`` and return its contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly rather than with the platform default
    # encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def create_wikidocs(wikipage_requests):
    """Download the requested Wikipedia pages as documents.

    Args:
        wikipage_requests: Page titles passed to ``WikipediaReader.load_data``.

    Returns:
        The list of documents produced by the reader.
    """
    print(f"Preparing to Download:{wikipage_requests}")
    # Uses the WikipediaReader class defined in this notebook.
    wiki_docs = WikipediaReader().load_data(pages=wikipage_requests)
    print("Finished downloading pages")
    return wiki_docs

def index_wikipedia_pages(wikipage_requests):
    """Download the given Wikipedia pages, index them, and persist the index.

    The pages are fetched with ``create_wikidocs``, split with a text splitter
    configured for ``chunk_size=150`` and ``chunk_overlap=45``, turned into a
    ``VectorStoreIndex``, and persisted to the module-level ``index_path``.

    Args:
        wikipage_requests: Page titles to download and index.

    Returns:
        The literal string ``"indexed"``.
    """
    print(f"Preparing to index Wikipages: {wikipage_requests}")
    wiki_docs = create_wikidocs(wikipage_requests)

    node_parser = SimpleNodeParser.from_defaults(
        text_splitter=get_default_text_splitter(chunk_size=150, chunk_overlap=45)
    )
    context = ServiceContext.from_defaults(node_parser=node_parser)

    wiki_index = VectorStoreIndex.from_documents(
        wiki_docs,
        service_context=context,
        show_progress=False,
    )
    wiki_index.storage_context.persist(index_path)

    print(f"{wikipage_requests} have been indexed.")
    return "indexed"

def search_and_index_wikipedia(
        hops: list, lang: str = "en", results_limit: int = 2
    ):
    """
    Searches for and indexes Wikipedia pages relevant to each hop of a multihop question.

    Args:
        hops (list): A list of strings, each representing a hop in the multihop question.
            A single string is accepted and treated as a one-element list.
        lang (str, optional): The language edition of Wikipedia to search. Defaults to "en".
        results_limit (int, optional): The limit for the number of results to return per hop. Defaults to 2.

    Returns:
        list: The Wikipedia page titles that were passed to the indexer, without
        duplicates, in the order they were first found.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(hops, str):
        hops = [hops]

    # NOTE: `lang` only affects the search below. index_wikipedia_pages ->
    # create_wikidocs calls the loader without a language, so the pages are
    # downloaded with the loader's default language.
    wikipedia.set_lang(lang)

    found_pages = []
    for hop in hops:
        hop_pages = wikipedia.search(hop, results=results_limit)
        print(f"Searching Wikipedia for: {hop} - Found: {hop_pages}")
        found_pages.extend(hop_pages)

    # Different hops can return the same title; drop repeats (keeping first-seen
    # order) so no page is downloaded and indexed more than once.
    indexed_pages = list(dict.fromkeys(found_pages))

    index_wikipedia_pages(indexed_pages)

    return indexed_pages

# def search_and_index_wikipedia(
#         query: str, lang: str = "en", results_limit: int = 2
#     ):
#     wikipedia.set_lang(lang)
#     wikipage_requests = wikipedia.search(query, results=results_limit)
#     print (f"I have found on Wikipedia: {wikipage_requests}")
#     index_wikipedia_pages(wikipage_requests)
#     return f"Finished indexing: {type(wikipage_requests)}"

import json

# def query_wiki_index(search_string: str, index_path: str =index_path, n_results: int = 10): 
#     index = load_index(filepath=index_path)
#     query_engine = index.as_query_engine(
#         response_mode="compact", verbose=True, similarity_top_k=n_results
#     )
#     nodes = query_engine.query(search_string).source_nodes

#     retrieved_context = {}
#     for node in nodes:
#         doc_id = node.node.id_
#         doc_text = node.node.text
#         doc_source = node.node.metadata.get('source_url', 'No source URL')  # Default value if source_url is not present.
#         retrieved_context[doc_id] = {'text': doc_text, 'source': doc_source}

#     file_path = index_path + "retrieved_context.json"
#     with open(file_path, 'w') as f:
#         json.dump(retrieved_context, f)
    
#     return retrieved_context

# def query_wiki_index(hops: List[str], index_path: str = index_path, n_results: int = 20): 
#     index = load_index(filepath=index_path)
#     query_engine = index.as_query_engine(
#         response_mode="compact", verbose=True, similarity_top_k=n_results
#     )
    
#     retrieved_context = {}
    
#     # Iterate over each hop in the multihop query
#     for hop in hops:
#         nodes = query_engine.query(hop).source_nodes

#         for node in nodes:
#             doc_id = node.node.id_
#             doc_text = node.node.text
#             doc_source = node.node.metadata.get('source_url', 'No source URL')  # Default value if source_url is not present.
            
#             # Instead of overwriting, append to the list of texts and sources for each doc_id
#             if doc_id not in retrieved_context:
#                 retrieved_context[doc_id] = {'texts': [], 'sources': []}
            
#             retrieved_context[doc_id]['texts'].append(doc_text)
#             retrieved_context[doc_id]['sources'].append(doc_source)

#     # Serialise the context for all hops into a JSON file
#     file_path = index_path + "retrieved_context.json"
#     with open(file_path, 'w') as f:
#         json.dump(retrieved_context, f)
    
#     return retrieved_context

def query_wiki_index(hops: List[str], index_path: str = index_path, n_results: int = 10): 
    """Query the persisted Wikipedia index once per hop and collect the source nodes.

    The collected context is also written to ``retrieved_context.json`` inside
    ``index_path``.

    Args:
        hops: Query strings, one per hop of the multihop question. A single
            string is accepted and treated as a one-element list.
        index_path: Directory holding the persisted index; the JSON file is
            written to the same directory. Defaults to the module-level
            ``index_path`` as it was when this function was defined.
        n_results: Value passed as ``similarity_top_k`` to the query engine.

    Returns:
        dict: Maps each node id to ``{'texts': [...], 'sources': [...]}``. A
        node returned for several hops contributes one entry per hop to both
        lists.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(hops, str):
        hops = [hops]

    index = load_index(filepath=index_path)
    query_engine = index.as_query_engine(
        response_mode="compact", verbose=True, similarity_top_k=n_results
    )

    retrieved_context = {}

    for hop in hops:
        nodes = query_engine.query(hop).source_nodes

        for node in nodes:
            doc_id = node.node.id_
            doc_text = node.node.text
            # Fall back to a placeholder when the node carries no source URL.
            doc_source = node.node.metadata.get('source_url', 'No source URL')

            entry = retrieved_context.setdefault(doc_id, {'texts': [], 'sources': []})
            entry['texts'].append(doc_text)
            entry['sources'].append(doc_source)

    # Serialise the context for all hops into a JSON file. os.path.join keeps
    # the path valid whether or not index_path ends with a separator.
    file_path = os.path.join(index_path, "retrieved_context.json")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(retrieved_context, f)

    return retrieved_context




In [51]:
# docs = create_wikidocs(["Paris"])

In [52]:
# docs

In [53]:
# test = query_wiki_index("First bank to default")
# test

In [54]:
# llm_config = {
#     "functions": [
#         {
#             "name": "search_and_index_wikipedia",
#             "description": "Indexes Wikipedia pages based on a specified query to build a knowledge base for future reference. Use before query_wiki_index.",
#             "parameters": {
#                 "type": "object",
#                 "properties": {
#                     "query": {
#                         "type": "string",
#                         "description": "The search query for identifying relevant Wikipedia pages to index.",
#                     }
#                 },
#                 "required": ["query"],
#             },
#         },
#         {
#             "name": "query_wiki_index",
#             "description": "Queries the indexed Wikipedia knowledge base to retrieve pertinent information",
#             "parameters": {
#                 "type": "object",
#                 "properties": {
#                     "search_string": {
#                         "type": "string",
#                         "description": "The query to search the indexed Wikipedia knowledge base for relevant information.",
#                     }
#                 },
#                 "required": ["search_string"],
#             },
#         },
#     ],
#     "config_list": config_list,
#     "request_timeout": 120,
#     "seed":70
# }

# llm_config_no_tools = {k: v for k, v in llm_config.items() if k != 'functions'}

# llm_config = {
#     "functions": [
#         {
#             "name": "search_and_index_wikipedia",
#             "description": "Indexes Wikipedia pages based on specified queries for each hop to build a knowledge base for future reference. Use before query_wiki_index.",
#             "parameters": {
#                 "type": "object",
#                 "properties": {
#                     "hops": {
#                         "type": "array",
#                         "items": {
#                             "type": "string"
#                         },
#                         "description": "The search queries for identifying relevant Wikipedia pages to index, each corresponding to a hop in the multihop question.",
#                     }
#                 },
#                 "required": ["hops"],
#             },
#         },
#         {
#             "name": "query_wiki_index",
#             "description": "Queries the indexed Wikipedia knowledge base to retrieve pertinent information",
#             "parameters": {
#                 "type": "object",
#                 "properties": {
#                     "search_string": {
#                         "type": "string",
#                         "description": "The query to search the indexed Wikipedia knowledge base for relevant information.",
#                     }
#                 },
#                 "required": ["search_string"],
#             },
#         },
#         # ...include other function definitions if needed...
#     ],
#     "config_list": config_list,  # Ensure that config_list is defined somewhere in your code.
#     "request_timeout": 120,
#     "seed": 77
# }

# # The llm_config_no_tools remains the same, excluding the 'functions' key.
# llm_config_no_tools = {k: v for k, v in llm_config.items() if k != 'functions'}

def _hops_parameters(description):
    """Build the parameter schema for a function taking one required `hops` string array."""
    return {
        "type": "object",
        "properties": {
            "hops": {
                "type": "array",
                "items": {
                    "type": "string"
                },
                "description": description,
            }
        },
        "required": ["hops"],
    }


# Configuration for agents that may propose function calls: the schemas of the
# two Wikipedia tools plus the model settings taken from config_list.
llm_config = {
    "functions": [
        {
            "name": "search_and_index_wikipedia",
            "description": "Indexes Wikipedia pages based on specified queries for each hop to build a knowledge base for future reference. Use before query_wiki_index.",
            "parameters": _hops_parameters(
                "The search queries for identifying relevant Wikipedia pages to index, each corresponding to a hop in the multihop question."
            ),
        },
        {
            "name": "query_wiki_index",
            "description": "Queries the indexed Wikipedia knowledge base to retrieve pertinent information across multiple hops",
            "parameters": _hops_parameters(
                "The search queries to search the indexed Wikipedia knowledge base for relevant information, each corresponding to a hop in the multihop question."
            ),
        },
    ],
    "config_list": config_list,
    "request_timeout": 120,
    "seed": 77
}

# Same settings with the 'functions' entry removed, for agents that should not
# be offered the tools.
llm_config_no_tools = {
    key: value for key, value in llm_config.items() if key != 'functions'
}



In [55]:
import autogen 
# Proxy for the user: never asks for human input, stops when a message ends
# with TERMINATE, and replies automatically at most five times in a row.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=5,
)

# The assistant is the only agent that is given the tool schemas.
assistant = autogen.AssistantAgent(
    name="assistant",
    system_message='''
    Begin by indexing the necessary Wikipedia pages using the `search_and_index_wikipedia` function,  
    Once the pages have been indexed, employ the `query_wiki_index` function to search through the indexed knowledge base. 
    Craft a search string that targets the specific information requested by the user. 
    This may involve a complex query that correlates the information from different indexed pages. 
    Present the retrieved information in a clear and concise manner, ensuring the final response is accurate and concludes with TERMINATE.
    ''',
    llm_config=llm_config,
)

# The moderator reviews the assistant's answer and is configured without tools.
moderator = autogen.AssistantAgent(
    name="moderator",
    system_message='''
    Scrutinize the Assistant's provided data with a fine-toothed comb to validate its currentness and relevance. 
    Verify the sources are genuine and assess whether the information is presented in a manner that is comprehensive and easily understood by the user. 
    If any discrepancies or gaps in data are detected, prompt the Assistant to refine its search or to extract more detailed information. 
    The end goal is to ensure the user is furnished with precise, up-to-date information that can withstand scrutiny. 
    Confirm the sources and provide a summary that encapsulates the crux of the query, linking back to the sources for further reading. 
    Only once all information is confirmed to be accurate and complete should you conclude with TERMINATE.
    ''',
    llm_config=llm_config_no_tools,
)

# Map the function names the assistant can propose to the Python callables
# registered on user_proxy.
user_proxy.register_function(
    function_map={
        "search_and_index_wikipedia": search_and_index_wikipedia,
        "query_wiki_index": query_wiki_index,
    }
)

groupchat = autogen.GroupChat(agents=[user_proxy, assistant, moderator], messages=[], max_round=20)
# The manager has no function registered on it, so it gets the configuration
# without the function schemas; otherwise the model behind the manager could
# answer with a function call of its own.
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config_no_tools)
user_proxy.initiate_chat(manager, message="Which author, born in the city that hosted the 1992 Summer Olympics, won the Nobel Prize in Literature in the same year as the first space shuttle flight?")

[33muser_proxy[0m (to chat_manager):

Which author, born in the city that hosted the 1992 Summer Olympics, won the Nobel Prize in Literature in the same year as the first space shuttle flight?

--------------------------------------------------------------------------------
[33massistant[0m (to chat_manager):

[32m***** Suggested function Call: search_and_index_wikipedia *****[0m
Arguments: 
{
"hops": [
"1992 Summer Olympics",
"List of Nobel laureates in Literature",
"Space Shuttle"
]
}
[32m***************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION search_and_index_wikipedia...[0m
Searching Wikipedia for: 1992 Summer Olympics - Found: ['1992 Summer Olympics', 'Bhutan at the 1992 Summer Olympics']
Searching Wikipedia for: List of Nobel laureates in Literature - Found: ['List of Nobel laureates in Literature', 'List of Muslim Nobel laureates']
Searching Wik

KeyboardInterrupt: 

In [None]:
# assistant = autogen.AssistantAgent(
#     name="assistant",
#     system_message='''
#     Complete the task using the information provided by the UserProxyAgent. 
#     Utilize the `search_and_index_wikipedia` function to index the relevant Wikipedia Knowledge Base, 
#     followed by the `query_wiki_index` function to retrieve pertinent information to aid in task completion. 
#     Deliver the final response and conclude with TERMINATE.
#     ''',
#     llm_config=llm_config,
#     # human_input_mode="NEVER"
# )

# moderator = autogen.AssistantAgent(
#     name="moderator",
#     system_message='''
#     Review the data returned from the executed functions. Confirm that the information provided is relevant 
#     to the user's query, accurate according to the source, and represents the most current data available. 
#     If the data is outdated, incomplete, or ambiguous, identify these issues and recommend that 
#     the `assistant` seek further verification or more recent sources. 
#     The goal is to ensure that the user receives information that is not only accurate but 
#     also contextually appropriate for their request.
#     Deliver the final response and conclude with TERMINATE.
#     ''',
#     llm_config=llm_config_no_tools,
#     # human_input_mode="NEVER"
# )