## Auto Gen Tutorial
Note book written by John Adeojo
Founder, and Chief Data Scientist at [Data-centric Solutions](https://www.data-centric-solutions.com/)


In [51]:
import autogen
import yaml
import openai 
import os

script_dir = "C:/Users/johna/OneDrive/Documents/api_keys/"
index_path = "G:/My Drive/Data-Centric Solutions/07. Blog Posts/AutoGen/autogen_tutorial/indexes/"
configurations_path = "G:/My Drive/Data-Centric Solutions/07. Blog Posts/AutoGen/autogen_tutorial/"

config_list = autogen.config_list_from_json(
    env_or_file="configurations.json",
    file_location=configurations_path,
    filter_dict={
        "model": ["gpt-3.5-turbo-16k"],
    },
)

def get_apikey(script_dir=script_dir):

    script_dir = script_dir
    file_path = os.path.join(script_dir, "apikeys.yml")

    with open(file_path, 'r') as yamlfile:
        loaded_yamlfile = yaml.safe_load(yamlfile)
        API_KEY = loaded_yamlfile['openai']['api_key']

    return API_KEY

openai.api_key = get_apikey()

In [52]:
from typing import Any, List

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
import wikipedia

class WikipediaReader(BaseReader):
    """Wikipedia reader that reads a page and includes metadata."""

    def load_data(self, pages: List[str], lang: str = "en", **load_kwargs: Any) -> List[Document]:
        """Load data from Wikipedia.

        Args:
            pages (List[str]): List of pages to read.
            lang (str): Language of Wikipedia texts (default is English).
        """
        results = []
        for page in pages:
            wikipedia.set_lang(lang)
            wiki_page = wikipedia.page(page, **load_kwargs)
            page_content = wiki_page.content
            page_url = wiki_page.url
            # Create a Document with URL included in the metadata
            document = Document(text=page_content, metadata={'source_url': page_url})
            results.append(document)
        return results


In [53]:
# Tool 1: Does a query based search for Wikipages
import wikipedia
from llama_index import download_loader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import get_default_text_splitter
from llama_index import StorageContext
from llama_index import load_index_from_storage
import json

def load_index(filepath: str):
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=index_path)
    # load index
    return load_index_from_storage(storage_context)

def read_json_file(file_path: str) -> dict:
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data


def create_wikidocs(wikipage_requests):
    print(f"Preparing to Download:{wikipage_requests}")
    documents = []
    for page_title in wikipage_requests:
        try:
            # Attempt to load the Wikipedia page
            wiki_page = wikipedia.page(page_title)
            page_content = wiki_page.content
            page_url = wiki_page.url
            document = Document(text=page_content, metadata={'source_url': page_url})
            documents.append(document)
        except wikipedia.exceptions.PageError:
            # Handle the case where the page does not exist
            print(f"PageError: The page titled '{page_title}' does not exist on Wikipedia.")
        except wikipedia.exceptions.DisambiguationError as e:
            # Handle the case where the page title is ambiguous
            print(f"DisambiguationError: The page titled '{page_title}' is ambiguous. Possible options: {e.options}")
    print("Finished downloading pages")
    return documents


def index_wikipedia_pages(wikipage_requests):
    print(f"Preparing to index Wikipages: {wikipage_requests}")
    documents = create_wikidocs(wikipage_requests)
    text_splits = get_default_text_splitter(chunk_size=150, chunk_overlap=45)
    parser = SimpleNodeParser.from_defaults(text_splitter=text_splits)
    service_context = ServiceContext.from_defaults(node_parser=parser)
    index =  VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=False)
    index.storage_context.persist(index_path)
    print(f"{wikipage_requests} have been indexed.")
    return "indexed"

def search_and_index_wikipedia(
        hops: list, lang: str = "en", results_limit: int = 2
    ):

    # Set the language for Wikipedia
    wikipedia.set_lang(lang)

    # Initialize an empty list to hold all indexed page titles
    wikipage_requests = []

    # Loop through the identified hops and search for each
    for hop in hops:
        hop_pages = wikipedia.search(hop, results=results_limit)
        print(f"Searching Wikipedia for: {hop} - Found: {hop_pages}")
        wikipage_requests.extend(hop_pages)

    # Index the gathered pages (assuming 'index_wikipedia_pages' is a defined function that you implement)
    index_wikipedia_pages(wikipage_requests)

    return wikipage_requests


def query_wiki_index(hops: List[str], index_path: str = index_path, n_results: int = 5): 
    index = load_index(filepath=index_path)
    query_engine = index.as_query_engine(
        response_mode="compact", verbose=True, similarity_top_k=n_results
    )
    
    retrieved_context = {}
    
    # Iterate over each hop in the multihop query
    for hop in hops:
        nodes = query_engine.query(hop).source_nodes
        
        # Process each node found for the current hop
        for node in nodes:
            doc_id = node.node.id_
            doc_text = node.node.text
            doc_source = node.node.metadata.get('source_url', 'No source URL')  # Default value if source_url is not present.
            
            # Append to the list of texts and sources for each doc_id
            if doc_id not in retrieved_context:
                retrieved_context[doc_id] = {'texts': [doc_text], 'sources': [doc_source]}
            else:
                retrieved_context[doc_id]['texts'].append(doc_text)
                retrieved_context[doc_id]['sources'].append(doc_source)

    # Serialise the context for all hops into a JSON file
    file_path = index_path + "retrieved_context.json"
    with open(file_path, 'w') as f:
        json.dump(retrieved_context, f)
    
    return retrieved_context


In [54]:
# docs = create_wikidocs(["Paris"])

In [55]:
# docs

In [56]:
# test = query_wiki_index("First bank to default")
# test

In [57]:

llm_config = {
    "functions": [
        {
            "name": "search_and_index_wikipedia",
            "description": "Indexes Wikipedia pages based on specified queries for each hop to build a knowledge base for future reference. Use before query_wiki_index.",
            "parameters": {
                "type": "object",
                "properties": {
                    "hops": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
                        "description": "The search queries for identifying relevant Wikipedia pages to index, each corresponding to a hop in the multihop question.",
                    }
                },
                "required": ["hops"],
            },
        },
        {
            "name": "query_wiki_index",
            "description": "Queries the indexed Wikipedia knowledge base to retrieve pertinent information across multiple hops",
            "parameters": {
                "type": "object",
                "properties": {
                    "hops": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
                        "description": "The search queries to search the indexed Wikipedia knowledge base for relevant information, each corresponding to a hop in the multihop question.",
                    },
                },
                "required": ["hops"],
            },
        },
        # ...include other function definitions if needed...
    ],
    "config_list": config_list,  # Ensure that config_list is defined somewhere in your code.
    "request_timeout": 120,
    "seed": 90
}

# The llm_config_no_tools remains the same, excluding the 'functions' key.
llm_config_no_tools = {k: v for k, v in llm_config.items() if k != 'functions'}



In [58]:
import autogen 
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=5,
    # system_message= '''Use the tools available to respond to queries.''',
    # llm_config=llm_config,
    
)

assistant = autogen.AssistantAgent(
    name="assistant",
    system_message='''
    Upon receiving a query, promptly execute the `search_and_index_wikipedia` function to index the relevant Wikipedia pages. 
    After indexing is complete, utilize the `query_wiki_index` function to search through the indexed content and extract precise 
    information to answer the user's question. 
    Craft a comprehensive and precise response using this information, ensuring it adheres to the criteria of precision, depth, clarity, and proper citation.

    After drafting your response, present it followed by "PLEASE REVIEW" to notify the Moderator. 
    If the Moderator approves your response, present your final response and conclude with "TERMINATE" to signal the end of the workflow. 
    If the Moderator rejects your response, carefully review their feedback, make the necessary amendments, and resubmit the revised response with a new "PLEASE REVIEW" prompt.

    Response criteria:
    1. Precision: Directly address the user's question.
    2. Depth: Provide comprehensive information using indexed content.
    3. Citing: Include clear references from the indexed content.
    4. Clarity: Present information logically and coherently.

    ''',
    llm_config=llm_config,
    # human_input_mode="NEVER"
)

moderator = autogen.AssistantAgent(
    name="moderator",
    system_message='''
    Your role is to review the Assistant's responses for clarity, relevance, and factual accuracy. 
    Begin your assessment after the Assistant issues a "PLEASE REVIEW" signal. 
    Confirm that the Assistant's response aligns with the following criteria:

    1. Clarity: Information must be clear and understandable.
    2. Relevance: Responses must directly answer the user's question.
    3. Accuracy: Confirm that the information provided is factually correct based on the indexed sources.
    4. Citation: Verify that the Assistant has included citations from the indexed content.
    5. Coherence: Ensure the information is presented in a logical and coherent order.

    Upon review:
    - If the response meets all criteria, state "The response is approved," allowing the Assistant to proceed with the final response.
    - If the response does not meet the criteria, specify the shortcomings and instruct the Assistant to amend the response. 
    Do not generate new content; guide the Assistant to improve their initial response.

    Remember, your role is strictly to review and provide feedback. 
    The Assistant is responsible for generating and revising the responses to the original query. 

    ''',
    llm_config=llm_config_no_tools,
    # human_input_mode="NEVER"
)

user_proxy.register_function(
    function_map={
        "search_and_index_wikipedia": search_and_index_wikipedia,
        "query_wiki_index":query_wiki_index,
        # "generate_response":generate_response
    }
)

groupchat = autogen.GroupChat(agents=[user_proxy, assistant, moderator], messages=[], max_round=20)
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)
user_proxy.initiate_chat(
    manager, 
    message='''What data did silvergate bank fail and why?'''
                         )

[33muser_proxy[0m (to chat_manager):

What data did silvergate bank fail and why?

--------------------------------------------------------------------------------
[33massistant[0m (to chat_manager):

[32m***** Suggested function Call: search_and_index_wikipedia *****[0m
Arguments: 
{
  "hops": ["Silvergate_Bank"]
}
[32m***************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION search_and_index_wikipedia...[0m
Searching Wikipedia for: Silvergate_Bank - Found: ['Silvergate Bank', '2023 United States banking crisis']
Preparing to index Wikipages: ['Silvergate Bank', '2023 United States banking crisis']
Preparing to Download:['Silvergate Bank', '2023 United States banking crisis']
PageError: The page titled 'Silvergate Bank' does not exist on Wikipedia.
Finished downloading pages
['Silvergate Bank', '2023 United States banking crisis'] have been indexed.
[3