# Nanonets LlamaIndex Guide

Refer to the blog post and follow along with its tutorial section for easier comprehension - https://nanonets.com/blog/llamaindex

In [1]:
import os
import openai 

# Prefer a key that is already exported in the environment so a real secret never
# has to be pasted into the notebook. Only when no key is exported does the
# placeholder below get used; replace it before making any OpenAI call.
os.environ.setdefault("OPENAI_API_KEY", "your_api_key_here")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from llama_index import SimpleDirectoryReader

# Load the BCG sustainability report; the reader is given an explicit file list
# rather than a directory to scan.
report_files = ["bcg-2022-annual-sustainability-report-apr-2023.pdf"]
reader = SimpleDirectoryReader(input_files=report_files)

pdf_documents = reader.load_data()

In [3]:
from llama_index import download_loader

# Resolve the WikipediaReader loader class by name, then load one page per country.
WikipediaReader = download_loader("WikipediaReader")

country_pages = ['Iceland Country', 'Kenya Country', 'Cambodia Country']
loader = WikipediaReader()
wikipedia_documents = loader.load_data(pages=country_pages)

In [4]:
from llama_index.node_parser import SimpleNodeParser

# A single parser configuration is applied to both corpora so their nodes are
# produced with the same chunk size and overlap.
parser = SimpleNodeParser.from_defaults(chunk_overlap=20, chunk_size=1024)

pdf_nodes, wikipedia_nodes = (
    parser.get_nodes_from_documents(docs)
    for docs in (pdf_documents, wikipedia_documents)
)

In [7]:
from llama_index import VectorStoreIndex
# Build a vector index directly from the already-parsed PDF nodes.
index = VectorStoreIndex(nodes=pdf_nodes)

In [8]:
from llama_index.llms import OpenAI
from llama_index import ServiceContext, OpenAIEmbedding
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.response_synthesizers import ResponseMode, get_response_synthesizer

# LLM and service context intended for writing the per-document summaries.
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size=1024)

# Synthesizer that produces each summary with the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)

# Pass the service context and synthesizer to the index. Previously they were
# created but never handed over, so the index was built with library defaults
# and the configuration above had no effect.
doc_summary_index = DocumentSummaryIndex(
    wikipedia_nodes,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
)

current doc id: e448f564-cd99-4b5b-91a6-b0da581a2ca1
current doc id: ebcc0033-013c-4848-a58f-ea33a61d5034
current doc id: 325e14fa-8bfa-4164-8393-d28d699a3dae


In [9]:
# Write the index's storage context to the "BCG Report" directory.
index.storage_context.persist(persist_dir="BCG Report")

In [10]:
# Turn the index into a query engine, then ask a single question against it.
query_engine = index.as_query_engine()

query = 'in what context is Morocco mentioned in the report?'
response = query_engine.query(query)
print(response)


Morocco is mentioned in the report in the context of the government's efforts to expand the social safety net and improve health access. BCG's teams provided support to integrate a portion of Morocco's most vulnerable citizens into the universal health care scheme, which was completed in a matter of months. As of December 1, 2022, more than 90% of Morocco's people have access to universal health care, up from 42% just months before. BCG's team also worked with the government to model scenarios for expanding child support to vulnerable families, assessing options to extend the country's pension scheme and unemployment benefits, and instituting other reforms.


In [11]:
# Reuse the same query engine for a list-style question.
disease_query = 'List measures taken to address diseases occuring in developing industries'
response = query_engine.query(disease_query)
print(response)


1. Providing access to innovative medicines for people living in lower-income countries
2. Optimizing the supply chain for the long term
3. Deploying a global health team to the region
4. Leveraging existing registrations as much as possible
5. Conducting a regulatory analysis to initiate drug approval processes on time
6. Translating the strategy into tangible functional and cross-functional plans


In [12]:
import nest_asyncio
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index import ServiceContext

# Patch asyncio before any of the async-enabled engines below are used.
nest_asyncio.apply()

# LlamaDebugHandler prints the trace of the sub questions captured by the
# SUB_QUESTION callback event type once each trace ends.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# Service context whose only customisation is the debugging callback manager.
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

In [13]:
# Index the Wikipedia documents first, then expose that index as a query engine.
countries_index = VectorStoreIndex.from_documents(
    wikipedia_documents, service_context=service_context, use_async=True
)
vector_query_engine = countries_index.as_query_engine()

**********
Trace: index_construction
    |_node_parsing ->  1.678123 seconds
      |_chunking ->  0.568942 seconds
      |_chunking ->  0.470651 seconds
      |_chunking ->  0.63413 seconds
    |_embedding ->  4.125725 seconds
    |_embedding ->  4.095924 seconds
**********


In [14]:
# Wrap the Wikipedia query engine as a named tool.
countries_tool = QueryEngineTool(
    query_engine=vector_query_engine,
    metadata=ToolMetadata(
        name="countries",
        description="Wikipedia pages about the countries - Iceland, Kenya, Cambodia.",
    ),
)
query_engine_tools = [countries_tool]

# The sub-question engine is given the tool list and the debugging service context.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
    service_context=service_context,
)

In [15]:
# A comparative question; the recorded output below shows it being split into
# three sub questions.
similarities_query = "Give me all similaries between Iceland, Kenya and Cambodia"
response = query_engine.query(similarities_query)

Generated 3 sub questions.
[36;1m[1;3m[countries] Q: What are the similarities between Iceland and Kenya
[0m[33;1m[1;3m[countries] Q: What are the similarities between Iceland and Cambodia
[0m[38;5;200m[1;3m[countries] Q: What are the similarities between Kenya and Cambodia
[0m[38;5;200m[1;3m[countries] A: 
Both Kenya and Cambodia have a rich cultural heritage, with popular music and art figures. Both countries have a staple grain of rice in their cuisine, and both have distinct local street foods. Both countries have a strong tea culture, and both have a variety of industrial and microbreweries. Both countries have armed forces that are regularly deployed in peacekeeping missions, and both have 47 semi-autonomous counties. Finally, both countries have laws that criminalize homosexual acts and have a majority of citizens who do not accept homosexuality.
[0m[36;1m[1;3m[countries] A: 
Both Iceland and Kenya have a strong sense of community and lack of social isolation, with

In [16]:
# Show the combined answer produced by the sub-question engine.
print(response)


All three countries have a strong sense of community and lack of social isolation, with high levels of social cohesion attributed to the small size and homogeneity of the population. All three countries also have a strong work ethic, with Icelanders working some of the longest hours of any industrialised nation, Kenyans known for their hardworking nature, and Cambodians having a reputation for hard work and resilience. Additionally, all three countries have high levels of gender equality, with Iceland consistently ranked among the top three countries in the world for women to live in, Kenya having made significant progress in recent years in terms of women's rights, and Cambodia having made significant progress in closing the gender gap in recent years. Finally, all three countries have a liberal attitude towards LGBT rights, with Iceland having legalised same-sex marriages in 2010, Kenya having decriminalised same-sex relationships in 2019, and Cambodia having made significant progre

In [17]:
from typing import List
from pydantic import BaseModel, Field


class CountryInfo(BaseModel):
    """Data model for getting structured data about countries"""

    # Field names are kept unchanged because they are part of the model's schema
    # and are repeated verbatim in the query prompt. The descriptions are added
    # so the schema states the intended meaning of each field.
    name: str = Field(..., description="Common name of the country")
    official_languages: List[str] = Field(
        ..., description="Official languages of the country"
    )
    # "counties" in this field name is a misspelling of "countries"; the
    # description makes the intended meaning explicit without renaming the field.
    neighbouring_counties: List[str] = Field(
        ..., description="Countries that share a border with this country"
    )
    form_of_government: str = Field(
        ..., description="Form of government of the country"
    )
    size_in_square_kilometers: str = Field(
        ..., description="Total area of the country in square kilometres"
    )

In [18]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI

# Rebuild `index` over the Wikipedia documents with a dedicated LLM configuration.
# Note that this rebinds the `index` name used earlier for the PDF report.
llm = OpenAI(temperature=0.7, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=llm)

index = VectorStoreIndex.from_documents(
    documents=wikipedia_documents,
    service_context=service_context,
)

In [19]:
# Query engine asked to shape its answers as CountryInfo.
query_engine = index.as_query_engine(response_mode="compact", output_cls=CountryInfo)

In [23]:
# Fixed parts of the prompt; the country name is inserted between them.
prefix = 'Give me info about this country - '
suffix = '''On the following points - 
name
official_languages
neighbouring_counties
form_of_government
size_in_square_kilometers
'''

# One response per country, keyed by country name.
responses = {}

for country in ['Iceland', 'Kenya', 'Cambodia']:
    # Start the suffix on its own line. Plain concatenation used to glue it to
    # the country name ("IcelandOn the following points - ...").
    responses[country] = query_engine.query(f"{prefix}{country}\n{suffix}")

In [24]:
# Show the response collected for Iceland.
print(responses['Iceland'])

Name: Iceland
Official Languages: Icelandic
Neighboring Countries: None (located between the North Atlantic and Arctic Oceans)
Form of Government: Republic
Size in Square Kilometers: 103,000 km2


In [25]:
# Show the response collected for Cambodia.
print(responses['Cambodia'])

Name: Cambodia
Official Languages: Khmer
Neighboring Countries: Thailand, Laos, Vietnam
Form of Government: Constitutional monarchy
Size in Square Kilometers: 181,035 square kilometers


In [26]:
from llama_index.output_parsers import GuardrailsOutputParser
from llama_index.llm_predictor import StructuredLLMPredictor

In [27]:
# Predictor later passed to the query engine as `llm_predictor`.
llm_predictor = StructuredLLMPredictor()

from llama_index.prompts import Prompt
from llama_index.prompts.default_prompts import (
    DEFAULT_TEXT_QA_PROMPT_TMPL,
    DEFAULT_REFINE_PROMPT_TMPL,
)

# RAIL specification describing a list named "points" whose items are objects
# with three one-line string fields, followed by the prompt template.
# NOTE(review): `rail_spec` is not referenced by any later cell visible in this
# notebook, and no GuardrailsOutputParser is constructed from it — confirm
# whether this cell is still needed.
rail_spec = """
<rail version="0.1">

<output>
    <list name="points" description="Bullet points regarding measures taken to address a problem">
        <object>
            <string name="explanation" format="one-line" on-fail-one-line="noop" />
            <string name="explanation2" format="one-line" on-fail-one-line="noop" />
            <string name="explanation3" format="one-line" on-fail-one-line="noop" />
        </object>
    </list>
</output>

<prompt>

Query string here.

@xml_prefix_prompt

{output_schema}

@json_suffix_prompt_v2_wo_none
</prompt>
</rail>
"""

In [28]:
from llama_index.output_parsers import LangchainOutputParser
from llama_index.llm_predictor import StructuredLLMPredictor
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [29]:
# Predictor later passed to the query engine as `llm_predictor`.
llm_predictor = StructuredLLMPredictor()

from llama_index.prompts import Prompt
from llama_index.prompts.default_prompts import (
    DEFAULT_TEXT_QA_PROMPT_TMPL,
    DEFAULT_REFINE_PROMPT_TMPL,
)

# (name, description) pairs for the three fields requested in every answer.
schema_fields = [
    ("Time", "Time of occurence of event"),
    ("Place", "Place of occurence of event"),
    ("Description", "Event Description"),
]

response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in schema_fields
]

In [30]:
# Build the LangChain structured parser from the schemas, then wrap it in the
# LlamaIndex adapter.
lc_output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser = LangchainOutputParser(lc_output_parser)

In [31]:
# Run both default templates through the output parser's `format` method.
fmt_qa_tmpl, fmt_refine_tmpl = (
    output_parser.format(template)
    for template in (DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL)
)

# Wrap each formatted template in a Prompt that carries the same output parser.
qa_prompt, refine_prompt = (
    Prompt(template, output_parser=output_parser)
    for template in (fmt_qa_tmpl, fmt_refine_tmpl)
)

In [32]:
# Display the QA template after the output parser has formatted it.
print(fmt_qa_tmpl)

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the question: {query_str}


The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{{
	"Time": string  // Time of occurence of event
	"Place": string  // Place of occurence of event
	"Description": string  // Event Description
}}
```


In [33]:
# Query engine wired to the structured predictor and the two parser-aware prompts.
query_engine = index.as_query_engine(
    llm_predictor=llm_predictor,
    text_qa_template=qa_prompt,
    refine_template=refine_prompt,
)

In [34]:
# Ask a factual question and print the answer.
first_president_query = "Who was the first president of Iceland?"
response = query_engine.query(first_president_query)
print(response)

```json
{
	"Time": "17 June 1944",
	"Place": "Iceland",
	"Description": "Sveinn Björnsson became the first president of Iceland."
}
```


In [38]:
# Ask an open-ended question through the same structured query engine.
cambodia_event_query = "Describe the most important event in the history of Cambodia in the 21st century."
response = query_engine.query(cambodia_event_query)
print(response)

```json
{
	"Time": "2018",
	"Place": "Cambodia",
	"Description": "The ruling Cambodian People's Party enacted tighter curbs on mass media and dissolved the opposition party Cambodia National Rescue Party ahead of the 2018 Cambodian general election. The CPP won every seat in the National Assembly without major opposition, effectively solidifying de facto one-party rule in the country."
}
```


In [39]:
# Point `index` back at the PDF report nodes and open a chat engine on it.
index = VectorStoreIndex(nodes=pdf_nodes)
chat_engine = index.as_chat_engine()

opening_question = "is Morocco mentioned in the report?"
response = chat_engine.chat(opening_question)
print(response)

Yes, Morocco is mentioned in the report.


In [40]:
# Follow-up sent to the same chat engine; "it" refers to the previous turn.
follow_up_question = "can you tell me the context in which it is mentioned?"
response = chat_engine.chat(follow_up_question)
print(response)

Morocco is mentioned in the report in the context of a social reform project to expand the social safety net and improve health access. BCG teams provided support to integrate a portion of Morocco’s most vulnerable citizens into the universal health care scheme, which was completed in a matter of months. As of December 1, 2022, more than 90% of Morocco’s people have access to universal health care, up from 42% just months before. This has enabled millions of vulnerable families to benefit from significant health access improvements.


In [41]:
# Third turn of the same conversation.
statistic_question = "what statistic best conveys the positive impact here?"
response = chat_engine.chat(statistic_question)
print(response)

90% of Morocco's population now have access to universal health care.


In [5]:
import os

import openai

# Read the key from the environment so that no secret is stored in the notebook.
# NOTE: a key that was previously committed in this cell must be treated as
# compromised and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

from llama_index.agent import OpenAIAgent
from llama_hub.tools.code_interpreter.base import CodeInterpreterToolSpec

# Expose the code interpreter tool spec as a tool list for the agent.
code_spec = CodeInterpreterToolSpec()

tools = code_spec.to_tool_list()
agent = OpenAIAgent.from_tools(tools, verbose=True)

In [6]:
# Opening message to the agent; the reply is printed directly.
help_request = "Can you help me write some python code to pass to the code_interpreter tool"
print(agent.chat(help_request))

Of course! I'd be happy to help you write some Python code. What specific task or problem are you trying to solve?


In [8]:
# Ask the agent to inspect spotify.csv with the code interpreter tool.
# NOTE(review): in the recorded output below the generated code ends with a bare
# `columns` expression instead of a print call, and the captured StdOut is empty.
print(
    agent.chat(
        """There is a spotify.csv file in the current directory (relative path).
                 Can you write and execute code to tell me columns does it have?"""
    )
)

=== Calling Function ===
Calling function: code_interpreter with args: {
  "code": "import pandas as pd\n\n# Read the Spotify file\nspotify_data = pd.read_csv('spotify.csv')\n\n# Get the column names\ncolumns = spotify_data.columns.tolist()\n\ncolumns"
}
Got output: StdOut:
b''
StdErr:
b''
It seems that there was no output or error returned from executing the code. This could mean that the file 'spotify.csv' does not exist in the current directory or there might be an issue with the code. 

Please make sure that the 'spotify.csv' file is in the correct location and try again. If the issue persists, please let me know and I'll be happy to assist you further.


In [232]:
# Ask the agent to plot two columns and save the figure to output.png.
print(agent.chat("Can you plot the loudness vs speechiness graph and save it in a output.png file?"))

=== Calling Function ===
Calling function: code_interpreter with args: {
  "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Read the CSV file\ndf = pd.read_csv('spotify.csv')\n\n# Plot the loudness vs speechiness graph\nplt.scatter(df['loudness'], df['speechiness'])\nplt.xlabel('Loudness')\nplt.ylabel('Speechiness')\nplt.title('Loudness vs Speechiness')\n\n# Save the plot to a file\nplt.savefig('output.png')\nplt.close()"
}
Got output: StdOut:
b''
StdErr:
b''
I have plotted the Loudness vs Speechiness graph and saved it as "output.png" in the current directory. You can download the image file [here](sandbox:/output.png).

Let me know if there's anything else I can assist you with!


In [234]:
# Ask the agent for an aggregate over the same CSV file.
print(agent.chat("Can you give top 5 artists with at least 10 songs with highest average energy?"))

=== Calling Function ===
Calling function: code_interpreter with args: {
  "code": "import pandas as pd\n\n# Read the CSV file\ndf = pd.read_csv('spotify.csv')\n\n# Group by artist and count the number of songs\nartist_counts = df.groupby('artist').size()\n\n# Filter artists with at least 10 songs\nartists_with_10_songs = artist_counts[artist_counts >= 10]\n\n# Filter the dataframe for artists with at least 10 songs\ndf_filtered = df[df['artist'].isin(artists_with_10_songs.index)]\n\n# Calculate the average energy for each artist\naverage_energy = df_filtered.groupby('artist')['energy'].mean()\n\n# Sort the artists by average energy in descending order\ntop_5_artists = average_energy.nlargest(5)\n\n# Print the top 5 artists\nprint(top_5_artists)"
}
Got output: StdOut:
b'artist\nWALK THE MOON      0.819200\nDisclosure         0.777750\nRick Ross          0.754769\nBackstreet Boys    0.736000\nDrake              0.564750\nName: energy, dtype: float64\n'
StdErr:
b''
The top 5 artists with

In [None]:
from llama_index import download_loader
import openai
import os

# Read the key from the environment so that no secret is stored in the notebook.
# NOTE: a key that was previously committed in this cell must be treated as
# compromised and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

HubspotReader = download_loader('HubspotReader')

# The Hubspot token is likewise taken from the environment instead of being
# typed into the cell.
reader = HubspotReader(os.environ["HUBSPOT_ACCESS_TOKEN"])
documents = reader.load_data()

In [None]:
from llama_index.llms import OpenAI
from llama_index import ServiceContext, OpenAIEmbedding
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.response_synthesizers import ResponseMode, get_response_synthesizer

# LLM and service context intended for writing the per-document summaries.
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size=1024)

# Synthesizer that produces each summary with the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)

# Pass the service context and synthesizer to the index. Previously they were
# created but never handed over, so the configuration above had no effect.
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents=documents,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
)

In [None]:
from typing import List
from pydantic import BaseModel


class HubspotLeadsOneDayEmail(BaseModel):
    """Data model for leads eligible for one day follow up emails."""

    # Email addresses of the matching leads, one string per lead.
    lead_emails: List[str]

In [None]:
from llama_index import VectorStoreIndex

# Query an index built from the Hubspot documents. The previous code reused
# `index`, which at this point holds the BCG PDF report, so the leads question
# was being asked against the wrong data.
hubspot_index = VectorStoreIndex.from_documents(documents)

query_engine = hubspot_index.as_query_engine(
    output_cls=HubspotLeadsOneDayEmail, response_mode="compact"
)

response = query_engine.query("Give me list of email addresses of leads created yesterday")

In [None]:
import os

import openai

# Read the key from the environment so that no secret is stored in the notebook.
# NOTE: a key that was previously committed in this cell must be treated as
# compromised and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]
from llama_index.agent import OpenAIAgent

from llama_hub.tools.gmail.base import GmailToolSpec
tool_spec = GmailToolSpec()

agent = OpenAIAgent.from_tools(tool_spec.to_tool_list(), verbose=True)

# `response` is a query-engine response object, not a string, so it is converted
# explicitly; concatenating it to the prompt directly raises a TypeError.
email_request = (
    """I want to write follow up emails to leads who registered on Nanonets yesterday. 
    For each of these leads, write an email draft elaborating how their specific company can benefit from Nanonets OCR - """
    + str(response)
)

print(agent.chat(email_request))