# Automatic Metadata Extraction

In [1]:
!pip install llama_index

Collecting llama_index
  Downloading llama_index-0.9.39-py3-none-any.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama_index)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama_index)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting httpx (from llama_index)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.1.0 (from llama_index)
  Downloading openai-1.10.0-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken>=0.3.3 (from llama_index)
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2

# Setup

In [2]:
import os
from getpass import getpass

import nest_asyncio

# Patch asyncio so an event loop can be re-entered from inside the
# notebook kernel's already-running loop.
nest_asyncio.apply()

# Do not store the key in the notebook: reuse OPENAI_API_KEY when it is
# already exported, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Define Metadata Extractor

In [3]:
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode

In [4]:
# LLM handed to both metadata extractors below: low temperature and a
# 128-token cap on each completion.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=128,
)

# Here we define Metadata Extractors

* QuestionsAnsweredExtractor
* SummaryExtractor

In [5]:
from llama_index.node_parser import TokenTextSplitter
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

In [8]:
# Token-based splitter: chunk_size=64 with chunk_overlap=32, splitting on
# single spaces.
node_parser = TokenTextSplitter(
    chunk_size=64,
    chunk_overlap=32,
    separator=" ",
)

In [9]:
# Extractor configured for two questions per node; its output appears under
# the `questions_this_excerpt_can_answer` metadata key further down.
question_answer_extractor = QuestionsAnsweredExtractor(
    llm=llm,
    questions=2,
    metadata_mode=MetadataMode.EMBED,
)

In [10]:
# Extractor configured to summarise the previous, current and next node
# relative to each node it processes.
summary_extractor = SummaryExtractor(
    llm=llm,
    summaries=["prev", "self", "next"],
)

# Load data and run extractors

In [11]:
from llama_index import download_loader

# Look the reader class up by name, then fetch the Cricbuzz home page with
# `html_to_text=True`. The page is live, so the content and the node
# counts further down will differ between runs.
SimpleWebPageReader = download_loader("SimpleWebPageReader")
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(
    urls=["https://www.cricbuzz.com/"],
)

In [12]:
# Metadata of the freshly loaded document; it is empty ({}) at this point.
documents[0].metadata

{}

In [14]:
# Raw text of the fetched page. NOTE(review): this prints the whole page;
# consider slicing the string before sharing the notebook.
documents[0].get_content()

'[](https://plus.google.com/104502282508811467249)[](Javascript:void\\(0\\))[✖](Javascript:void\\(0\\))\n\n[![Cricbuzz Logo](https://static.cricbuzz.com/images/cb_logo.svg)](/)[Live\nScores](/cricket-match/live-scores "Live Cricket Score")[Schedule](/cricket-\nschedule/upcoming-series/international "Cricket Schedule")[Archives](/cricket-\nscorecard-archives "Cricket Scorecard Archives")\n\n[News](/cricket-news)[All Stories](/cricket-news "Latest Cricket News")\n[Cricbuzz Plus](/cricket-news/editorial/cb-plus "Cricbuzz Plus Premium\nArticles")[Latest News](/cricket-news/latest-news "Latest Cricket\nNews")[Topics](/cricket-news/info/ "Latest Cricket\nTopics")[Spotlight](/cricket-news/editorial/spotlight "Cricket Editorials and\nSpecials")[Opinions](/cricket-news/editorial/editorial-list "Latest Cricket\nOpinions & Editorials")[Specials](/cricket-news/editorial/specials "Latest\nCricket Specials")[Stats & Analysis](/cricket-news/editorial/stats-analysis\n"Latest Cricket Stats & Analysis")

In [19]:
# Chunk the loaded documents into nodes, then show how many were produced.
orig_nodes = node_parser.get_nodes_from_documents(documents)
len(orig_nodes)

355

In [26]:
# Keep just two consecutive nodes (indices 20 and 21) for the extractors.
# The same 20/22 bounds are used again when the index is built.
nodes = orig_nodes[20:22]

In [27]:
# Second selected node before extraction: only the raw chunk text shows,
# because no metadata has been attached yet.
nodes[1].get_content(metadata_mode="all")

'"Ranji Trophy 2023-24") [All Series\n»](/cricket-schedule/series)\n\n[Teams](/cricket-team)\n\n#### Test Teams\n\n[India](/cricket-team/india/2 "India Cricket Team")'

In [29]:
# First selected node before extraction; it is the chunk immediately
# preceding nodes[1] and overlaps it.
nodes[0].get_content(metadata_mode="all")

'"Super Smash 2023-24") [Ranji Trophy 2023-24](/cricket-\nseries/6725/ranji-trophy-2023-24 "Ranji Trophy 2023-24") [All'

# Run Metadata extractors

In [30]:
# Run the summary extractor first, then pass its result straight to the
# question extractor. Each extractor makes LLM calls, hence the two
# progress bars below.
nodes_1 = question_answer_extractor(summary_extractor(nodes))


  0%|          | 0/2 [00:00<?, ?it/s]
 50%|█████     | 1/2 [00:01<00:01,  1.86s/it]
100%|██████████| 2/2 [00:21<00:00, 10.93s/it]

  0%|          | 0/2 [00:00<?, ?it/s]
 50%|█████     | 1/2 [00:21<00:21, 21.58s/it]
100%|██████████| 2/2 [00:41<00:00, 20.78s/it]


# Visualize some sample data

In [31]:
# Same node after extraction: the summaries and generated questions are
# now rendered ahead of the excerpt text.
nodes_1[1].get_content(metadata_mode="all")

'[Excerpt from document]\nprev_section_summary: The section is discussing two cricket tournaments, namely the Super Smash 2023-24 and the Ranji Trophy 2023-24. The key topics include these tournaments and their respective years. The key entities mentioned are the Super Smash and the Ranji Trophy.\nsection_summary: The section is about the Ranji Trophy 2023-24 cricket tournament. It mentions the teams participating in the tournament, with a specific focus on the Test teams. The only Test team mentioned in the section is India.\nquestions_this_excerpt_can_answer: 1. Which cricket tournament is the section discussing?\n2. Which Test team is mentioned in the section about the Ranji Trophy 2023-24 cricket tournament?\nExcerpt:\n-----\n"Ranji Trophy 2023-24") [All Series\n»](/cricket-schedule/series)\n\n[Teams](/cricket-team)\n\n#### Test Teams\n\n[India](/cricket-team/india/2 "India Cricket Team")\n-----'

# Set up RAG query engine

In [32]:
from llama_index import VectorStoreIndex
from llama_index.response.notebook_utils import (
    display_source_node,
    display_response,
)

In [33]:
# Rebuild the full node list with the two enriched nodes in place of
# positions 20-21, then index it.
index = VectorStoreIndex(
    [*orig_nodes[:20], *nodes_1, *orig_nodes[22:]]
)

In [34]:
# Query engine over the index, created with default settings.
query_engine = index.as_query_engine()

# Querying

In [43]:
# Two questions copied from the `questions_this_excerpt_can_answer`
# metadata generated above. Adjacent string literals are concatenated with
# no separator, so each question needs its own "?" and a space between them;
# otherwise the text runs together as "...discussingWhich Test team...".
query_str = (
    "Which cricket tournament is the section discussing? "
    "Which Test team is mentioned in the section about the Ranji Trophy 2023-24 cricket tournament?"
)
# Send the combined question to the query engine and keep the response
# for display in the next cell.
response = query_engine.query(query_str)

# Response

In [44]:
# Render the answer in the notebook.
# NOTE(review): source_length presumably caps the length of any source text
# shown alongside the answer; confirm against the notebook_utils docs.
display_response(
    response,
    source_length=500,
)

**`Final Response:`** The section is discussing the Ranji Trophy 2023-24 cricket tournament. The Test team mentioned in the section about the Ranji Trophy 2023-24 cricket tournament is India.