# Information Extraction

1. Load and split
2. Vector search (retrieval)

TODO:
* paper for extraction

In [14]:
# Launch the local Phoenix app, open its view, and instrument LangChain with an
# OpenInference tracer.
import phoenix as px
from phoenix.trace.langchain import LangChainInstrumentor, OpenInferenceTracer

session = px.launch_app()
px.active_session().view()

tracer = OpenInferenceTracer()
instrumentor = LangChainInstrumentor(tracer)
instrumentor.instrument()

🌍 To view the Phoenix app in your browser, visit http://127.0.0.1:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
📺 Opening a view to the Phoenix app. The app is running at http://127.0.0.1:6006/


# Loader

## Normal HTML reader

In [1]:
# Execute loader.ipynb inside this kernel. The next cell uses `load_and_split`
# and `TEST_URLS`, which are presumably defined there — TODO confirm.
%run loader.ipynb

In [2]:
# Load and split the "goldman" test entry; the cell output lists the metadata
# of every resulting split.
goldman_source = TEST_URLS["goldman"]
html_header_splits = load_and_split(**goldman_source)
[chunk.metadata for chunk in html_header_splits]

Fetching pages: 100%|########################################################################################| 1/1 [00:00<00:00,  2.65it/s]


132028


[{},
 {'Header 2': 'Executive summary'},
 {'Header 2': 'Introduction'},
 {'Header 2': 'AI and geopolitics'},
 {'Header 2': 'The AI incumbents: The US & China'},
 {'Header 2': 'The AI capabilities of geopolitical swing states'},
 {'Header 2': 'The European Union'},
 {'Header 2': 'The United Kingdom'},
 {'Header 2': 'The United Arab Emirates'},
 {'Header 2': 'Israel'},
 {'Header 2': 'Japan, The Netherlands, South Korea, and Taiwan'},
 {'Header 2': 'India'},
 {'Header 2': 'The emerging AI powers'},
 {'Header 2': 'The geopolitics of AI governance'},
 {'Header 2': 'The emerging AI powers'},
 {'Header 2': 'How technological development drives geopolitics'},
 {'Header 2': 'The key debate: scale up or scale down?'},
 {'Header 2': 'The future of AI governance'},
 {'Header 2': 'Charting the chip wars'},
 {'Header 2': 'AI and commerce'},
 {'Header 2': 'Energy'},
 {'Header 2': 'Compute'},
 {'Header 2': 'Data'},
 {'Header 2': 'Models'},
 {'Header 2': 'What to watch'}]

## Unstructured PDF reader

In [6]:
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Read the local PDF with the loader's default mode (the "elements" mode stays disabled).
loader = UnstructuredPDFLoader("data_pdfs/tepco-tfcd-2023.pdf")  # mode="elements"
data = loader.load()

# Chunks of up to 2000 units with an overlap of 20, measured with len().
splitter_options = dict(
    chunk_size=2000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# NOTE: these are PDF chunks despite the "html" name; the name is kept because
# the vector-store cell reads `html_header_splits`.
html_header_splits = text_splitter.split_documents(data)
len(html_header_splits)

12

## `llmsherpa` Layout PDF reader
https://blog.llamaindex.ai/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125

In [19]:
# https://github.com/nlmatics/llmsherpa?tab=readme-ov-file#read-a-pdf-file
from langchain.docstore.document import Document
from llmsherpa.readers import LayoutPDFReader

llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# also allowed is a file path e.g. /home/downloads/xyz.pdf
pdf_url = "https://www.tepco.co.jp/en/hd/about/ir/library/integratedreport/pdf/2023TCFD-e.pdf"

pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)

# Wrap the context text of every parsed chunk in a LangChain Document.
html_header_splits = [
    Document(page_content=chunk.to_context_text())
    for chunk in doc.chunks()
]

# Vector store

In [3]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import SKLearnVectorStore

# Alternative embedding models (disabled):
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# embeddings = HuggingFaceEmbeddings(model_name="WhereIsAI/UAE-Large-V1")

# set True to compute cosine similarity
encode_kwargs = dict(normalize_embeddings=True)
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    encode_kwargs=encode_kwargs,
)

# Index whatever the most recently run loader cell stored in `html_header_splits`.
vector_store = SKLearnVectorStore.from_documents(
    html_header_splits,
    embeddings,
)

In [26]:
# Query the vector store and, for every hit, report which query keywords occur
# in the hit's text (case-insensitive substring match).
#
# Only one query is active at a time. Alternatives used during exploration:
#   "japan"
#   "目標 削減 50%"
query = "target co2 emission reduction sales of power"
keywords = query.split(" ")

# Each hit is a (Document, relevance_score) tuple.
found_docs = vector_store.similarity_search_with_relevance_scores(query)

# The loop variable is deliberately not called `doc`, so the parsed PDF held in
# `doc` by the llmsherpa cell is not overwritten by running this cell.
for hit in found_docs:
    document, _score = hit
    # Lower-case the page text once per hit instead of once per keyword.
    content_lower = document.page_content.lower()
    for keyword in keywords:
        if keyword.lower() in content_lower:
            print(f"* `{keyword}` included")
    print(hit)
    print()

* `target` included
* `co2` included
* `emission` included
* `reduction` included
* `of` included
* `power` included
(Document(page_content='Metrics and Target : GHG Emission Reduction > Achieving CO2 reduction targets\nThe TEPCO Group aims to reduce CO2 emissions originating from the sale of power by 50% of FY2013 levels by the year FY2030.', metadata={'id': '58486f19-1311-41e0-b1f6-fbb8db191ced'}), 0.8109984422679806)

* `target` included
* `reduction` included
* `of` included
* `power` included
(Document(page_content='TEPCO Group At a Glance > Real GDP （FY2022) > Metrics and Targets > GHG Reduction Target\nScope3 (the sale of power)', metadata={'id': 'e267b12e-3819-4123-8f41-28516345125e'}), 0.807420567403075)

* `co2` included
* `emission` included
* `of` included
* `power` included
(Document(page_content='TEPCO Group At a Glance > Real GDP （FY2022) > Carbon Neutrality Strategies > Carbon neutrality declaration > Reduce CO2 emissions originating from the sale of power to 50% of FY2

## POC for exact info extraction
* Extracting Accurate Materials Data from Research Papers with Conversational Language Models and Prompt Engineering
  * https://arxiv.org/abs/2303.05352
* Need to update the prompt text for Llama2 Chat model
  * add special chars?
  * put the context before the question (like a chat)

In [6]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import Ollama

# Model tag passed to Ollama.
OLLAMA_MODEL = "llama2:7b-chat"
llm = Ollama(model=OLLAMA_MODEL)

In [85]:
from langchain.prompts import PromptTemplate

# Yes/No verification prompt with two template variables: {doc} (the text under
# test, placed before the question) and {property}.
# NOTE(review): the material phrase "a full database of critical cooling rates"
# is hard-coded in the question — confirm whether it should become a variable.
PROMPT_1ST_TEMPLATE = '''<s>[INST]
---
{doc}
---

There is a possibility that the data you extracted is incorrect.
Answer "Yes" or "No" only. Be very strict. Is a full database of critical cooling rates the first compound for which the value of {property} is given the above text? Make sure it is a real compound.[/INST]
'''

prompt_1st = PromptTemplate.from_template(PROMPT_1ST_TEMPLATE)

# Pipe the formatted prompt straight into the LLM.
chain = prompt_1st | llm

from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter

# Sample passage used as input for the extraction prompt; presumably taken from
# the ChatExtract paper linked in this section — TODO confirm. Line breaks are
# replaced with spaces so the passage becomes one single-line string.
test_text = """This paper demonstrates that conversational LLMs
such as ChatGPT, with proper prompt engineering and
a series of follow-up questions, such as the ChatExtract
approach presented here, are capable of providing high
quality materials data extracted from research texts with
no additional fine-tuning, extensive code development
or deep knowledge about the property for which the
data is extracted. We present such a series of wellengineered prompts and follow-up questions in this paper and demonstrate its effectiveness resulting in a best
performance of over 90% precision at 87.7% recall on
our test set of bulk modulus data, and 91.6% precision
and 83.6% on a full database of critical cooling rates.
We show that the success of the ChatExtract method
lies in asking follow-up questions with purposeful redundancy and introduction of uncertainty and information
retention within the conversation by comparing to results when these aspects are removed. We further develop
two databases using ChatExtract - a database of critical cooling rates for metallic glasses and yield strengths
for high entropy alloys. The first one was modest-sized
and served as a benchmark for full database development
since we were able to compare it to data we extracted
manually. The second one was a large database, to our
knowledge the largest database of yield strength of high
entropy alloys to date. The high quality of the extracted
data and the simplicity of the approach suggests that
approaches similar to ChatExtract offer an opportunity
to replace previous, more labor intensive, methods. Since
ChatExtract is largely independent of the used model, it
is also likely improve by simply applying it to newer and
more capable LLMs as they are developed in the future.""".replace('\n', ' ')

# Naive sentence split on ". ", one Document per piece.
sentences = test_text.split('. ')
docs = [Document(page_content=sentence) for sentence in sentences]

# Ask the Yes/No prompt about the first three sentences only.
for sentence_doc in docs[:3]:
    print(sentence_doc)
    answer = chain.invoke({"doc": sentence_doc.page_content, "property": "precision"})
    print(answer)
    print("---")

page_content='This paper demonstrates that conversational LLMs such as ChatGPT, with proper prompt engineering and a series of follow-up questions, such as the ChatExtract approach presented here, are capable of providing high quality materials data extracted from research texts with no additional fine-tuning, extensive code development or deep knowledge about the property for which the data is extracted'
 No.
---
page_content='We present such a series of wellengineered prompts and follow-up questions in this paper and demonstrate its effectiveness resulting in a best performance of over 90% precision at 87.7% recall on our test set of bulk modulus data, and 91.6% precision and 83.6% on a full database of critical cooling rates'
 No
---
page_content='We show that the success of the ChatExtract method lies in asking follow-up questions with purposeful redundancy and introduction of uncertainty and information retention within the conversation by comparing to results when these aspects a

> sentence
Answer "Yes" or "No" only. Does the above text contain a value of {property}?
Answer "Yes" or "No" only. Does the above text contain more than one value of {property}?

> text
Give the number only without units, do not use a full sentence. If the value is not present, type "None". What is the value of {property} in the above text?
Give the unit only, do not use a full sentence. If the unit is not present, type "None". What is the unit of {property} in the above text?
Give the material name only, do not use a full sentence. If the material name is not present, type "None". What is the material name of {property} in the above text?

> multiple
Use only data present in the text. If data is not present in the text, type "None". Summarize the value of the {property} in the above text in a form of table consisting of: Material, Value, Unit.
There is a possibility that the data you extracted is incorrect. Answer "Yes" or "No" only. Be very strict. Is Bulk Modulus Data the first compound for which the value of {property} is given in the above text? Make sure it is a real compound.

## LLM

In [19]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import Ollama

# Same model setup as the Ollama cell in the POC section; running this cell
# re-creates `llm`.
OLLAMA_MODEL = "llama2:7b-chat"
llm = Ollama(model=OLLAMA_MODEL)

In [24]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

# Two prompt variants, each with a single template variable {docs}. They are
# kept under distinct names so that neither is silently discarded by
# reassignment; `check_template` below selects the one the chain uses.
# NOTE(review): unlike `prompt_1st`, neither template closes `[INST]` with
# `[/INST]` — confirm this is intended for the llama2 chat model.

# Variant 1: rate how strongly a paragraph relates to Japan (low / middle / high).
relevance_template = """<s>[INST]Please check if the following paragraph is related to Japan and rate this relevance to whether low, middle, or high.

{docs}

Helpful Answer:"""

# Variant 2: extract the sentences about Japan.
extraction_template = """<s>[INST]Please extract the sentences exactly regardng Japan.

{docs}

Helpful Answer:"""

# Active variant: the extraction prompt.
check_template = extraction_template

check_prompt = PromptTemplate.from_template(check_template)
check_chain = LLMChain(llm=llm, prompt=check_prompt)

In [25]:
# Run the check prompt on the top search hit.
# `found_docs` holds (Document, relevance_score) tuples, so only the document
# text is passed to the chain. Passing the whole tuple would put its repr
# (metadata, score, escaped newlines) into the {docs} slot of the prompt.
for hit in found_docs[:1]:
    document, _score = hit
    print(hit)
    print("---")
    print(check_chain.run(document.page_content))
    print("====")

(Document(page_content='South Korea, Japan, and Taiwan are home to some of the world’s most important semiconductor design and manufacturing companies, as well as semiconductor manufacturing equipment makers. They are also located in critical geographies for global supply chains along the South China Sea and East China Sea. The world’s great powers are dependent on these countries for their own technological competitiveness.  \nThe importance of these countries in multilateral cooperation has been clear for some time. In addition to their roles in the enforcement of export controls, in March 2022, US President Joe Biden reportedly proposed a “CHIP 4” grouping of the US and these three East Asian governments, a move seen as not only aimed at isolating Beijing, but also at creating greater supply-chain diversification and protecting companies’ intellectual property. However, the initiative has not yet made significant progress.  \nAs artificial intelligence becomes a greater part of dail