In [33]:
import os
import re
import getpass
import redis
from typing import List, Dict
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Redis
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import (
    get_query_constructor_prompt,
    load_query_constructor_runnable,
    AttributeInfo)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [4]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["LANGCHAIN_PROJECT"] = "html-chunking"
os.environ['TEXT_SOURCE'] = 'https://plato.stanford.edu/entries/goedel/'

OpenAI API Key: ········


In [14]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JackMiller\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JackMiller\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
def clean_text(content):
    tokens = word_tokenize(content)
    cleaned = ' '.join([t for t in tokens if t not in stopwords.words('english')])
    return cleaned

In [90]:
llm = ChatOpenAI(model='gpt-3.5-turbo',
                temperature=0)

embeddings = OpenAIEmbeddings(show_progress_bar=True)

loader = UnstructuredURLLoader([os.getenv('Text_SOURCE')])

splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                         chunk_overlap=0)

web_text = loader.load()
web_text[0].page_content = clean_text(web_text[0].page_content)
docs = splitter.split_documents(web_text)

Create vectorstore and QA retriever

In [91]:
vector_store = Redis.from_documents(docs,
                                   embeddings,
                                   redis_url='redis://localhost:6379')

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.04s/it]


In [98]:
retriever = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type='stuff',
                                       retriever=vector_store.as_retriever(),
                                       return_source_documents=True)

In [99]:
def print_result(response_obj):
    print("SOURCES: \n")
    cnt = 1
    for source_doc in response_obj["source_documents"]:
        print(f"Chunk #{cnt}")
        cnt += 1
        print("Source Metadata: ", source_doc.metadata)
        print("Source Text:")
        print(source_doc.page_content)
        print("\n")
    print("RESULT: \n")
    print(response_obj["result"] + "\n\n")

In [100]:
query = "explain godel's first incompleteness theorem."
response = retriever({"query":query})

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\callbacks\manager.py", line 1944, in _configure
    handler = LangChainTracer(project_name=tracer_project)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\tracers\langchain.py", line 96, in __init__
    self.client = client or get_client()
                            ^^^^^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\tracers\langchain.py", line 56, in get_client
    _CLIENT = Client()
              ^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langsmith\client.py", line 378, in __init__
    _validate_api_key_if_hosted(self.api_url, self.api_key)
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langsmith\client.py", line 245, in _validate_api_key_if_hosted
    raise ls_utils.Lan

SOURCES: 

Chunk #1
Source Metadata:  {'id': 'doc:5d1139e2b197483b9e83d64ed19f1cd1:d92e24c029384f8b96447dbfd07e65ed', 'source': 'https://plato.stanford.edu/entries/goedel/'}
Source Text:
completeness . As aside , von Neumann understood two theorems way , even Gödel . In fact von Neumann went much taking view showed infeasibility classical mathematics altogether . As wrote Carnap June 1931 : 9 ] And previous fall von Neumann written Gödel even stronger terms : It would take Gödel years see aspects Hilbert Program decisively refuted results ( Mancosu 2004 ) . 2.2.1 The First Incompleteness Theorem In Logical Journey ( Wang 1996 ) Hao Wang published full text material Gödel written ( Wang ’ request ) discovery incompleteness theorems . This material formed basis Wang ’ “ Some Facts Kurt Gödel , ” read approved Gödel : We see Gödel first tried reduce consistency problem analysis arithmetic . This seemed require truth definition arithmetic , turn led paradoxes , Liar paradox ( “ This senten

In [101]:
print_result(response)

SOURCES: 

Chunk #1
Source Metadata:  {'id': 'doc:5d1139e2b197483b9e83d64ed19f1cd1:d92e24c029384f8b96447dbfd07e65ed', 'source': 'https://plato.stanford.edu/entries/goedel/'}
Source Text:
completeness . As aside , von Neumann understood two theorems way , even Gödel . In fact von Neumann went much taking view showed infeasibility classical mathematics altogether . As wrote Carnap June 1931 : 9 ] And previous fall von Neumann written Gödel even stronger terms : It would take Gödel years see aspects Hilbert Program decisively refuted results ( Mancosu 2004 ) . 2.2.1 The First Incompleteness Theorem In Logical Journey ( Wang 1996 ) Hao Wang published full text material Gödel written ( Wang ’ request ) discovery incompleteness theorems . This material formed basis Wang ’ “ Some Facts Kurt Gödel , ” read approved Gödel : We see Gödel first tried reduce consistency problem analysis arithmetic . This seemed require truth definition arithmetic , turn led paradoxes , Liar paradox ( “ This senten

In [102]:
headers_to_split_on = [
    ("h1", "article_h1_main"),
    ("h2", "article_h2_subsection"),
    ("h3", "article_h3_subsection"),
    ("h4", "article_h4_subsection"),
]

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    return_each_element=False)

docs = html_splitter.split_text_from_url('https://plato.stanford.edu/entries/goedel/' )

for attr in docs[12]:
    print(attr, "\n")

('page_content', 'Gödel’s proof of the consistency of the continuum hypothesis with the axioms of Zermelo-Fraenkel set theory is a tour de force and arguably the greatest achievement of his mathematical life. This is because aside from the arithmetization, virtually all of the technical machinery used in the proof had to be invented ab initio.  \nThe Continuum Hypothesis (henceforth CH) was formulated by Georg Cantor, and was the first problem on Hilbert’s list of twenty-three unsolved problems as given in his famous address to the International Mathematical Congress in Paris in 1900. The problem as stated by Hilbert is as follows: Let A be an infinite set of real numbers. Then A is either countable, or has cardinality 2ℵ0, i.e., A is in one-to-one correspondence either with the set of natural numbers or with the set of all real numbers (otherwise known as the continuum). Another way to state the continuum hypothesis is that (the first uncountably infinite cardinal) ℵ1 = 2ℵ0.  \nAs ear

In [103]:
for doc in docs:
    if doc.metadata:
        print(doc.metadata, "\n")

{'article_h1_main': 'Kurt Gödel'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '1. Biographical Sketch'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work', 'article_h3_subsection': '2.1 The Completeness Theorem', 'article_h4_subsection': '2.1.1 Introduction'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work', 'article_h3_subsection': '2.1 The Completeness Theorem', 'article_h4_subsection': '2.1.2 Proof of the Completeness Theorem'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work', 'article_h3_subsection': '2.1 The Completeness Theorem', 'article_h4_subsection': '2.1.3 An Important Consequence of the Completeness Theorem'} 

{'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work', 'article_h3_subsection': '2.2 The Incompl

In [106]:
def build_index_schema(documents: List[Document]) -> Dict:
    schema = {"text": []}
    for doc in documents:
        for key in doc.metadata:
            name_dict = {"name": f"{key}"}
            if name_dict not in schema["text"]:
                schema["text"].append(name_dict)
    return schema

index_schema = build_index_schema(docs)
print(index_schema)

{'text': [{'name': 'article_h1_main'}, {'name': 'article_h2_subsection'}, {'name': 'article_h3_subsection'}, {'name': 'article_h4_subsection'}]}


In [107]:
vectorstore = Redis.from_documents(
    docs,
    embeddings,
    redis_url='redis://localhost:6379',
    index_schema = index_schema
)
retriever = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

`index_schema` does not match generated metadata schema.
If you meant to manually override the schema, please ignore this message.
index_schema: {'text': [{'name': 'article_h1_main'}, {'name': 'article_h2_subsection'}, {'name': 'article_h3_subsection'}, {'name': 'article_h4_subsection'}]}
generated_schema: {'text': [], 'numeric': [], 'tag': []}

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.13s/it]


In [108]:
query = "explain godel's first incompleteness theorem."
response = retriever({"query": query})

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\callbacks\manager.py", line 1944, in _configure
    handler = LangChainTracer(project_name=tracer_project)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\tracers\langchain.py", line 96, in __init__
    self.client = client or get_client()
                            ^^^^^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langchain_core\tracers\langchain.py", line 56, in get_client
    _CLIENT = Client()
              ^^^^^^^^
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langsmith\client.py", line 378, in __init__
    _validate_api_key_if_hosted(self.api_url, self.api_key)
  File "C:\Users\JackMiller\anaconda3\envs\LLM\Lib\site-packages\langsmith\client.py", line 245, in _validate_api_key_if_hosted
    raise ls_utils.Lan

In [110]:
print_result(response)

SOURCES: 

Chunk #1
Source Metadata:  {'id': 'doc:009238b9df654e079fa106ac9bd23353:39da3bf11eed4a109c9efe17bcd0031e', 'article_h1_main': 'Kurt Gödel', 'article_h2_subsection': '2. Gödel’s Mathematical Work', 'article_h3_subsection': '2.2 The Incompleteness Theorems', 'article_h4_subsection': '2.2.3 The Second Incompleteness Theorem'}
Source Text:
The Second Incompleteness Theorem establishes the unprovability, in number theory, of the consistency of number theory. First we have to write down a number-theoretic formula that expresses the consistency of the axioms. This is surprisingly simple. We just let Con(P) be the sentence ¬Prov(⌈0 = 1⌉).  
Theorem 4 (Gödel’s Second Incompleteness Theorem) If P is consistent, then Con(P) is not provable from P.  
Proof: Let φ be as in (3). The reasoning used to infer ‘if P ⊢ φ, then P ⊢ 0 ≠ 1‘ does not go beyond elementary number theory, and can therefore, albeit with a lot of effort (see below), be formalized in P. This yields: P ⊢ (Prov(⌈φ⌉) → ¬Co