In [5]:
# %pip install -U unstructured unstructured-inference onnx pytesseract  chromadb
# pip install -U python-poppler

## Multi-vector retriever

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

file_path = 'mixed_data/element_of_SL.pdf'

loader = PyPDFLoader(file_path=file_path)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=0
)

sl_data = loader.load_and_split(text_splitter=text_splitter)
sl_data[:5]

[Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 0}, page_content='Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-\nnology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the approach is statistical, theemphasis is on concepts rather than mathematics. Many examples 

In [2]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

vectorstore = Chroma(
    collection_name="statistical_learning",
    embedding_function=OpenAIEmbeddings()
)

store = InMemoryStore()
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

retriever

  warn_deprecated(
  warn_deprecated(


MultiVectorRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001D1F9C76AE0>, docstore=<langchain_core.stores.InMemoryStore object at 0x000001D1F6DB3E90>)

In [3]:
import uuid

doc_ids = [str(uuid.uuid4()) for _ in sl_data]
doc_ids[:5]

['b6a30443-5efc-4efd-aa5a-12ecb088ebff',
 '7460ebe9-dd33-49c5-bebf-66a40732f22e',
 '31f767ce-bc85-4dcc-b16b-4c2c1468b0cf',
 'f7faad30-f6ce-415c-8763-b8edbf60d8f1',
 'ed015eef-34ca-4b39-8b0e-c9145a191e7c']

In [4]:
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for i, doc in enumerate(sl_data):
    doc_id = doc_ids[i]
    sub_docs = child_text_splitter.split_documents([doc])
    for sub_doc in sub_docs:
        sub_doc.metadata[id_key] = doc_id
    all_sub_docs.extend(sub_docs)
    
all_sub_docs[:5]

[Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 0, 'doc_id': 'b6a30443-5efc-4efd-aa5a-12ecb088ebff'}, page_content='Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-'),
 Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 0, 'doc_id': 'b6a30443-5efc-4efd-aa5a-12ecb088ebff'}, page_content='nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different'),
 Document

In [5]:
retriever.vectorstore.add_documents(all_sub_docs)
retriever.docstore.mset(list(zip(doc_ids, sl_data)))

In [6]:
retriever.vectorstore.similarity_search("Linear regression")

[Document(metadata={'doc_id': '066bc2a3-2954-4f3b-8c24-0825a3ac7784', 'page': 118, 'source': 'mixed_data/element_of_SL.pdf'}, page_content='100 3. Linear Methods for Regression'),
 Document(metadata={'doc_id': 'c1257b88-c5bf-4189-9a30-e44cad19d87b', 'page': 72, 'source': 'mixed_data/element_of_SL.pdf'}, page_content='54 3. Linear Methods for Regression\nx1x2y\nˆ yz z z z z\nFIGURE 3.4. Least squares regression by orthogonalization of the inputs. The\nvector x2is regressed on the vector x1, leaving the residual vector z. The regres-\nsion of yonzgives the multiple regression coeﬃcient of x2. Adding together the\nprojections of yon each of x1andzgives the least squares ﬁt ˆy.'),
 Document(metadata={'doc_id': '28eb91a6-a299-4c3d-bbd9-6cac7f34540f', 'page': 222, 'source': 'mixed_data/element_of_SL.pdf'}, page_content='linear model\nf(X) =α(Z) +β1(Z)X1+≤≤≤+βq(Z)Xq. (6.16)\nFor given Z, this is a linear model, but each of the coeﬃcients can vary\nwithZ. It is natural to ﬁt such a model by lo

In [7]:
retriever.get_relevant_documents("Linear regression")

  warn_deprecated(


[Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 118}, page_content='100 3. Linear Methods for Regression'),
 Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 72}, page_content='54 3. Linear Methods for Regression\nx1x2y\nˆ yz z z z z\nFIGURE 3.4. Least squares regression by orthogonalization of the inputs. The\nvector x2is regressed on the vector x1, leaving the residual vector z. The regres-\nsion of yonzgives the multiple regression coeﬃcient of x2. Adding together the\nprojections of yon each of x1andzgives the least squares ﬁt ˆy.\nAlgorithm 3.1 Regression by Successive Orthogonalization.\n1. Initialize z0=x0=1.\n2. For j= 1,2,... ,p\nRegress xjonz0,z1,... ,,zj−1to produce coeﬃcients ˆ γℓj=\n⟨zℓ,xj⟩/⟨zℓ,zℓ⟩,ℓ= 0,... ,j −1 and residual vector zj=\nxj−∑j−1\nk=0ˆγkjzk.\n3. Regress yon the residual zpto give the estimate ˆβp.\nThe result of this algorithm is\nˆβp=⟨zp,y⟩\n⟨zp,zp⟩. (3.28)\nRe-arranging the residual in step 2, we can see that each

In [8]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0)

chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
    verbose=True
)

chain.run("What is linear regression?")

  warn_deprecated(
  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Linear regression is a statistical method used to model the relationship between a dependent variable and one or more independent variables by fitting a linear equation to the observed data. The goal of linear regression is to find the best-fitting line that describes the relationship between the variables. In this method, the regression function is assumed to be linear in the inputs, and the parameters of the linear equation are estimated using techniques like least squares to minimize the difference between the observed values and the predicted values.'

## Hypothetical queries

In [9]:
from langchain.chains import LLMChain
from langchain.output_parsers import NumberedListOutputParser

prompt = """
Generate a numbered list of 3 hypothetical questions that the below document could be used to answer:

{doc}
"""

llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

chain = LLMChain.from_string(
    llm=llm,
    template=prompt,
)

chain.verbose = True
chain.output_parser = NumberedListOutputParser()

chain.run(sl_data[20].page_content)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Generate a numbered list of 3 hypothetical questions that the below document could be used to answer:

2 1. Introduction
TABLE 1.1. Average percentage of words or characters in an email message
equal to the indicated word or character. We have chosen the wo rds and characters
showing the largest diﬀerence between spamandemail.
george you your hp free hpl ! our re edu remove
spam 0.00 2.26 1.38 0.02 0.52 0.01 0.51 0.51 0.13 0.01 0.28
email 1.27 1.27 0.44 0.90 0.07 0.43 0.11 0.18 0.42 0.29 0.01
measurements for a set of objects (such as people). Using this data we build
a prediction model, or learner , which will enable us to predict the outcome
for new unseen objects. A good learner is one that accurately predicts such
an outcome.
The examples above describe what is called the supervised learning prob-
lem. It is called “supervised” because of the presence of the outcome vari-
able to guide the learning pr

['What are the average percentages of words or characters in an email message that are equal to specific words or characters?',
 'How can the data in Table 1.1 be used to build a prediction model for classifying email messages as spam or email?',
 'What are some examples of real learning problems discussed in the book, and how do they relate to supervised and unsupervised learning?']

In [10]:
vectorstore = Chroma(
    collection_name="hypo-questions",
    embedding_function=OpenAIEmbeddings()
)

store = InMemoryStore()
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in sl_data]

In [11]:
from langchain.schema.document import Document

question_docs = []
for i, doc in enumerate(sl_data):
    result = chain.run(doc.page_content)
    question_docs.extend([
        Document(
            page_content=s,
            metadata={id_key: doc_ids[i]}
        ) for s in result
    ])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Generate a numbered list of 3 hypothetical questions that the below document could be used to answer:

Springer Series in Statistics
Trevor Hastie
Robert TibshiraniJerome FriedmanSpringer Series in Statistics
The Elements of
Statistical Learning
Data Mining, Inference, and Prediction
The Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-
nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework. While the a

BadRequestError: Error code: 400 - {'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_request_error', 'param': 'prompt', 'code': 'invalid_prompt'}}

In [12]:
question_docs[:5]

[Document(metadata={'doc_id': '3310723c-02cc-4ba2-a042-6090282c3d54'}, page_content='What are the important ideas and concepts in the field of statistics, data mining, and machine learning?'),
 Document(metadata={'doc_id': '3310723c-02cc-4ba2-a042-6090282c3d54'}, page_content='What are some of the new topics and methods covered in the second edition of "The Elements of Statistical Learning"?'),
 Document(metadata={'doc_id': '3310723c-02cc-4ba2-a042-6090282c3d54'}, page_content='Who are the authors of "The Elements of Statistical Learning" and what are their contributions to the field of statistics and data mining?'),
 Document(metadata={'doc_id': '7bd1af61-cf27-4221-984f-20cedcab43e1'}, page_content='Who are the parents of Valerie and Patrick Hastie?'),
 Document(metadata={'doc_id': '7bd1af61-cf27-4221-984f-20cedcab43e1'}, page_content='Who are the parents of Vera and Sami Tibshirani?')]

In [13]:
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, sl_data)))

In [14]:
retriever.vectorstore.similarity_search("What is linear regression?")

[Document(metadata={'doc_id': 'f671344c-8a29-4c01-ae4f-2fad6d2001e2'}, page_content='How can linear methods be applied to regression problems?'),
 Document(metadata={'doc_id': 'e9bc5669-0511-4ffe-ae8b-e72dc3fd3f83'}, page_content='What is the formula for the linear regression model?'),
 Document(metadata={'doc_id': 'f671344c-8a29-4c01-ae4f-2fad6d2001e2'}, page_content='What are some linear methods that can be used for regression analysis?'),
 Document(metadata={'doc_id': 'f744d282-364e-46fd-a9e0-aace721f6a66'}, page_content='What is the linear model used for prediction?')]

In [15]:
retriever.get_relevant_documents("What is linear regression?")

[Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 118}, page_content='100 3. Linear Methods for Regression'),
 Document(metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 62}, page_content='44 3. Linear Methods for Regression\n3.2 Linear Regression Models and Least Squares\nAs introduced in Chapter 2, we have an input vector XT= (X1,X2,... ,X p),\nand want to predict a real-valued output Y. The linear regression model\nhas the form\nf(X) =β0+p∑\nj=1Xjβj. (3.1)\nThe linear model either assumes that the regression function E( Y|X) is\nlinear, or that the linear model is a reasonable approximation. Here the\nβj’s are unknown parameters or coeﬃcients, and the variables Xjcan come\nfrom diﬀerent sources:\n•quantitative inputs;\n•transformations of quantitative inputs, such as log, square-root or\nsquare;\n•basis expansions, such as X2=X2\n1,X3=X3\n1, leading to a polynomial\nrepresentation;\n•numeric or “dummy” coding of the levels of qualitative inputs. For\nex

In [16]:
llm = ChatOpenAI(temperature=0)

chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
    verbose=True
)

chain.run("What is linear regression?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Linear regression is a statistical method used to model the relationship between a dependent variable (Y) and one or more independent variables (X). In linear regression, the relationship is assumed to be linear, and the goal is to find the best-fitting line that describes how the dependent variable changes as the independent variable(s) change. The coefficients in the linear regression equation represent the impact of the independent variables on the dependent variable. The most common method used to estimate these coefficients is the least squares method, which minimizes the sum of the squared differences between the observed values and the values predicted by the linear model.'

In [23]:
# %pip install pillow_heif

## Parsing a multimodal document

In [None]:
from unstructured.partition.pdf import partition_pdf

path = "multimodal/"

raw_pdf_elements = partition_pdf(
    filename=path + "LLAVA.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000, 
    new_after_n_chars=3800, 
    combine_text_under_n_chars=2000,
    image_output_dir_path=path + 'images/'
)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# raw_pdf_elements[20].text 

In [22]:
# table_elements = []
# text_elements = []
# for element in raw_pdf_elements:
#     if element.category == "Table":
#         table_elements.append(element.text)
#     elif element.category == "CompositeElement":
#         text_elements.append(element.text)

# print(len(table_elements))
# print(len(text_elements))

## Summarize

In [24]:
# prompt = """You are an assistant tasked with summarizing tables and text. 
# Give a concise summary of the table or text.

# Table or text chunk: {element}
# """ 

# model = ChatOpenAI(temperature=0, model_name='gpt-4')
# summarize_chain = LLMChain.from_string(
#     llm=model,
#     template=prompt
# )

In [None]:
# table_summaries = summarize_chain.batch(table_elements)
# text_summaries = summarize_chain.batch(text_elements)

In [None]:
# table_summaries

## Images with LLAVA

In [None]:
# %%bash

# # Define the directory containing the images
# IMG_DIR=~/Projects/Teaching/Introduction_Langchain/projects/live/multimodal/images/
# TEXT_DIR=~/Projects/Teaching/Introduction_Langchain/projects/live/multimodal/text/

# # Loop through each image in the directory
# for img in "${IMG_DIR}"*.jpg; do
#     # Extract the base name of the image without extension
#     base_name=$(basename "$img" .jpg)

#     # Define the output file name based on the image name
#     output_file="${TEXT_DIR}${base_name}.txt"

#     # Execute the command and save the output to the defined output file
#     ~/Projects/Teaching/Introduction_Langchain/projects/live/multimodal/llama.cpp/build/bin/llava \
#     -m ~/Projects/Teaching/Introduction_Langchain/projects/live/multimodal/llama.cpp/models/ggml-model-q5_k.gguf \
#     --mmproj ~/Projects/Teaching/Introduction_Langchain/projects/live/multimodal/llama.cpp/models/mmproj-model-f16.gguf \
#     --temp 0.1 \
#     -p "Describe the image in detail. Be specific about graphs, such as bar plots." \
#     --image "$img" > "$output_file"

# done

In [None]:
# import glob
# from PIL import Image

# text_path = "multimodal/text/"
# images_path = "multimodal/images/"

# text_list = sorted(glob.glob(text_path + "*.txt"))
# img_list = sorted(glob.glob(images_path + "*.jpg"))

# logging_header="clip_model_load: total allocated memory: 201.27 MB\n\n"
# appendix='main: image encoded in'

# # Read each file and store its content in a list
# img_summaries = []
# for i, text_path in enumerate(text_list):
#     with open(text_path, 'r') as file:
#         summary = file.read()
    
#     summary = summary.split(logging_header, 1)[1].strip()
#     summary = summary.split(appendix, 1)[0].strip()
    
#     img_path = img_list[i]
#     img = Image.open(img_path)
    
#     img_summaries.append({
#         'summary': summary,
#         'image': img
#     })

In [None]:
# from IPython.display import display

# for img_dict in img_summaries:
#     display(img_dict['image'])
#     print(img_dict['summary'])

## Index data to database

In [1]:
# def get_docs(text_list, ids):
#     return [
#         Document(
#             page_content=s, 
#             metadata={id_key: ids[i]}
#         ) for i, s in enumerate(text_list)
#     ]

# doc_ids = [str(uuid.uuid4()) for _ in text_summaries]
# text_docs = get_docs(
#     [t['element'] for t in text_summaries], 
#     doc_ids
# )
# summary_text_docs = get_docs(
#     [t['text'] for t in text_summaries], 
#     doc_ids
# )

# table_ids = [str(uuid.uuid4()) for _ in table_summaries]
# table_docs = get_docs(
#     [t['element'] for t in table_summaries], 
#     table_ids
# )
# summary_table_docs = get_docs(
#     [t['text'] for t in table_summaries], 
#     table_ids
# )

# img_ids = [str(uuid.uuid4()) for _ in img_summaries]
# img_summary_docs = get_docs(
#     [i['summary'] for i in img_summaries], 
#     img_ids
# )


In [2]:
# vectorstore = Chroma(
#     collection_name="llava_pdf",
#     embedding_function=OpenAIEmbeddings()
# )

# store = InMemoryStore()

# retriever = MultiVectorRetriever(
#     vectorstore=vectorstore, 
#     docstore=store, 
#     id_key=id_key,
# )

# retriever.vectorstore.add_documents(summary_text_docs)
# retriever.docstore.mset(list(zip(doc_ids, text_docs)))

# retriever.vectorstore.add_documents(summary_table_docs)
# retriever.docstore.mset(list(zip(table_ids, table_docs)))

# retriever.vectorstore.add_documents(img_summary_docs)
# retriever.docstore.mset(list(zip(img_ids, img_summary_docs)))

In [3]:
# retriever.vectorstore.similarity_search("What is specific about LLava?")

In [4]:
# retriever.get_relevant_documents("What is specific about LLava?")

## Multimodal RAG

In [5]:
# llm = ChatOpenAI(temperature=0, model_name='gpt-4')

# chain = RetrievalQA.from_chain_type(
#     llm=llm, 
#     retriever=retriever,
#     verbose=True
# )

In [6]:
# chain.run('What makes LLava different GPT-4')

In [7]:
# chain.run('What is the architecture of the LLava model?')