# Retrieval Augmented Generation


<img width="1047" alt="Screenshot 2024-04-15 at 12 24 00 PM" src="https://github.com/harrywang/langchain-short-course/assets/595772/0c3cdde0-831d-4e11-91f2-670abc580d89">

In [1]:
import os
import openai
import sys

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment;
# the return value is deliberately discarded.
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Chat model name handed to ChatOpenAI in the Q&A section.
llm_name = "gpt-4-turbo"

# Load PDF
`PyPDFLoader` loads the PDF one page at a time; each page becomes a `Document`.

A `Document` contains text (`page_content`) and `metadata`.

In [2]:
# Load the NBA rulebook PDF; `pages` holds one Document per PDF page.
# NOTE(review): newer LangChain releases expose this loader from
# langchain_community.document_loaders — confirm against the installed version.
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("data/nba-rules-2023.pdf")
pages = loader.load()

In [3]:
len(pages)

74

In [4]:
# Page 0 is a placeholder cover page: its extracted text is not rule content
# that a reader would normally see, but it is still loaded as a Document.
page = pages[0]
page.page_content

'This Page Intentionally Left Blank  \nIt is here to hold a place for cover for screen version.  \nDO NOT INCLUDE AS PART OF PRINT FILE!\nOFFICIAL\nRULES'

In [5]:
# Page 1 is the rules index. `page` stays bound to it for the metadata cell
# that follows.
page = pages[1]
page.page_content

'- 2 -RULES INDEX\n RULE SECTION ARTICLE PAGE\nBACKCOURT/FRONTCOURT \n\t Definitions  .......................................  4 VI a & b 19\n Eight (8) Second Violation  ..............  4 VI f 19\n Player Position Status  ......................  4 VI c 19\n Ball Position Status  ..........................  4 VI d, e, & g 19\nBALL\n Dead Ball  .........................................  6 IV a 26\n Jump—Center Circle  .......................  6 V  26\n Jump—Free Throw Circle  ...............  6 VI  27\n Live Ball  ..........................................  6 II  26\n Putting in Play  .................................  6 I  25\n Restrictions  ......................................  6 VII  27\n Starting of Games & Overtime(s)  .... 6 I a 25\n Starting of 2nd, 3rd, & 4th Periods  .. 6 I b 25\nBASKET RING, BACKBOARD, SUPPORT\n\t Definition  .........................................  4 I  17\n Hanging (Intentional) .......................  12A  IV  40\n Hanging (Prevent Injury)  ...........

In [6]:
page.metadata

{'source': 'data/nba-rules-2023.pdf', 'page': 1}

# Document Splitter


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Both sizes are counted in characters (not words or tokens).
chunk_size = 26  # number of characters allowed in one chunk
chunk_overlap = 3  # characters repeated at the start of the next chunk

In [9]:
# Recursive splitter driven by the chunk_size / chunk_overlap values defined
# in the previous cell.
r_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)

In [10]:
# Exactly chunk_size (26) characters, so the whole string fits in one chunk.
text1 = 'abcdefghijklmnopqrstuvwxyz'  # 26 characters
r_splitter.split_text(text1) # character based

['abcdefghijklmnopqrstuvwxyz']

In [11]:
# 52 characters: too long for a single chunk, so it is split into several.
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz'
# Each chunk begins with the last 3 characters of the previous chunk
# (chunk_overlap = 3).
r_splitter.split_text(text2)  # check out the overlap of 3

['abcdefghijklmnopqrstuvwxyz', 'xyzabcdefghijklmnopqrstuvw', 'uvwxyz']

In [12]:
# Sample prose for the splitter demo. The trailing backslashes join the source
# lines, so the only newlines inside the string are the explicit "\n\n".
# NOTE(review): the spelling errors and the missing space in "space.and" are
# part of the string and appear in the chunk output; the text is left as-is.
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [13]:
len(some_text)

496

In [14]:
# Rebind r_splitter with a 30-character chunk size and a 10-character overlap,
# then display the chunks it produces for the sample prose.
r_splitter = RecursiveCharacterTextSplitter(chunk_overlap=10, chunk_size=30)
r_splitter.split_text(some_text)

['When writing documents,',
 'writers will use document',
 'document structure to group',
 'to group content. This can',
 'This can convey to the',
 "to the reader, which idea's",
 "idea's are related. For",
 'For example, closely related',
 'related ideas are in',
 'are in sentances. Similar',
 'Similar ideas are in',
 'are in paragraphs. Paragraphs',
 'form a document.',
 'Paragraphs are often',
 'are often delimited with a',
 'with a carriage return or two',
 'or two carriage returns.',
 'returns. Carriage returns are',
 'are the "backslash n" you see',
 'you see embedded in this',
 'in this string. Sentences',
 'Sentences have a period at',
 'period at the end, but also,',
 'but also, have a space.and',
 'space.and words are separated',
 'separated by space.']

In [15]:
from langchain.text_splitter import CharacterTextSplitter

# Newline-based splitter applied to every loaded page of the PDF.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,      # target chunk size, as measured by length_function
    chunk_overlap=150,    # amount carried over between neighbouring chunks
    length_function=len,  # measure size in characters
    separator="\n",       # break the text on newlines
)
docs = text_splitter.split_documents(pages)

In [16]:
len(docs), len(pages)

(276, 74)

In [17]:
docs

[Document(page_content='This Page Intentionally Left Blank  \nIt is here to hold a place for cover for screen version.  \nDO NOT INCLUDE AS PART OF PRINT FILE!\nOFFICIAL\nRULES', metadata={'source': 'data/nba-rules-2023.pdf', 'page': 0}),
 Document(page_content='- 2 -RULES INDEX\n RULE SECTION ARTICLE PAGE\nBACKCOURT/FRONTCOURT \n\t Definitions  .......................................  4 VI a & b 19\n Eight (8) Second Violation  ..............  4 VI f 19\n Player Position Status  ......................  4 VI c 19\n Ball Position Status  ..........................  4 VI d, e, & g 19\nBALL\n Dead Ball  .........................................  6 IV a 26\n Jump—Center Circle  .......................  6 V  26\n Jump—Free Throw Circle  ...............  6 VI  27\n Live Ball  ..........................................  6 II  26\n Putting in Play  .................................  6 I  25\n Restrictions  ......................................  6 VII  27\n Starting of Games & Overtime(s)  ...

# Embedding

In [18]:
# OpenAI embeddings
# Guide: https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
#
# Default embedding vector length:
#   text-embedding-3-small: 1536
#   text-embedding-3-large: 3072
# Pricing recorded when the notebook was written (may be out of date):
#   text-embedding-3-small: $0.02 / 1M tokens
#   text-embedding-3-large: $0.13 / 1M tokens

from langchain_openai import OpenAIEmbeddings
# Embedding client reused by the vector stores built later in the notebook.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [19]:
# Two closely related sentences and one unrelated sentence, used below to
# compare how similar their embeddings are.
sentence1 = "i like dogs"
sentence2 = "i like cats"
sentence3 = "the weather is sweet outside"

In [20]:
# Embed the three sentences in order, one embed_query call per sentence.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
)

In [21]:
len(embedding1)

1536

In [22]:
embedding1

[0.016534641512257267,
 -0.0333399700702634,
 -7.30805419316741e-06,
 0.006355573006430424,
 0.02788105814742452,
 -0.011978027502936138,
 -0.0076864651064215365,
 0.037197303313023594,
 -0.07277046945039266,
 -0.022072505375643876,
 -0.004632744508107354,
 -0.00961513079647905,
 -0.016455689571709543,
 0.02463278007082511,
 -0.0026843415314048457,
 0.02355002071195864,
 -0.04177647488659799,
 -0.022771787073527394,
 0.024813240584851244,
 0.0009128734888874846,
 -0.009727917686422801,
 0.050709240528568955,
 -0.005143107885938316,
 -0.010833234609542539,
 -0.0029804085144910656,
 0.028174305551394406,
 0.027813386385987304,
 0.01961373925394105,
 0.00658678734153447,
 -0.021429617122732232,
 -0.010573824017613845,
 -0.010680971749324926,
 -0.010686610907557596,
 -0.019974658419348153,
 0.051250618345357025,
 -0.013365313513560417,
 -0.017594842375547887,
 -0.017312875150688512,
 -0.00014353965635921197,
 0.04491196607325365,
 -0.012666030884354315,
 -0.03309183965744522,
 0.0404004681

In [23]:
import numpy as np
# Pairwise dot products of the three sentence embeddings, in the order
# (1 vs 2, 1 vs 3, 2 vs 3); a larger value means the vectors are more aligned.
# NOTE(review): a dot product equals cosine similarity only for unit-length
# vectors — presumably true for these OpenAI embeddings; confirm.
np.dot(embedding1, embedding2), np.dot(embedding1, embedding3), np.dot(embedding2, embedding3)

(0.7222818700424505, 0.2677958372908107, 0.2573165872157418)

# Vector Store

In [24]:
# load and split pdf

from langchain.document_loaders import PyPDFLoader
# Reload the rulebook PDF: one Document per page.
loader = PyPDFLoader("data/nba-rules-2023.pdf")
pages = loader.load()

# Break each page into overlapping 500-character pieces for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits = text_splitter.split_documents(pages)

# Show how many pages went in and how many chunks came out.
(len(pages), len(splits))

(74, 571)

In [25]:
splits

[Document(page_content='This Page Intentionally Left Blank  \nIt is here to hold a place for cover for screen version.  \nDO NOT INCLUDE AS PART OF PRINT FILE!\nOFFICIAL\nRULES', metadata={'source': 'data/nba-rules-2023.pdf', 'page': 0}),
 Document(page_content='- 2 -RULES INDEX\n RULE SECTION ARTICLE PAGE\nBACKCOURT/FRONTCOURT \n\t Definitions  .......................................  4 VI a & b 19\n Eight (8) Second Violation  ..............  4 VI f 19\n Player Position Status  ......................  4 VI c 19\n Ball Position Status  ..........................  4 VI d, e, & g 19\nBALL\n Dead Ball  .........................................  6 IV a 26\n Jump—Center Circle  .......................  6 V  26\n Jump—Free Throw Circle  ...............  6 VI  27', metadata={'source': 'data/nba-rules-2023.pdf', 'page': 1}),
 Document(page_content='Jump—Free Throw Circle  ...............  6 VI  27\n Live Ball  ..........................................  6 II  26\n Putting in Play  ...........

In [26]:
import shutil

from langchain.vectorstores import Chroma

# Directory where Chroma persists the index on disk.
persist_directory = './data/chroma/'

# Remove database files left over from a previous run, if any.
# shutil.rmtree works on every platform (the shell `rm -rf` needs a Unix
# shell) and takes the same path variable that is handed to Chroma, so the
# directory that is cleaned is always the directory that is written.
# ignore_errors=True keeps this a no-op when the directory does not exist.
# You may still need to restart the notebook to reset in-memory state.
shutil.rmtree(persist_directory, ignore_errors=True)

# Calculate embeddings for every chunk and store them in the database.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [27]:
print(vectordb._collection.count())

571


In [28]:
question = "what's a foul in basketball?"
# Fetch the 3 stored chunks that best match the question.
# Note: this rebinds `docs`, which earlier held the CharacterTextSplitter output.
docs = vectordb.similarity_search(question, k=3)
docs

[Document(page_content='personal\tfoul\tis\tcommitted\t on\tany\toffensive\t player\tduring\this\tteam’s\ttransition\t scoring\t\nopportunity; (ii) when the foul occurs, the ball is ahead the tip of the circle in the \nbackcourt,\t no\tdefender\t is\tahead\tof\tthe\toffensive\t player\twith\tthe\tscoring\topportunity \t\nand\tthat\toffensive\t player\tis\tin\tcontrol\tof\tthe\tball\tor\ta\tpass\tto\thim\thas\tbeen\treleased;\t and\t\n(iii)\tthe\tdefensive\t foul\tdeprives\tthe\toffensive\t team\tof\ta\ttransition\t scoring\topportunity.', metadata={'page': 44, 'source': 'data/nba-rules-2023.pdf'}),
 Document(page_content='(1) When the foul was called, the ball is ahead of the tip of the circle in the backcourt, \nno\tdefender\t is\tahead\tof\tthe\toffensive\t player\twith\tthe\tscoring\topportunity\t and\tthat\t\noffensive\tplayer\tis\tin\tcontrol\tof\tthe\tball\tor\ta\tpass\tto\thim\thas\tbeen\treleased.\n(2) The\tfoul\tdeprived\tthe\toffensive\tteam\tof\ta\ttransition\tscoring\toppor

# Q&A


In [29]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model with temperature 0, used by the QA chains in this section.
llm = ChatOpenAI(temperature=0, model_name=llm_name)

# Retrieval QA chain backed by the NBA rules vector store; no custom prompt
# is supplied here.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

In [30]:
# Ask the chain a question; the answer text is read from the "result" key.
result = qa_chain.invoke({"query": "what's a foul in basketball?"})
# Alternative question to try:
# result = qa_chain.invoke({"query": "how long is the overtime?"})

result["result"]

'A foul in basketball is an infraction of the rules concerning illegal physical contact with an opponent. It can vary from personal fouls involving physical contact during the play, to technical fouls typically for unsportsmanlike conduct, and other specific types such as offensive fouls, flagrant fouls, and loose ball fouls. Each type of foul can affect the game differently, leading to free throws, possession changes, or other penalties depending on the nature of the foul and the rules of the specific basketball league.'

In [31]:
from langchain.prompts import PromptTemplate

# Build prompt
# {context} and {question} are template placeholders; the resulting prompt is
# handed to the QA chain through chain_type_kwargs={"prompt": ...}.
template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Rebuild the chain with the custom prompt, and keep the retrieved chunks in
# the response so they can be inspected under "source_documents".
# (Pass verbose=True as well to trace the chain while debugging.)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

# Ask a question and display only the answer text.
result = qa_chain.invoke({"query": "what is goaltending?"})
result["result"]

'Goaltending is a violation in basketball that occurs when a defensive player illegally interferes with a shot on its downward flight towards the basket, or when any player touches the ball after it has touched the backboard while still having a chance to score. This rule is intended to prevent players from altering the natural trajectory of a shot as it heads towards the goal. Thanks for asking!'

In [32]:
result["source_documents"]

[Document(page_content='goal line prior to the attempt.\n(2) The\tshooter\tmay\tnot\tbe\ttouching\t the\tfloor\ton\tor\tinside\tthe\tthree-point\t field\t \ngoal line.\n(3) The\tshooter\tmay\tcontact\tthe\tthree-point\t field\tgoal\tline,\tor\tland\tin\tthe\ttwo-point \t\nfield\tgoal\tarea,\tafter\tthe\tball\tis\treleased.\nd. A\tfield\tgoal\taccidentally \tscored\tin\tan\topponent’s \tbasket\tshall\tbe\tadded\tto\tthe\t\nopponent’s score, credited to the opposing player nearest the player whose actions caused the \nball to enter the basket.', metadata={'page': 20, 'source': 'data/nba-rules-2023.pdf'}),
 Document(page_content='when the player’s shoulders start upward). It is not essential that the ball leave the shooter’s \nhand. His arm(s) might be held so that he cannot actually make an attempt.\nThe\tterm\tis\talso\tused\tto\tinclude\tthe\tflight\tof\tthe\tball\tuntil\tit\tbecomes\tdead\tor\tis\ttouched  \nby\ta\tplayer.\tA\ttap\tduring\ta\tjump\tball\tor\trebound\tis\tnot\tconsider

# Retrieval

In [33]:
from langchain.vectorstores import Chroma

# Four short texts for the retrieval demos. The first two are identical on
# purpose, so that plain similarity search returns a duplicate.
texts = [
    """Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry""",
    """Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry""",
    """Five Forces include: Competitive Rivalry, Supplier Power, Buyer Power, Threat of Substitution, and Threat of New Entry.""",
    """The Four C Model of Creativity categorizes creative expression into four levels: mini-c (personal creative learning), little-c (creativity appreciated by others), Pro-c (professional-level creativity), and Big-C (historically significant creativity). This model emphasizes creativity as a lifelong process, important at all stages of personal and professional development.""",
]

# Build a small in-memory store (no persist_directory) holding only these texts.
# The cell previously opened the persisted NBA store into `small_db` first and
# then overwrote it on the next assignment; that dead assignment is removed.
# Stable ids make a re-run of this cell write to the same four entries rather
# than appending four more to the in-memory collection.
small_db = Chroma.from_texts(
    texts,
    embedding=embedding,
    ids=[f"demo-text-{i}" for i in range(len(texts))],
)

# Expect 4: the store is in-memory and contains only the texts added above.
print(small_db._collection.count())


4


In [34]:
question = "Tell me about porter's five forces model"

# Plain similarity search ranks only by closeness to the query, so the two
# identical Porter texts both appear in the top 3.
small_db.similarity_search(question, k=3)  # duplicates

[Document(page_content="Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry"),
 Document(page_content="Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry"),
 Document(page_content='Five Forces include: Competitive Rivalry, Supplier Power, Buyer Power, Threat of Substitution, and Threat of New Entry.')]

In [35]:
# Addressing diversity with maximum marginal relevance (MMR): the duplicate is gone.
# fetch_k=4 candidates are considered and k=3 of them are returned.
small_db.max_marginal_relevance_search(question, k=3, fetch_k=4)

[Document(page_content="Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry"),
 Document(page_content='Five Forces include: Competitive Rivalry, Supplier Power, Buyer Power, Threat of Substitution, and Threat of New Entry.'),
 Document(page_content='The Four C Model of Creativity categorizes creative expression into four levels: mini-c (personal creative learning), little-c (creativity appreciated by others), Pro-c (professional-level creativity), and Big-C (historically significant creativity). This model emphasizes creativity as a lifelong process, important at all stages of personal and professional development.')]

In [36]:
# Switch topics: only one of the four texts is about creativity, so the second
# hit is simply the next-closest text in the store.
question = "Tell me about creativity models"
small_db.similarity_search(question, k=2)

[Document(page_content='The Four C Model of Creativity categorizes creative expression into four levels: mini-c (personal creative learning), little-c (creativity appreciated by others), Pro-c (professional-level creativity), and Big-C (historically significant creativity). This model emphasizes creativity as a lifelong process, important at all stages of personal and professional development.'),
 Document(page_content="Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry")]

In [37]:
# Each hit is returned with a distance score: the smaller, the more similar.
# NOTE(review): this was labelled cosine distance, but Chroma's default metric
# is squared L2 unless the collection sets "hnsw:space" — confirm against the
# installed version.
small_db.similarity_search_with_score(question, k=2)

[(Document(page_content='The Four C Model of Creativity categorizes creative expression into four levels: mini-c (personal creative learning), little-c (creativity appreciated by others), Pro-c (professional-level creativity), and Big-C (historically significant creativity). This model emphasizes creativity as a lifelong process, important at all stages of personal and professional development.'),
  0.6910927295684814),
 (Document(page_content="Michael Porter's five-force strategic analysis model, introduced in a 1979 article published in the Harvard Business Review, remains a fundamental tool for strategic analysts plotting the competitive landscape of an industry"),
  1.3747798204421997)]