In [45]:
import os
import openai
import sys

# Minimal dotenv-style loader: collect KEY=value pairs from the local .env file.
env = {}

with open('.env') as f:
    for lineno, raw_line in enumerate(f, start=1):
        line = raw_line.strip()
        # Blank lines and "#" comments carry no settings.
        if not line or line.startswith('#'):
            continue
        # Split on the FIRST "=" only, so values that themselves contain "="
        # (for example base64-style padding) are kept intact.
        key, sep, value = line.partition('=')
        if not sep:
            # Same exception type the old tuple-unpacking raised, but with a
            # location. The line content is not echoed: it may hold a secret.
            raise ValueError(f".env line {lineno}: expected KEY=value")
        key, value = key.strip(), value.strip()
        # Drop one pair of matching surrounding quotes: KEY="value" -> value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        env[key] = value

# The same key is exposed as a module-level constant and set on the openai module.
OPENAI_API_KEY = env['OPEN_AI_KEY']
openai.api_key = OPENAI_API_KEY

In [2]:
pip install lark

Collecting lark
  Downloading lark-1.1.9-py3-none-any.whl.metadata (1.9 kB)
Downloading lark-1.1.9-py3-none-any.whl (111 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Directory handed to Chroma as `persist_directory` when the vector store is opened.
persist_directory = "docs/chroma/"

In [5]:
# Embedding object; it is also reused by the later cells that build stores/retrievers from raw texts.
embedding = OpenAIEmbeddings(openai_api_key=env['OPEN_AI_KEY'])

# Open the Chroma store backed by `persist_directory`, using that embedding function.
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [6]:
# Sanity check: number of entries in the store's underlying collection
# (read through the private `_collection` attribute).
print(vectordb._collection.count())

154


In [7]:
# Three short sample sentences about the same mushroom, used to build a tiny demo store.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [8]:
# Build a second, small vector store directly from the sample `texts`,
# reusing the same embedding object (no persist directory is passed here).
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [9]:
# Query for the small demo store; it mentions both "all-white" and "large fruiting bodies".
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [10]:
# Plain similarity search on the small store, limited to 2 results.
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

In [11]:
# Max-marginal-relevance search on the same store: k=2 results chosen from fetch_k=3 candidates.
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [12]:
# Switch to the persisted lecture store: new question, top 3 hits by plain similarity search.
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question, k=3)

In [13]:
# Preview the first 100 characters of the top similarity-search hit.
docs_ss[0].page_content[:100]

'Octave Tutorial \nComputing on data \nMachine Learning'

In [14]:
# Preview the first 100 characters of the second similarity-search hit.
docs_ss[1].page_content[:100]

'Octave Tutorial \nVectorial implementation \nMachine Learning'

In [15]:
# Same question and k=3 as the similarity search, but selected with max-marginal-relevance.
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

In [16]:
# Preview the first 100 characters of the top MMR hit.
docs_mmr[0].page_content[:100]

'Octave Tutorial \nComputing on data \nMachine Learning'

In [17]:
# Preview the first 100 characters of the second MMR hit.
docs_mmr[1].page_content[:100]

'Andrew\t\r \xa0Ng\t\r \xa0Matrix\t\r \xa0Addi4on'

In [18]:
# Question that names a specific lecture; the next search restricts results via a metadata filter.
question = "what did they say about regression in the third lecture?"

In [19]:
# Top 3 hits, restricted by metadata filter to chunks whose `source` is the Lecture 3 PDF.
docs = vectordb.similarity_search(question, k=3, filter={"source": "docs/Lecture3.pdf"})

In [20]:
# Show the metadata dict of every hit, one per line.
for doc in docs:
    print(doc.metadata)

{'page': 18, 'source': 'docs/Lecture3.pdf'}
{'page': 0, 'source': 'docs/Lecture3.pdf'}
{'page': 8, 'source': 'docs/Lecture3.pdf'}


In [21]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [22]:
# Descriptions of the two metadata fields (`source`, `page`) passed to the self-query retriever.
metadata_field_info = [
    AttributeInfo(
        name="source",
        type="string",
        description="The lecture the chunk is from, should be one of `docs/Lecture1.pdf`, `docs/Lecture2.pdf`, or `docs/Lecture3.pdf`",
    ),
    AttributeInfo(name="page", type="integer", description="The page from the lecture"),
]

In [24]:
pip install -U langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.1.14-py3-none-any.whl.metadata (2.5 kB)
Downloading langchain_openai-0.1.14-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.1.14

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
from langchain_openai import OpenAI

In [47]:
# Short description of the stored documents, handed to SelfQueryRetriever.from_llm.
document_content_description = "Lecture notes"

# Completion model run at temperature 0.
llm = OpenAI(
    openai_api_key=env['OPEN_AI_KEY'],
    model='gpt-3.5-turbo-instruct',
    temperature=0,
)

# Self-query retriever over `vectordb`, given the metadata field descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm, vectordb, document_content_description, metadata_field_info, verbose=True
)

In [28]:
# Lecture-specific question, this time to be answered through the self-query retriever.
question = "what did they say about regression in the third lecture?"

In [29]:
# Use `invoke` rather than the deprecated `get_relevant_documents`,
# which triggers a deprecation warning on every call.
docs = retriever.invoke(question)

  warn_deprecated(


In [30]:
# Show the metadata dict of every document the retriever returned, one per line.
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': 'docs/Lecture3.pdf'}
{'page': 22, 'source': 'docs/Lecture3.pdf'}
{'page': 18, 'source': 'docs/Lecture3.pdf'}
{'page': 8, 'source': 'docs/Lecture3.pdf'}


In [31]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [32]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document N:`` (N starting at 1), a blank line,
    then the document's ``page_content``. Consecutive entries are separated by
    a line of 100 dashes. Everything is written with a single ``print`` call.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append("Document " + str(number) + ":\n\n" + doc.page_content)
    print(divider.join(sections))


In [46]:
# Temperature-0 instruct model used to build the document compressor.
llm = OpenAI(
    model="gpt-3.5-turbo-instruct",
    temperature=0,
    openai_api_key=env['OPEN_AI_KEY'],
)
compressor = LLMChainExtractor.from_llm(llm)

In [34]:
# Wrap the vector store's default retriever so its results pass through `compressor`.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [35]:
question = "what did they say about matlab?"
# Use `invoke` rather than the deprecated `get_relevant_documents`,
# which triggers a deprecation warning on every call.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

Machine Learning
----------------------------------------------------------------------------------------------------
Document 2:

Machine Learning


In [36]:
# Rebuild the compression retriever, this time with the base retriever's search_type set to "mmr".
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [37]:
question = "what did they say about matlab?"
# Use `invoke` rather than the deprecated `get_relevant_documents`,
# which triggers a deprecation warning on every call.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

Machine Learning


In [38]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [39]:
# Load the first lecture PDF and merge the text of all its pages into one space-joined string.
loader = PyPDFLoader("docs/Lecture1.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split the merged text into chunks (chunk_size=1500, chunk_overlap=150).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)
splits = text_splitter.split_text(joined_page_text)


In [41]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading 

In [42]:
# Two retrievers built straight from the text chunks:
# the SVM one is given the embedding object, the TF-IDF one only the texts.
svm_retriever = SVMRetriever.from_texts(splits, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [43]:
question = "What are major topics for this class?"
# Use `invoke` rather than the deprecated `get_relevant_documents`,
# which triggers a deprecation warning on every call.
docs_svm = svm_retriever.invoke(question)
# Display the top-ranked document as the cell output.
docs_svm[0]

Document(page_content="Astronomical data analysis Market segmentation \n Andrew NgCocktail party problem \nMicrophone #1 \nMicrophone #2 Speaker #1 \nSpeaker #2 \n Andrew Ng [Audio clips courtesy of Te-Won Lee.] Microphone #1: Microphone #2: Microphone #1: Microphone #2: Output #1: Output #2: \nOutput #1: Output #2: \n Andrew NgCocktail party problem algorithm \n[W,s,v] = svd((repmat(sum(x.*x,1),size(x,1),1).*x)* x'); \n[Source: Sam Roweis, Yair Weiss & Eero Simoncelli] Of the following examples, which would you address using an \nunsupervised learning algorithm?  (Check all that ap ply.) \nGiven a database of customer data, automatically di scover market \nsegments and group customers into different market segments. Given email labeled as spam/not spam, learn a spam filter. \nGiven a set of news articles found on the web, grou p them into \nset of articles about the same story. Given a dataset of patients diagnosed as either hav ing diabetes or \nnot, learn to classify new patients as

In [44]:
question = "what did they say about matlab?"
# Use `invoke` rather than the deprecated `get_relevant_documents`,
# which triggers a deprecation warning on every call.
docs_tfidf = tfidf_retriever.invoke(question)
# Display the top-ranked document as the cell output.
docs_tfidf[0]

Document(page_content='spam.  What is the task T in this setting? “A computer program is said to learn from experience E with respect to \nsome task T and some performance measure P , if its performance on T, \nas measured by P , improves with experience E.” \n Classifying emails as spam or not spam. Watching you label emails as spam or not spam. The number (or fraction) of emails correctly classi fied as spam/not spam. \nNone of the above—this is not a machine learning pr oblem. Suppose your email program watches which emails you  do or do \nnot mark as spam, and based on that learns how to b etter filter \nspam.  What is the task T in this setting? “A computer program is said to learn from experience E with respect to \nsome task T and some performance measure P , if its performance on T, \nas measured by P , improves with experience E.”  Andrew NgMachine learning algorithms: - Supervised learning - Unsupervised learning Others: Reinforcement learning, recommender \nsystems. \nAlso t