In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load variables from the local .env file into the environment; the return
# value is not needed, so it is bound to the throwaway name `_`.
_ = load_dotenv(find_dotenv())

# Indexing os.environ (rather than .get) raises KeyError if the key is absent.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
import datetime

# Before the 2023-09-02 cutoff the "-0301" model name is selected; on or after
# that date the unversioned "gpt-3.5-turbo" name is used.
current_date = datetime.datetime.now().date()
llm_name = (
    "gpt-3.5-turbo-0301"
    if current_date < datetime.date(2023, 9, 2)
    else "gpt-3.5-turbo"
)
print(llm_name)

gpt-3.5-turbo


In [3]:
!pip show langchain

Name: langchain
Version: 0.0.345
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /scratch/qualis/miniconda3/envs/langchain2/lib/python3.10/site-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
# Open the Chroma vector store located at `persist_directory`, with OpenAI
# embeddings as its embedding function.
persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [5]:
# Count the entries in the store's underlying collection. Note this goes
# through the wrapper's private `_collection` attribute.
stored_count = vectordb._collection.count()
print(stored_count)

209


In [6]:
# Retrieve the three stored chunks most similar to the question and confirm
# how many came back.
question = "What are major topics for this class?"
docs = vectordb.similarity_search(question, k=3)
len(docs)

3

In [7]:
# Display the retrieved Document objects (page_content plus metadata).
docs

[Document(page_content="statistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but didn't have time in the main \nlectures for.", metadata={'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}),
 Document(page_content="statistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but didn't have time in the main \nlectures f

In [8]:
from langchain.chat_models import ChatOpenAI
# Chat model used by every chain below; the model name comes from `llm_name`.
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_name,
)

In [9]:
from langchain.chains import RetrievalQA

In [10]:
# Retrieval QA chain over the vector store; no chain_type is passed, so the
# library default is used.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [11]:
# The chain takes its input under the "query" key.
result = qa_chain(dict(query=question))

In [12]:
# Display the full response dict (the 'query' echoed back plus the 'result').
result

{'query': 'What are major topics for this class?',
 'result': 'The major topics for this class are machine learning and its various applications.'}

In [13]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template is assembled from adjacent string literals; {context} and
# {question} are the two placeholders filled in when the prompt is formatted.
template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Use three sentences maximum. "
    "Keep the answer as concise as possible. "
    'Always say "thanks for asking!" at the end of the answer. \n'
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [14]:
# Display the PromptTemplate to check its input variables and template text.
QA_CHAIN_PROMPT

PromptTemplate(input_variables=['context', 'question'], template='Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. \n{context}\nQuestion: {question}\nHelpful Answer:')

In [15]:
# Run chain
# Rebuild the QA chain with the custom prompt and ask it to return the
# retrieved source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

In [16]:
# Ask the current question through the prompt-customised chain and show the
# whole response.
result = qa_chain(dict(query=question))
result

{'query': 'What are major topics for this class?',
 'result': 'The major topics for this class are machine learning and its extensions. Thanks for asking!',
 'source_documents': [Document(page_content="statistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'm teaching in the main lectur es. So machine learning is a huge field, and \nthere are a few extensions that we really want  to teach but didn't have time in the main \nlectures for.", metadata={'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'}),
  Document(page_content="statistics for a while or maybe algebra, we'll go over those in the discussion sections as a \nrefresher for those of you that want one.  \nLater in this quarter, we'll also use the disc ussion sections to go over extensions for the \nmaterial that I'

In [17]:
# New question; the following cells run it through the different chains.
question = "Is probability a class topic?"

In [18]:
# Send the updated question through the current qa_chain.
result = qa_chain(dict(query=question))

In [19]:
# Display the full response, including the returned source documents.
result

{'query': 'Is probability a class topic?',
 'result': 'Yes, probability is a topic that will be covered in the class. Thanks for asking!',
 'source_documents': [Document(page_content="of this class will not be very program ming intensive, although we will do some \nprogramming, mostly in either MATLAB or Octa ve. I'll say a bit more about that later.  \nI also assume familiarity with basic proba bility and statistics. So most undergraduate \nstatistics class, like Stat 116 taught here at Stanford, will be more than enough. I'm gonna \nassume all of you know what ra ndom variables are, that all of you know what expectation \nis, what a variance or a random variable is. And in case of some of you, it's been a while \nsince you've seen some of this material. At some of the discussion sections, we'll actually \ngo over some of the prerequisites, sort of as  a refresher course under prerequisite class. \nI'll say a bit more about that later as well.  \nLastly, I also assume familiarity with

In [20]:
# Show only the answer text from the response dict.
result["result"]

'Yes, probability is a topic that will be covered in the class. Thanks for asking!'

In [21]:
# Same LLM and retriever, but using the "map_reduce" chain type.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectordb.as_retriever(),
)

In [22]:
# Run the current question through the map_reduce chain.
result = qa_chain_mr(dict(query=question))

In [23]:
# Display the map_reduce chain's response dict.
result

{'query': 'Is probability a class topic?',
 'result': 'Based on the provided information, it is not clear whether probability is a specific topic covered in the class.'}

In [24]:
# Same LLM and retriever, but using the "refine" chain type; run it on the
# current question and show the answer text.
qa_chain_rf = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=vectordb.as_retriever(),
)
result = qa_chain_rf(dict(query=question))
result["result"]

'Based on the additional context provided, it is still not explicitly mentioned whether probability is a specific class topic. The instructor mentions going over statistics and algebra in the discussion sections as a refresher, but it is not clear if probability is included in this review. Additionally, the instructor mentions using the discussion sections to cover extensions for the material taught in the main lectures, but it is not specified if probability is one of these extensions. Therefore, the original answer still stands.'

In [25]:
# Display the refine chain's full response dict.
result

{'query': 'Is probability a class topic?',
 'result': 'Based on the additional context provided, it is still not explicitly mentioned whether probability is a specific class topic. The instructor mentions going over statistics and algebra in the discussion sections as a refresher, but it is not clear if probability is included in this review. Additionally, the instructor mentions using the discussion sections to cover extensions for the material taught in the main lectures, but it is not specified if probability is one of these extensions. Therefore, the original answer still stands.'}

In [26]:
# Same LLM and retriever, but using the "map_rerank" chain type; run it on the
# current question and show the answer text.
qa_chain_maprerank = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_rerank",
    retriever=vectordb.as_retriever(),
)
result = qa_chain_maprerank(dict(query=question))
result["result"]



'Yes, probability is a class topic. The instructor assumes familiarity with basic probability and statistics. '

In [27]:
# Ask the same question a second time with the map_rerank chain built in the
# previous cell. Constructing an identical chain again is unnecessary, so the
# existing `qa_chain_maprerank` is reused.
result = qa_chain_maprerank({"query": question})
result["result"]



'Yes, probability is a class topic. The speaker assumes familiarity with basic probability and statistics. They mention that most undergraduate statistics classes will be more than enough. They also mention that they will go over some of the prerequisites in the discussion sections as a refresher course. Therefore, probability is a topic that will be covered in the class.'

In [28]:
# Reset qa_chain to a plain RetrievalQA chain: no custom prompt, no source
# documents, chain_type left at the library default.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [29]:
# First turn: ask the question through the plain RetrievalQA chain and show
# the answer text.
question = "Is probability a class topic?"
result = qa_chain(dict(query=question))
result["result"]

'Yes, probability is a topic that will be covered in this class. The instructor assumes familiarity with basic probability and statistics, so it is expected that students have prior knowledge of random variables, expectation, variance, and other related concepts.'

In [30]:
# Follow-up question sent to the same RetrievalQA chain, which was built
# without a memory object. The query spelling is corrected to "prerequisites".
question = "why are those prerequisites needed?"
result = qa_chain({"query": question})
result["result"]

'The prerequisites are needed because they provide the foundational knowledge and skills necessary to understand and apply the concepts and techniques taught in the class. \n\nBasic knowledge of computer science and computer skills is important because machine learning algorithms often involve programming and working with data. Understanding big-O notation is essential for analyzing the efficiency and scalability of algorithms.\n\nFamiliarity with probability and statistics is necessary because machine learning involves making predictions and analyzing data, which requires understanding concepts such as random variables, expectation, and variance.\n\nBasic knowledge of linear algebra is important because many machine learning algorithms involve manipulating matrices and vectors. Understanding concepts such as matrix multiplication, matrix inverse, and eigenvectors is crucial for working with these algorithms.\n\nOverall, these prerequisites ensure that students have the necessary backg

In [31]:
from langchain.memory import ConversationBufferMemory
# Conversation memory stored under the "chat_history" key, returned as
# message objects (return_messages=True).
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

In [32]:
from langchain.chains import ConversationalRetrievalChain
# Conversational retrieval chain wired to the vector-store retriever and the
# conversation memory object.
retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [33]:
# First turn of the conversation; this chain takes its input under the
# "question" key.
question = "Is probability a class topic?"
result = qa(dict(question=question))

In [34]:
# The conversational chain's reply is stored under the 'answer' key.
result['answer']

'Yes, probability is a topic that will be covered in this class. The instructor assumes familiarity with basic probability and statistics, so it is expected that students have prior knowledge of random variables, expectation, variance, and other related concepts.'

In [35]:
# Follow-up turn sent to the conversational chain, which holds the memory
# object.
question = "why are those prior knowledge needed?"
result = qa(dict(question=question))

In [36]:
# Show the reply to the follow-up question.
result['answer']

'Prior knowledge of random variables, expectation, variance, and other related concepts is needed because these concepts are fundamental to understanding and working with probability and statistics. In machine learning and data analysis, these concepts are used to model and analyze data, make predictions, and evaluate the performance of algorithms. Without a solid understanding of these concepts, it would be difficult to interpret and analyze data, design and implement machine learning algorithms, and make informed decisions based on the results.'

In [37]:
# Another follow-up sent to the conversational chain (`qa`), which was built
# with the memory object. The query spelling is corrected to "prerequisites".
question = "why are those prerequisites needed?"
result = qa({"question": question})
result['answer']

'Prior knowledge of random variables, expectation, variance, and other related concepts is needed because these concepts are fundamental to understanding and working with probability and statistics. In machine learning and data analysis, these concepts are used to model and analyze data, make predictions, and evaluate the performance of algorithms. Without a solid understanding of these concepts, it would be difficult to interpret and analyze data, design effective algorithms, and make informed decisions based on the results.'