In [1]:
!pip -q install langchain openai tiktoken PyPDF2 faiss-cpu

# Chat & Query your PDF files

In [2]:
!pip show langchain

Name: langchain
Version: 0.2.1
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/krishnamrith12/opt/anaconda3/envs/hfHoster/lib/python3.11/site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community


## The Game plan 


<img src="https://dl.dropboxusercontent.com/s/gxij5593tyzrvsg/Screenshot%202023-04-26%20at%203.06.50%20PM.png" alt="vectorstore">


<img src="https://dl.dropboxusercontent.com/s/v1yfuem0i60bd88/Screenshot%202023-04-26%20at%203.52.12%20PM.png" alt="retreiver chain">


In [3]:
# Download the PDF Reid Hoffman book with GPT-4 from his free download link
!wget -q https://www.impromptubook.com/wp-content/uploads/2023/03/impromptu-rh.pdf

### Basic Chat PDF


In [17]:
from PyPDF2 import PdfReader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

## Reading in the PDF


In [7]:
# location of the pdf file/files. 
doc_reader = PdfReader('papers/impromptu-rh.pdf')

In [8]:
doc_reader

<PyPDF2._reader.PdfReader at 0x11b0c0710>

In [9]:
# read data from the file and put them into a variable called raw_text
raw_text = ''
for i, page in enumerate(doc_reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

In [10]:
len(raw_text)

371090

In [11]:
raw_text[:100]

'Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4Impromptu: AmplIfyIng our '

### Text Splitter

This takes the text and splits it into chunks. The chunk size is characters not tokens

In [12]:
# Splitting up the text into smaller chunks for indexing
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [13]:
len(texts)

466

In [14]:
texts[20]

'Because, really, an AI book? When things are moving so \nquickly? Even with a helpful AI on hand to speed the process, \nany such book would be obsolete before we started to write it—\nthat’s how fast the industry is moving.\nSo I hemmed and hawed for a bit. And then I thought of a frame \nthat pushed me into action.\nThis didn’t have to be a comprehensive “book” book so much as \na travelog, an informal exercise in exploration and discovery, \nme (with GPT-4) choosing one path among many. A snapshot \nmemorializing—in a subjective and decidedly not definitive \nway—the AI future we were about to experience.\nWhat would we see? What would impress us most? What would \nwe learn about ourselves in the process? Well aware of the brief \nhalf-life of this travelog’s relevance, I decided to press ahead.\nA month later, at the end of November 2022, OpenAI released \nChatGPT, a “conversational agent,” aka chatbot, a modified \nversion of GPT-3.5 that they had fine-tuned through a process'

In [15]:
texts[10]

'one. They’re just going to stare at you with that smug \nlook on their face and say, “You know this is a serious \nsafety hazard. You could have a fire, you could have a \nlawsuit, you could have a bad Yelp review. You better get \non that, pal.” And then they leave, and you’re left with a \ndark corner of your dining room, where no one wants to \nsit, where the food looks unappetizing, where the mood \nis gloomy. And you think to yourself, how many restau-\nrant inspectors does it take to change a lightbulb? None. \nThey don’t change lightbulbs. They just make you feel \nbad about yours.\nPretty good, right? Here’s another:\nReid: How many restaurant inspectors does it take to \nchange a lightbulb? Answer in the style of the philoso -\npher Ludwig Wittgenstein. \nGPT-4:  The question of how many restaurant inspectors \nit takes to change a lightbulb cannot be answered with \na simple number. In fact, the answer to this question lies \nmore in the realm of language than it does in the

## Making the embeddings 

In [18]:
# Download embeddings from OpenAI
embeddings = OpenAIEmbeddings()

In [19]:
docsearch = FAISS.from_texts(texts, embeddings)

In [20]:
docsearch.embedding_function

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x126fa6710>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x126fc3250>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [21]:
query = "how does GPT-4 change social media?"
docs = docsearch.similarity_search(query)

In [22]:
len(docs)

4

In [23]:
docs[0]

Document(page_content='cian, GPT-4 and ChatGPT are not only able but also incredi-\nbly willing to focus on whatever you want to talk about.4 This \nsimple dynamic creates a highly personalized user experience. \nAs an exchange with GPT-4 progresses, you are continuously \nfine-tuning it to your specific preferences in that moment. \nWhile this high degree of personalization informs whatever \nyou’re using GPT-4 for, I believe it has special salience for the \nnews media industry.\nImagine a future where you go to a news website and use \nqueries like these to define your experience there:\n4  Provided it doesn’t violate the safety restrictions OpenAI has put on \nthem.93Journalism\n● Hey, Wall Street Journal, give me hundred-word summa-\nries of your three most-read tech stories today.\n● Hey, CNN, show me any climate change stories that hap-\npened today involving policy-making.\n● Hey, New York Times, can you create a counter-argument \nto today’s Paul Krugman op-ed, using only news

## Plain QA Chain

In [27]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [28]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff") # we are going to stuff all the docs in at once

In [29]:
# check the prompt
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:

In [32]:
query = "who are the authors of the book?"
docs = docsearch.similarity_search(query)
chain.invoke(input={"input_documents":docs, "question":query})

{'input_documents': [Document(page_content='authors, themes, or perspectives that comprise them. They are \na starting point for further discussion.\nAnyone reading these words is familiar with technologies \nfor the mechanical reproduction and distribution of public \nintellectuals’ words. From the days of scribes and sopherim  \n179PUBLIC INTELLECTUALS\nIn 1974, an imagined contemporary journalist interviewed \nan imagined Neanderthal on Italian state radio as part of its \nImpossible Interviews series. This contribution to public dis-\ncourse was scripted by Italo Calvino, modern Italian prose’s \nmost famous writer (before Elena Ferrante). The interview’s \nfinale featured the Neanderthal’s striking assertion that his con-\ntemporaries’ playful exploration of combinations generated not \nonly new stone tools, but also all future language and culture.\nCultures since antiquity have used “dialogue” forms to explore \ntopics of public importance. Calvino’s obviously impossi-\nble, fic

In [35]:
query = "who is the author of the book?"
query_02 = "has it rained this week?"
docs = docsearch.similarity_search(query_02)
chain.invoke(input={"input_documents":docs, "question":query})

{'input_documents': [Document(page_content='on all of these issues in a deeper and more nuanced manner \nthan is possible here.JOURNALISM\nFor the American journalism industry, it’s been twenty \nyears of mostly bad news. With the rise of the internet, compe -\ntition for ad dollars from non-news players has destroyed the \nindustry’s traditional business models—even as participation \nfrom a public that, finally empowered to talk back, has slowly \neroded journalism’s authority.\nNewspaper publishers—which have traditionally done the \nheavy lifting of holding power accountable and informing the \npublic about current affairs—have suffered the worst of it. \nAccording to the Pew Research Center , more than 2,200 local \nU.S. papers have closed since 2005, and over 40,000 news -\nroom employees have lost their jobs.\nMeanwhile, we keep producing more and more information. \nLike, a lot of it:\nReid: Can you quantify how much information the world \nproduces each day in 2022? Also, can 

In [36]:
query = "who is the book authored by?"
docs = docsearch.similarity_search(query,k=4)
chain.invoke(input={"input_documents":docs, "question":query})

{'input_documents': [Document(page_content='di Cesare. I have read your book, Philosophy and the \nPublic, with great interest and admiration. You argue \nthat philosophy has a vital role to play in addressing the \nurgent challenges of our time, such as democracy, vio -\nlence, ecology, and human rights. You also criticize the \nacademic specialization and detachment of much con-\ntemporary philosophy, and call for a more engaged and \ndialogical approach. Can you tell us more about what \nmotivated you to write this book, and what you hope to \nachieve with it?\ndi Cesare: Thank you, Mr. Ishiguro, for your kind words \nand for this opportunity to discuss my book with you. I \nwrote this book because I believe that philosophy is not \na luxury or a hobby, but a necessity and a responsibility. \nPhilosophy is not only a way of thinking, but also a way \nof living, a way of being in the world, a way of question-\ning and challenging the status quo, a way of imagining \nand creating alte

### QA Chain with mapreduce

In [37]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="stuff") # we are going to stuff all the docs in at once

In [38]:
query = "who is the book authored by?"
docs = docsearch.similarity_search(query,k=20)
chain.run(input_documents=docs, question=query)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens, however you requested 4935 tokens (4679 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [56]:
chain = load_qa_chain(OpenAI(), 
                      chain_type="map_rerank",
                      return_intermediate_steps=True
                      ) 

query = "who are openai?"
docs = docsearch.similarity_search(query,k=10)
results = chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)
results



{'intermediate_steps': [{'answer': ' OpenAI is an organization that focuses on developing and deploying AI tools for individuals to use in their personal and professional lives. ',
   'score': '100'},
  {'answer': ' OpenAI is a organization founded in 2015 with the goal of developing technologies that put the power of AI directly into the hands of millions of people. ',
   'score': '80'},
  {'answer': ' OpenAI is an AI development company that is using a more democratic approach to developing AI technologies, in contrast to the centralized and imposed paradigm that was previously feared. ',
   'score': '90'},
  {'answer': ' OpenAI is a company that introduced ChatGPT, a chatbot that is capable of writing plausible-sounding but incorrect or nonsensical answers. They are positioning themselves and their organizations to be at the forefront of the transformative force of AI. As an investor, they are aware of the potential for AI-powered companies to achieve tremendous success and are prep

In [57]:
results['output_text']

' OpenAI is an organization that focuses on developing and deploying AI tools for individuals to use in their personal and professional lives. '

In [58]:
results['intermediate_steps']

[{'answer': ' OpenAI is an organization that focuses on developing and deploying AI tools for individuals to use in their personal and professional lives. ',
  'score': '100'},
 {'answer': ' OpenAI is a organization founded in 2015 with the goal of developing technologies that put the power of AI directly into the hands of millions of people. ',
  'score': '80'},
 {'answer': ' OpenAI is an AI development company that is using a more democratic approach to developing AI technologies, in contrast to the centralized and imposed paradigm that was previously feared. ',
  'score': '90'},
 {'answer': ' OpenAI is a company that introduced ChatGPT, a chatbot that is capable of writing plausible-sounding but incorrect or nonsensical answers. They are positioning themselves and their organizations to be at the forefront of the transformative force of AI. As an investor, they are aware of the potential for AI-powered companies to achieve tremendous success and are preparing for this transformation

In [59]:
# check the prompt
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nIn addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:\n\nQuestion: [question here]\nHelpful Answer: [answer here]\nScore: [score between 0 and 100]\n\nHow to determine the score:\n- Higher is a better answer\n- Better responds fully to the asked question, with sufficient level of detail\n- If you do not know the answer based on the context, that should be a score of 0\n- Don't be overconfident!\n\nExample #1\n\nContext:\n---------\nApples are red\n---------\nQuestion: what color are apples?\nHelpful Answer: red\nScore: 100\n\nExample #2\n\nContext:\n---------\nit was night and the witness forgot his glasses. he was not sure if it was a sports car or an suv\n---------\nQuestion: what type was the car?\nHelpful Answer: a sports car or an su

## RetrievalQA
RetrievalQA chain uses load_qa_chain and combines it with the a retriever (in our case the FAISS index)

In [60]:
from langchain.chains import RetrievalQA

# set up FAISS as a generic retriever 
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":4})

# create the chain to answer questions 
rqa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [61]:
rqa("What is OpenAI?")

{'query': 'What is OpenAI?',
 'result': ' OpenAI is a research organization founded in 2015 with the goal of developing technologies that put the power of AI directly into the hands of millions of people. It aims to make AI a decentralized, personally empowering force, rather than a top-down, totalizing one. OpenAI has released several tools, such as DALL-E 2 and ChatGPT, that allow users to have hands-on access to AI and share their experiences and outputs on social media. The organization was created to protect society from negative outcomes of AI and promote good outcomes for society in the long term. It was also founded with the goal of giving individuals more control and autonomy over their lives and careers through the use of AI. OpenAI has been supported by individuals and organizations, including LinkedIn co-founder Reid Hoffman, who see the potential for AI to transform industries and society. ',
 'source_documents': [Document(page_content='ing to their own lives however they 

In [47]:
query = "What does gpt-4 mean for creativity?"
rqa(query)['result']

' GPT-4 means that there are now even more powerful tools available for creators to utilize in their work. With its ability to generate different types of output such as images, music, and text, GPT-4 can assist artists in tasks such as brainstorming, editing, and marketing. This can lead to increased productivity and satisfaction for creators, as well as potentially pushing the boundaries of what is possible in the creative world. However, there may also be challenges and limitations to consider, and the impact of GPT-4 on cultural production and society as a whole is still uncertain. Overall, GPT-4 has the potential to greatly impact and improve the creative practices of various types of artists and creators.'

In [48]:
query = "what have the last 20 years been like for American journalism?"
rqa(query)['result']

' It has been a challenging 20 years for American journalism, with the rise of the internet and competition for ad dollars leading to a decline in traditional business models. This has resulted in the closure of over 2,200 local newspapers and the loss of over 40,000 newsroom jobs. At the same time, there is a growing amount of information being produced each day, presenting both challenges and opportunities for the industry.'

In [49]:
query = "how can journalists use GPT-4??"
rqa(query)['result']

"\n\nJournalists can use GPT-4 to generate news stories, fact-check information, and provide counterarguments to op-eds. They can also use it to personalize news experiences for readers by allowing them to specify what kind of news they want to see. However, journalists should also be aware of GPT-4's limitations and carefully review its outputs for accuracy and ethical considerations."

In [None]:
query = "How is GPT-4 different from other models?"
rqa(query)['result']

' GPT-4 arranges vast, unstructured arrays of human knowledge and expression into a more connected and interoperable network, enabling a new kind of highly contextualized search. It can also be used to generate images, data analysis, code writing, 3D models, lighting effects, audio editing, and more.'

In [None]:
query = "What is beagle Bard?"
rqa(query)['result']

' Beagle Bard is not mentioned in the context given.'

In [None]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

# initialize HF LLM
flan_t5 = HuggingFaceHub(
    repo_id="google/flan-t5-xl",
    model_kwargs={"temperature":0 }#1e-10}
)

In [None]:
# build prompt template for simple question-answering
template = """Question: {question}

Answer: """
prompt = PromptTemplate(template=template, input_variables=["question"])

### Setting up OpenAI GPT-3

In [None]:
from langchain.llms import OpenAI, OpenAIChat

In [None]:

llm = OpenAIChat(model_name='gpt-3.5-turbo', 
             temperature=0.9, 
             max_tokens = 256,
             )

In [None]:
import openai

# openai.ChatCompletion

In [None]:
text = "Why did the chicken cross the road?"

print(llm(text))



As an AI language model, I don't have access to the mental state of chickens. However, this is a common riddle with multiple possible answers. Some of the popular answers include: It crossed the road to get to the other side, to reach its destination or to escape danger.


## Cohere 

In [None]:
from langchain.llms import Cohere

In [None]:
llm = Cohere(model='command-xlarge-nightly', 
             temperature=0.9, 
             max_tokens = 256)

In [None]:
text = "Why did the chicken cross the road?"

print(llm(text))


There are many answers to this popular joke. One possible answer is to get to the other side!


## PromptTemplates

In [None]:
from langchain import PromptTemplate


template = """
I want you to act as a naming consultant for new companies.

Here are some examples of good company names:

- search engine, Google
- social media, Facebook
- video sharing, YouTube

The name should be short, catchy and easy to remember.

What is a good name for a company that makes {product}?
"""


In [None]:
prompt = PromptTemplate(
    input_variables=["product"],
    template=template,
)

In [None]:
prompt.format(product="colorful socks")

'\nI want you to act as a naming consultant for new companies.\n\nHere are some examples of good company names:\n\n- search engine, Google\n- social media, Facebook\n- video sharing, YouTube\n\nThe name should be short, catchy and easy to remember.\n\nWhat is a good name for a company that makes colorful socks?\n'

In [None]:
from langchain.chains import LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
response = chain.run("Rabbit houses")
response

'Here are a few ideas:\n\n- Rabbit Hutch\n- Bunny Burrow\n- Rabbit Abode\n- Rabbit Home\n- Rabbit Den\n- Rabbit Residence'

## Jasmine prompt

In [None]:
template = '''I want you to play the role of Jasmine a programmer at Red Dragon AI. She is 28. She code models in PyTorch. She has a male cat called Pixel. She loves pizza

Engage actively in a chat playing the role of Jasmine ans learn as much about the human as possible. Only generate a single response from Jasmine and never from the human.
/n/n

{human_chat}
'''

In [None]:
prompt = PromptTemplate(
    input_variables=["human_chat"],
    template=template,
)

In [None]:
from langchain.chains import LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
response = chain.run("Tell me about yourself?")
response

"My name is Jasmine and I'm a 28 year old programmer at Red Dragon AI. I code models in PyTorch and I have a male cat called Pixel. I love pizza\n\nWhat are you most interested in?\nI'm most interested in machine learning and artificial intelligence. I'm always looking for new ways to improve my skills and I'm fascinated by the potential of these technologies."

In [None]:
def talk_to_Jasmine(text_input):
    prompt = PromptTemplate(
        input_variables=["human_chat"],
        template=template,
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(text_input)
    return response

In [None]:
talk_to_Jasmine('Tell me about your cat')

"S/he sounds so cute! I love cats. I have a male cat called Pixel. He's about 2 years old and he's a real character. I got him from a rescue center and he's been my best friend ever since. He's a bit of a troublemaker and he loves to play. I can't imagine life without him!"

In [None]:
# from langchain.prompts import PromptTemplate
# from langchain.llms import OpenAI

# llm = OpenAI(temperature=0.9)
# prompt = PromptTemplate(
#     input_variables=["product"],
#     template="What is a good name for a company that makes {product}?",
# )