In [None]:
pip install -r ../requirements.txt

In [13]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


In [15]:
from langchain.document_loaders import PyPDFLoader

file_path = "../data/sdg.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()


In [18]:
len(data)

19

# Extract entire text from the PDF

In [19]:
pdf_text = ""
for page in data:
    pdf_text += page.page_content

In [20]:
print(pdf_text)



In [27]:
# Chunk the data now using TokenTextSplitter and GPT

from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(model_name="gpt-3.5-turbo", chunk_size=1000, chunk_overlap=100)



In [None]:
pdf_text_chunks = text_splitter.split_text(pdf_text)

In [31]:
len(pdf_text_chunks)
type(pdf_text_chunks[0])

str

In [32]:
# Convert the chunks text to Document objects
from langchain.docstore.document import Document

pdf_text_chunks = [Document(page_content=chunk) for chunk in pdf_text_chunks]

In [33]:
pdf_text_chunks

[Document(metadata={}, page_content='4th SDG Y outh Summer Camp – SDG Resource Document The 2030 Agenda for Sustainable Development’s 17 Sustainable Development Goals (SDGs)   Goal: This document enables 4th SDG Youth Summer Camp participants to i) get to know the 17 SDGs, ii) explore what areas each goal covers under its targets, iii) identify targets of most interest to participants, and iv) identify synergies between the SDGs and chosen target(s).    Goal 1. End poverty in all its forms everywhere  Target 1.1 By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than $1.25 a day  Target 1.2 By 2030, reduce at least by half the proportion of men, women and children of all ages living in poverty in all its dimensions according to national definitions  Target 1.3 Implement nationally appropriate social protection systems and measures for all, including floors, and by 2030 achieve substantial coverage of the poor and the vulnerable  Ta

In [34]:
type(pdf_text_chunks[0])

langchain_core.documents.base.Document

In [35]:
# Question generation pipeline

from langchain.chat_models import ChatOpenAI

llm_ques_gen_pipeline = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)




  llm_ques_gen_pipeline = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)


In [36]:
prompt = """
You are an expert at creating interview questions from a given text.
Your goal is to prepare a list of questions that are likely to be asked in an interview for the given text. 
You do this by first understanding the text and then creating a list of questions that are likely to be asked in an interview for the given text.

Here is the text:
----------
{text}
----------

Create a list of 10 questions that are likely to be asked in an interview for the given text. Make sure to not loose any important details.

QUESTIONS:
"""

In [37]:
# Convert to a prompt template

from langchain.prompts import PromptTemplate
prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

In [49]:
# Create a refine prompt
refine_prompt = """
    You are an expert at creating interview questions from a given text.
    Your goal is to prepare a list of questions that are likely to be asked in an interview for the given text. 
    You do this by first understanding the text and then creating a list of questions that are likely to be asked in an interview for the given text.
    
    We have received some practice questions to a certain extent: {existing_answer}

    We have the option to refine the existing questions further or add new ones (only if necessary) with some more context below:

    ----------
    {text}
    ----------
    
    Given the new context, refine the original 10 questions in English. 
    If the context is not helpful, please provide the original questions.

    QUESTIONS:

    """

In [50]:
# Create refine prompt template
from langchain.prompts import PromptTemplate
refine_prompt_template = PromptTemplate(template=refine_prompt, input_variables=["existing_answer", "text"])

In [56]:
# Create a chain
# New type of chain - LoadSummarizeChain

from langchain.chains.summarize import load_summarize_chain

chain = load_summarize_chain(llm=llm_ques_gen_pipeline,
                             chain_type="refine",
                             verbose=True,
                             question_prompt=prompt_template,
                             refine_prompt=refine_prompt_template)

quen = chain.run(pdf_text_chunks)

# Create a chain



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating interview questions from a given text.
Your goal is to prepare a list of questions that are likely to be asked in an interview for the given text. 
You do this by first understanding the text and then creating a list of questions that are likely to be asked in an interview for the given text.

Here is the text:
----------
4th SDG Y outh Summer Camp – SDG Resource Document The 2030 Agenda for Sustainable Development’s 17 Sustainable Development Goals (SDGs)   Goal: This document enables 4th SDG Youth Summer Camp participants to i) get to know the 17 SDGs, ii) explore what areas each goal covers under its targets, iii) identify targets of most interest to participants, and iv) identify synergies between the SDGs and chosen target(s).    Goal 1. End poverty in all its forms everywhere  Target 1.1 By 2030, eradicate extreme p

In [59]:
questions = quen

# Store documents in a vector DB

In [54]:
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [55]:
# Initialize a vector DB - FAISS (Faster than chromaDB)
from langchain.vectorstores import FAISS

vector_store = FAISS.from_documents(pdf_text_chunks, embeddings) # It will store in memory;
# If need to store in a file, use FAISS.from_documents(pdf_text_chunks, embeddings, persist_directory="vector_store")
# If need to store in cloud, try using Pinecone 

In [60]:
llm_ans_gen = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.1)
questions_list = questions.split("\n")

In [None]:
questions_list

In [62]:
#Use RetrivalQA chain
from langchain.chains import RetrievalQA

answer_gen_chain = RetrievalQA.from_chain_type(
    llm=llm_ans_gen, chain_type="stuff", retriever=vector_store.as_retriever()
)


In [63]:
# Loop through each question, answer it using the chain above and save to a file

for question in questions_list:
    answer = answer_gen_chain.run(question)
    with open("answers.txt", "a") as f:
        f.write(f"Question: {question}\n")
        f.write(f"Answer: {answer}\n")
        f.write("\n")
        f.write("\n-----------------------------------------------\n")
        f.write("\n")
