In [79]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import pandas as pd

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain

load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [7]:
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
)

vectorstore = FAISS.load_local("arxiv_vectorstore", embeddings=embed)

In [18]:
query = "how can BERT be trained to optimize the number of parameters used?"

# Similarity search
test_search = vectorstore.similarity_search_with_score(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

# How to extract text content and metadata from the first result returned by the similarity search
first_chunk = test_search[0][0].page_content
first_metadata = test_search[0][0].metadata

# Papers from where the first 3 chunks were taken out of
for doc in test_search:
    print(doc[0].metadata['title'])

ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
RoBERTa: A Robustly Optimized BERT Pretraining Approach


In [23]:
query = "what research has been done in the field of LLM tool usage?"

test_search2 = vectorstore.similarity_search_with_score(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

# Papers from where the first 3 chunks were taken out of
for doc in test_search2:
    print(doc[0].metadata['title'])

Large Language Models as Tool Makers
Gorilla: Large Language Model Connected with Massive APIs
A Survey of Large Language Models


In [97]:
# Prompts - context, examples (if fewshot), task (prompt action, for example "Answer based on this user input: {user_input}"). Additional variables can be added to a prompt

llm = OpenAI(model_name="text-davinci-003", temperature=0)

# Stage 1 - similarity search of the user input against the chunks stored in our FAISS database
user_input = "What is BERT?"
search = vectorstore.similarity_search_with_score(
    user_input,  # our search query
    k=3  # return 3 most relevant docs
)

# Stage 2 - summarize data of selected chunks in order to more easily fit into a context window
input_template1 = """
You are an AI assistant designed to excel in the task of summarizing chunks of academic papers in the are of Computer Science and Artificial Intelligence.
Be brief in your summarization. This summary will be used to decide if the chunk of text is useful to answer a specific question.

Summarize the following chunk of text: {chunk}.
Summary:
"""
prompt_template1 = PromptTemplate(
    input_variables=["chunk"],
    template=input_template1
)

chain1 = LLMChain(llm=llm, prompt=prompt_template1)

summary_data = []
for doc in search:
    answer = chain1.run({'chunk': doc[0].page_content})
    metadata = doc[0].metadata
    data_to_append = {
        'llm_summary': answer,
        'metadata': metadata
    }
    summary_data.append(data_to_append)

In [None]:
# Stage 3 - feed summary into another chain -> goal is for another llm to figure out how useful each summary in answering the initial user input - keep chosen dicts (chunk summary + metadata)
input_template1 = """
You are an AI assistant designed to excel in the task of summarizing chunks of academic papers in the are of Computer Science and Artificial Intelligence.
Be brief in your summarization. This summary will be used to decide if the chunk of text is useful to answer a specific question.

Summarize the following chunk of text: {chunk}.
Summary:
"""
prompt_template1 = PromptTemplate(
    input_variables=["chunk"],
    template=input_template1
)

In [None]:
# With the filtered chunk summaries added to the prompt context, answer the user's question

# Based on filtered chunk's summary and metadata, use APIs (Wikipedia, Wolfram Alpha, Youtube) to find relevant content - build an agent if time allows

# chat_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# template="You are a helpful assistant that helps people with learning about a variety of topics centered around {topic1} and {topic2}."
# system_message_prompt = SystemMessagePromptTemplate.from_template(template)
# human_template="{text}"
# human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
# chat_chain = LLMChain(llm=chat_llm, prompt=chat_prompt)

# chat_chain({"topic1": "CS", "topic2": "AI", "text": user_input})

In [98]:
summary_data

[{'llm_summary': "This text discusses the use of Electra, an AI model based on BERT, for summarizing chunks of academic papers in the field of Computer Science and Artificial Intelligence. It extends the BERT dataset to 33 billion tokens by including data from Clueweb, CommonCrawl, and Gigaword. The model architecture and most hyperparameters are the same as BERT's, with simple linear classifiers added for GLUE and a question-answering module from XLNet added for SQUAD. Weight sharing is proposed to improve the efficiency of the pre-training, with the input and output token embeddings of the generator always tied as in BERT. Results show that GLUE scores are 83.6 with no weight tying.",
  'metadata': {'chunk_id': 6,
   'authors': ['Kevin Clark',
    'Minh-Thang Luong',
    'Quoc V. Le',
    'Christopher D. Manning'],
   'pdf_url': 'http://arxiv.org/pdf/2003.10555',
   'summary': 'masked language modeling mlm pre training methods such as bert corrupt the input by replacing some tokens w

In [None]:
# Memory

# Chains

# Agent

# (need to answer user input and look for resources based on the metadata returned)

In [44]:
qa = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    # chain_type="stuff",
    # document_prompt=,
    # question_prompt=,
    # combine_prompt=
)

In [48]:
from langchain.agents import Tool
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent
from langchain.agents import AgentType

tools = [
    Tool.from_function(
        name = "Search",
        func=qa.__call__, #chain doesn't work anymore because of some inspection issue and there is no call function
        coroutine=qa.acall, #if you want to use async
        description="useful to answer factual questions"
    ),
]
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
llm=ChatOpenAI(temperature=0)
agent_chain = initialize_agent(tools, llm, agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, verbose=True, memory=memory)
agent_chain.run(input="Some Question")



[1m> Entering new  chain...[0m
[32;1m[1;3m{
    "action": "Search",
    "action_input": "Some Question"
}[0m

ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].

In [None]:
agent_chain.run(input="Some follow-up Question")

In [46]:
qa.__call__(query)

ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].