In [1]:
from datasets import load_dataset

In [2]:
xsum_dataset = load_dataset(
    "xsum", version="1.2.0"
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Taking a sample of 100 rows
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()

In [4]:
from dotenv import load_dotenv, find_dotenv
import openai

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

In [5]:
openai.Model.list()

<OpenAIObject list at 0x168af09a0> JSON: {
  "object": "list",
  "data": [
    {
      "id": "gpt-3.5-turbo-0301",
      "object": "model",
      "created": 1677649963,
      "owned_by": "openai"
    },
    {
      "id": "text-embedding-3-large",
      "object": "model",
      "created": 1705953180,
      "owned_by": "system"
    },
    {
      "id": "gpt-4-turbo-preview",
      "object": "model",
      "created": 1706037777,
      "owned_by": "system"
    },
    {
      "id": "dall-e-3",
      "object": "model",
      "created": 1698785189,
      "owned_by": "system"
    },
    {
      "id": "whisper-1",
      "object": "model",
      "created": 1677532384,
      "owned_by": "openai-internal"
    },
    {
      "id": "dall-e-2",
      "object": "model",
      "created": 1698798177,
      "owned_by": "system"
    },
    {
      "id": "text-embedding-ada-002",
      "object": "model",
      "created": 1671217299,
      "owned_by": "openai-internal"
    },
    {
      "id": "tts-1-hd-110

In [6]:
import pandas as pd

In [7]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [8]:
# print the title, description, and label of each example
for idx, row in xsum_sample.sample(2).iterrows():
    print("")
    print(f"Document: {row['document']}")
    print(f"Summary: {row['summary']}")


Document: Is there something you have seen or heard that you would like us to investigate?
It could be a burning issue or something you have always wondered about the area or its people.
Use the tool below to send us your questions.
We could be in touch and your question could make the news.
Summary: Have you got a question about Coventry & Warwickshire?

Document: A report by technology news site Gizmodo said staff responsible for what was shown to Facebook's 1.6bn users frequently chose to bury articles they did not agree with.
Responding to the allegations, the network's head of search Tom Stocky wrote that the site "found no evidence that the anonymous allegations are true".
The claims come weeks after Facebook founder Mark Zuckerberg publicly denounced the policies of likely US presidential nominee, Donald Trump.
"I hear fearful voices calling for building walls and distancing people they label as 'others.'," the 31-year-old said at his firm's recent developers conference.
Howeve

In [9]:
xsum_sample["combined_info"] = (
    "Document: " + xsum_sample.document.str.strip() + "; Summary: " + xsum_sample.summary.str.strip()
)

In [11]:
import tiktoken

embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)

# omit descriptions that are too long to embed
xsum_sample["n_tokens"] = xsum_sample.combined_info.apply(lambda x: len(encoding.encode(x)))
xsum_sample = xsum_sample[xsum_sample.n_tokens <= max_tokens]

In [12]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id,combined_info,n_tokens
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,Document: The full cost of damage in Newton St...,510
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,Document: A fire alarm went off at the Holiday...,198


### Generate Embedding

In [13]:
from openai.embeddings_utils import get_embedding

In [14]:
embedding_model = "text-embedding-3-small"
xsum_sample["embedding"] = xsum_sample.combined_info.apply(lambda x: get_embedding(x, engine=embedding_model))

In [17]:
xsum_sample.rename(columns = {'embedding': 'vector'}, inplace = True)
xsum_sample.rename(columns = {'combined_info': 'text'}, inplace = True)
xsum_sample.to_pickle('data/xsum_sample.pkl')

In [18]:
xsum_sample.to_csv("data/xsum_sample_1k.csv")

### Readin Files with Embedding

In [29]:
import pandas as pd
xsum_sample = pd.read_pickle('data/xsum_sample.pkl')

In [31]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id,text,n_tokens,vector
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,Document: The full cost of damage in Newton St...,510,"[-0.002085343236103654, 0.05560915172100067, 0..."
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,Document: A fire alarm went off at the Holiday...,198,"[-0.020631801337003708, -0.023478450253605843,..."


In [32]:
import lancedb
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB
from langchain.chains import RetrievalQA

In [33]:
import lancedb

# connect to a database
uri = "dataset/xsum-sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("xsum_sample", xsum_sample)

In [34]:
print(db.table_names())

['xsum_sample']


In [35]:
# openai_api_key = os.getenv('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
docsearch = LanceDB(connection = table, embedding = embeddings)

In [36]:
query = "I'm looking for the information cost of damage in Newton Stewart. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)
docs

[Document(page_content='Document: The semi-official Iranian Students News Agency (Isna) said the fire was in an "explosive materials production unit".\nA pro-opposition website reported a huge blast near the Parchin military site, south-east of the capital, but this was not confirmed.\nParchin has been linked to Iran\'s controversial nuclear programme.\nThe International Atomic Energy Agency (IAEA) has not been given access to the complex since 2005.\nAnalysts say the IAEA suspects Iran of experimenting with explosives capable of triggering a nuclear weapon at Parchin.\nThe pro-reform website Sahamnews said the explosion on Sunday evening was so intense that windows of buildings 15km (nine miles) away were shattered.\nThe glare from the blast could also be seen from a great distance, the report added.\nIsna, quoting Iran\'s defence industries organisation, said: "Unfortunately, due to the incident, two workers of this production unit lost their lives."\nIt gave no further details.\nLas

### Build the chain

- Setup: Import packages.
- LLM Agent: Build an agent that leverages a modified version of the ReAct framework to do chain-of-thought reasoning.
- LLM Agent with History: Provide the LLM with access to previous steps in the conversation.

In [49]:
# LLM wrapper
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [37]:
# Initiate our LLM - default is 'gpt-3.5-turbo'
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

In [70]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({"query": query})

In [71]:
result['result']

'According to recent reports, Spotify is considering a direct listing on a stock exchange, which would allow the company to become publicly listed without raising new cash. This would save Spotify from underwriting fees and avoid diluting the value of existing stakes in the company. A direct listing would also mean that shares would be traded on the day of listing, with the price based on supply and demand. Spotify has not yet commented on these reports.'

In [67]:
result['source_documents'][0]

Document(page_content='Document: Kromtech - the Germany-based firm behind MacKeeper - has acknowledged that its customers\' names, internet addresses and login credentials were among the data  exposed.\nHowever, it said that users\' payment details were "never at risk".\nThe firm believes the details were accessed only by the security expert who alerted it to the problem.\n"The privacy and security of our clients\' information remains our top priority and from the moment we were aware of the access, we immediately took several proactive steps to identify and correct the issue," it said in a statement.\n"We want to offer a special \'thank you\' to security researcher Chris Vickery for identifying the security breach attempt so that we could stop it before anyone was harmed."\nMr Vickery told security blogger Brian Krebs that he had discovered 21 gigabytes of MacKeeper user data "after spending a few bored moments searching for database servers" that were not password-protected. He was a

In [68]:
df_filtered = xsum_sample[xsum_sample['summary'].apply(lambda x: 'news' in x)]

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={'data': df_filtered}), return_source_documents=True)

query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({"query": query})
result['result']

'According to recent reports, Spotify is considering a direct listing on a stock exchange, which would allow the company to become publicly listed without raising new cash. This approach would save Spotify from underwriting fees and avoid diluting the value of existing stakes in the company. A direct listing would also mean that shares would be traded on the day of listing, with the price based on supply and demand. This tactic is usually used by smaller companies that do not expect high levels of trading in their stock. However, it is important to note that Spotify has not made an official statement regarding this matter.'

In [69]:
from langchain.prompts import PromptTemplate

template = """You are a news recommender system that help users to find useful information. 
Use the following pieces of context to answer the question at the end. 
For each question, suggest three news, with a short description of the news and the reason why the user migth like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({'query':query})
print(result['result'])

1. News: Spotify considering direct listing on stock market
   Description: This news article discusses Spotify's potential plan to go public through a direct listing on a stock market, rather than a traditional initial public offering (IPO). It explains the benefits of a direct listing, such as avoiding underwriting fees and maintaining the value of existing stakes in the company. This news would be relevant to someone interested in the financial aspects of Spotify's business strategy.

2. News: Spotify signs new licensing deal with Universal Music Group
   Description: This news article highlights Spotify's recent long-term licensing deal with Universal Music Group, the world's largest record label. It discusses the significance of this deal for Spotify's music streaming service and its relationship with major music labels. This news would be of interest to someone following Spotify's partnerships and developments in the music industry.

3. News: Spotify's potential impact on the mus