In [9]:
from datasets import load_dataset

In [10]:
xsum_dataset = load_dataset(
    "xsum", version="1.2.0"
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [11]:
# Taking a sample of 100 rows
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()
# Combining 'document' and 'summary' columns
xsum_sample["combined"] = (
    "Document: " + xsum_sample.document.str.strip() + "; Summary: " + xsum_sample.summary.str.strip()
)        

In [12]:
# print the title, description, and label of each example
for idx, row in xsum_sample.sample(2).iterrows():
    print("")
    print(f"Document: {row['document']}")
    print(f"Summary: {row['summary']}")


Document: The firm told a judicial review it did not accept the discharge of final treated effluent had degraded the quality of Llyn Padarn at Llanberis.
Anglers claim water quality threatens the future of the Arctic char fish.
A High Court judge reserved judgement in the case at Caernarfon on Wednesday.
The judicial review proceedings have been brought by the Seiont, Gwyrfai, and Llyfni Anglers Society, represented by a body called Fish Legal - an umbrella organisation for fishing groups.
Lawyers claimed Natural Resources Wales (NRW) had not done enough to protect the Arctic char, primarily from raw sewage and treated effluent discharged by Welsh Water.
The company said since 2010 it had invested £3.6m at the works, with another £4.6m agreed to be spent by summer 2016.
It said the work completed meant the lake met water quality standards necessary to be awarded bathing water status by the Welsh government.
"It is not accepted that the discharge of final treated effluent has degraded 

### Generate Embedding

In [16]:
import os
from dotenv import load_dotenv, find_dotenv
import openai
from openai.embeddings_utils import get_embedding

_ = load_dotenv(find_dotenv())  
openai.api_key = os.getenv('OPENAI_API_KEY')

In [17]:
embedding_model = "text-embedding-3-small"
xsum_sample["embedding"] = xsum_sample.combined.apply(lambda x: get_embedding(x, engine=embedding_model))

In [18]:
xsum_sample.rename(columns = {'embedding': 'vector'}, inplace = True)
xsum_sample.rename(columns = {'combined': 'text'}, inplace = True)
xsum_sample.to_pickle('xsum_sample.pkl')

In [18]:
# xsum_sample.to_csv("data/xsum_sample_1k.csv")

### Readin Files with Embedding

In [32]:
import pandas as pd
xsum_sample = pd.read_pickle('xsum_sample.pkl')

In [33]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id,text,n_tokens,vector
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,Document: The full cost of damage in Newton St...,510,"[-0.002085343236103654, 0.05560915172100067, 0..."
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,Document: A fire alarm went off at the Holiday...,198,"[-0.020631801337003708, -0.023478450253605843,..."


In [20]:
import lancedb

# connect to a database
# uri = "xsum-sample-lancedb"
uri = "xsum-sample-lancedb_v2"
db = lancedb.connect(uri)
table = db.create_table("xsum_sampled_1k", xsum_sample)

In [21]:
print(db.table_names())

['xsum_sampled_1k']


In [22]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB

openai_api_key = os.getenv('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key )
docsearch = LanceDB(connection = table, embedding = embeddings)

In [23]:
query = "I am looking for Harry Potter information. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)
docs

[Document(page_content='Document: Gary Fung announced the settlement with Music Canada via a blog published at the weekend.\nIsohunt was shut down in 2013, when Mr Fung agreed to pay $110m to the Motion Picture Association of America (MPAA).\nOne researcher said the cases could set a "worrying" precedent for those who run sites that may link to pirated content.\nA court order associated with the decision details the fees as follows: 55m Canadian dollars in damages, C$10m in "punitive, exemplary and aggravated damages" and a further C$1m to cover legal costs.\nThe total amounts to 50m US dollars.\nThe case dates from a legal order sent in May 2008 by the Canadian Recording Industry Association (CRIA), now known as Music Canada.\nPreviously, Mr Fung had promised users that he would not disclose their data - including email and IP addresses - during legal proceedings.\n"I\'ve kept my word regarding users\' privacy," he wrote.\nIsohunt did not host pirated media, but rather provided users 

### Build the chain

- Setup: Import packages.
- LLM Agent: Build an agent that leverages a modified version of the ReAct framework to do chain-of-thought reasoning.
- LLM Agent with History: Provide the LLM with access to previous steps in the conversation.

In [24]:
# LLM wrapper
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [26]:
# Initiate our LLM - default is 'gpt-3.5-turbo'
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

In [31]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

query = "I'm looking for information on Harry Potter. What would you recommend?"
result = qa({"query": query})

In [32]:
result['result']

"I don't have information on Harry Potter in the provided context."

In [74]:
query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({"query": query})

In [29]:
result['result']

"I don't have any information on Harry Potter."

In [75]:
result['source_documents'][0]

Document(page_content='Document: Kromtech - the Germany-based firm behind MacKeeper - has acknowledged that its customers\' names, internet addresses and login credentials were among the data  exposed.\nHowever, it said that users\' payment details were "never at risk".\nThe firm believes the details were accessed only by the security expert who alerted it to the problem.\n"The privacy and security of our clients\' information remains our top priority and from the moment we were aware of the access, we immediately took several proactive steps to identify and correct the issue," it said in a statement.\n"We want to offer a special \'thank you\' to security researcher Chris Vickery for identifying the security breach attempt so that we could stop it before anyone was harmed."\nMr Vickery told security blogger Brian Krebs that he had discovered 21 gigabytes of MacKeeper user data "after spending a few bored moments searching for database servers" that were not password-protected. He was a

In [78]:
df_filtered = xsum_sample[xsum_sample['summary'].apply(lambda x: 'news' in x)]

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={'data': df_filtered}), return_source_documents=True)

query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({"query": query})
result['result']

'Spotify is reportedly considering an unconventional direct listing on a stock market that stops short of a full-blown initial public offering. This means that Spotify might register shares on a stock exchange and become a publicly listed company without raising new cash. This tactic would save the music service underwriting fees needed for an IPO and avoid diluting the value of existing stakes in the company. It could also help Spotify avoid a surge in first-day trading that often takes place after an IPO. This approach is rare and is usually used by smaller companies that do not expect high levels of trading in their stock.'

In [79]:
from langchain.prompts import PromptTemplate

template = """You are a news recommender system that help users to find useful information. 
Use the following pieces of context to answer the question at the end. 
For each question, suggest three news, with a short description of the news and the reason why the user migth like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for the information of Spotify. What could you suggest to me?"
result = qa({'query':query})
print(result['result'])

1. News: Spotify considering direct listing on stock market
   Description: This news article discusses Spotify's potential plan to go public through a direct listing on a stock market, rather than a traditional initial public offering (IPO). It explains the benefits of a direct listing, such as avoiding underwriting fees and maintaining the value of existing stakes in the company. This news would be relevant to someone interested in the financial aspects of Spotify's future and its potential impact on the music industry.

2. News: Spotify signs new long-term licensing deal with Universal Music Group
   Description: This news article highlights Spotify's recent agreement with Universal Music Group, the world's largest record label. It discusses the significance of this deal for Spotify's music streaming service and its relationship with major music labels. This news would be of interest to someone following Spotify's business partnerships and its efforts to secure licensing agreements 