In [1]:
from datasets import load_dataset

In [2]:
xsum_dataset = load_dataset(
    "xsum", version="1.2.0"
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Taking a sample of 100 rows
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()

In [4]:
from dotenv import load_dotenv, find_dotenv
import openai

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

In [5]:
openai.Model.list()

<OpenAIObject list at 0x168af09a0> JSON: {
  "object": "list",
  "data": [
    {
      "id": "gpt-3.5-turbo-0301",
      "object": "model",
      "created": 1677649963,
      "owned_by": "openai"
    },
    {
      "id": "text-embedding-3-large",
      "object": "model",
      "created": 1705953180,
      "owned_by": "system"
    },
    {
      "id": "gpt-4-turbo-preview",
      "object": "model",
      "created": 1706037777,
      "owned_by": "system"
    },
    {
      "id": "dall-e-3",
      "object": "model",
      "created": 1698785189,
      "owned_by": "system"
    },
    {
      "id": "whisper-1",
      "object": "model",
      "created": 1677532384,
      "owned_by": "openai-internal"
    },
    {
      "id": "dall-e-2",
      "object": "model",
      "created": 1698798177,
      "owned_by": "system"
    },
    {
      "id": "text-embedding-ada-002",
      "object": "model",
      "created": 1671217299,
      "owned_by": "openai-internal"
    },
    {
      "id": "tts-1-hd-110

In [6]:
import pandas as pd

In [7]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [8]:
# print the title, description, and label of each example
for idx, row in xsum_sample.sample(2).iterrows():
    print("")
    print(f"Document: {row['document']}")
    print(f"Summary: {row['summary']}")


Document: Is there something you have seen or heard that you would like us to investigate?
It could be a burning issue or something you have always wondered about the area or its people.
Use the tool below to send us your questions.
We could be in touch and your question could make the news.
Summary: Have you got a question about Coventry & Warwickshire?

Document: A report by technology news site Gizmodo said staff responsible for what was shown to Facebook's 1.6bn users frequently chose to bury articles they did not agree with.
Responding to the allegations, the network's head of search Tom Stocky wrote that the site "found no evidence that the anonymous allegations are true".
The claims come weeks after Facebook founder Mark Zuckerberg publicly denounced the policies of likely US presidential nominee, Donald Trump.
"I hear fearful voices calling for building walls and distancing people they label as 'others.'," the 31-year-old said at his firm's recent developers conference.
Howeve

In [9]:
xsum_sample["combined_info"] = (
    "Document: " + xsum_sample.document.str.strip() + "; Summary: " + xsum_sample.summary.str.strip()
)

In [11]:
import tiktoken

embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)

# omit descriptions that are too long to embed
xsum_sample["n_tokens"] = xsum_sample.combined_info.apply(lambda x: len(encoding.encode(x)))
xsum_sample = xsum_sample[xsum_sample.n_tokens <= max_tokens]

In [12]:
xsum_sample.head(2)

Unnamed: 0,document,summary,id,combined_info,n_tokens
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,Document: The full cost of damage in Newton St...,510
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,Document: A fire alarm went off at the Holiday...,198


### Generate Embedding

https://github.com/openai/openai-cookbook/blob/4d373651822c3a27290078d713f14eeb1d8f5d3d/examples/Get_embeddings_from_dataset.ipynb

In [13]:
from openai.embeddings_utils import get_embedding

In [14]:
embedding_model = "text-embedding-3-small"
xsum_sample["embedding"] = xsum_sample.combined_info.apply(lambda x: get_embedding(x, engine=embedding_model))

In [17]:
xsum_sample.rename(columns = {'embedding': 'vector'}, inplace = True)
xsum_sample.rename(columns = {'combined_info': 'text'}, inplace = True)
xsum_sample.to_pickle('data/xsum_sample.pkl')

In [18]:
xsum_sample.to_csv("data/xsum_sample_1k.csv")

### Readin Files with Embedding

In [24]:
import lancedb
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB
from langchain.chains import RetrievalQA

In [19]:
import lancedb

# connect to a database
uri = "dataset/xsum-lancedb"
db = lancedb.connect(uri)
table = db.create_table("xsum_sample", xsum_sample)

In [25]:
print(db.table_names())

['xsum_sample']


In [26]:
# openai_api_key = os.getenv('OPENAI_API_KEY')
openai_api_key=os.getenv('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
docsearch = LanceDB(connection = table, embedding = embeddings)

In [29]:
query = "I'm looking for the information cost of damage in Newton Stewart. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)
docs

[Document(page_content='Document: The semi-official Iranian Students News Agency (Isna) said the fire was in an "explosive materials production unit".\nA pro-opposition website reported a huge blast near the Parchin military site, south-east of the capital, but this was not confirmed.\nParchin has been linked to Iran\'s controversial nuclear programme.\nThe International Atomic Energy Agency (IAEA) has not been given access to the complex since 2005.\nAnalysts say the IAEA suspects Iran of experimenting with explosives capable of triggering a nuclear weapon at Parchin.\nThe pro-reform website Sahamnews said the explosion on Sunday evening was so intense that windows of buildings 15km (nine miles) away were shattered.\nThe glare from the blast could also be seen from a great distance, the report added.\nIsna, quoting Iran\'s defence industries organisation, said: "Unfortunately, due to the incident, two workers of this production unit lost their lives."\nIt gave no further details.\nLas

### Build the chain

In [27]:
from langchain.llms import OpenAI

In [28]:
# Embeddings, store, and retrieval
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [31]:
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), input_key="question")

In [None]:
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
llm=OpenAI(deployment_name="text-davinci-003", model_name="text-davinci-003",),
chain_type="stuff", 
retriever=docsearch.as_retriever(), return_source_documents=True)

In [1]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

query = "I'm looking for an action anime. What could you suggest to me?"
result = qa({"query": query})
result['result']

NameError: name 'RetrievalQA' is not defined

In [33]:
# Embeddings, store, and retrieval
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Model and doc loader
from langchain import OpenAI
from langchain.document_loaders import TextLoader

# Eval!
from langchain.evaluation.qa import QAEvalChain

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [35]:
# Our long essay from before
loader = TextLoader('worked.txt')
doc = loader.load()

print (f"You have {len(doc)} document")
print (f"You have {len(doc[0].page_content)} characters in that document")

RuntimeError: Error loading worked.txt