# Note to the reader: this notebook provides a Colab version of the "the-onion" quickstart. To be validated and moved to a suitable location.

In [4]:
!pip install -q cassio datasets langchain openai tiktoken

In [9]:
# COLAB-ONLY PROPOSAL:
# Credentials are never written into the notebook itself: each value is taken
# from the environment variable of the same name when it is set, and asked for
# interactively otherwise.
import os
from getpass import getpass


def _read_setting(env_name, prompt, secret=True):
    """Return a configuration value from the environment or from the user.

    Args:
        env_name: name of the environment variable to look up first.
        prompt: text shown to the user when the variable is unset or empty.
        secret: when True the value is read with getpass (no echo), otherwise
            with input().

    Returns:
        The value as a string, with surrounding whitespace removed.
    """
    value = os.environ.get(env_name, "").strip()
    if value:
        return value
    reader = getpass if secret else input
    return reader(prompt).strip()


# The "AstraCS:..." string found in your Token JSON file
# (outside Colab, giving the path to the JSON file instead would be fine).
ASTRA_DB_APPLICATION_TOKEN = _read_setting(
    "ASTRA_DB_APPLICATION_TOKEN",
    "Astra DB application token ('AstraCS:...'): ",
)

# Your Database ID (looks like '01234567-...'); not a secret, so it is echoed.
ASTRA_DB_ID = _read_setting(
    "ASTRA_DB_ID",
    "Astra DB database ID ('01234567-...'): ",
    secret=False,
)

# Your OpenAI key (looks like 'sk-...')
OPENAI_API_KEY = _read_setting(
    "OPENAI_API_KEY",
    "OpenAI API key ('sk-...'): ",
)

# Vector support using Langchain, Apache Cassandra (Astra DB is built using
# Cassandra), and OpenAI (to generate embeddings)
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

import cassio

In [10]:
# Initialise cassio for the Astra DB database, using the application token and
# database ID defined earlier in the notebook.
# NOTE(review): the recorded output of this cell shows the driver logging a
# "Beta version of the protocol used (5/v5-beta)" error while connecting; the
# later cells of the same run still produced results, so it looks non-fatal —
# confirm.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

ERROR:cassandra.connection:Closing connection <AsyncoreConnection(140714706867232) 4f835778-ec78-42b0-9ae3-29e3cf45b596-us-east1.db.astra.datastax.com:29042:f07b10c5-f2b9-48aa-942d-2b906062c418> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


In [11]:
# Both OpenAI-backed objects are created with the same API key.
# `llm` is later handed to the index wrapper when answering questions;
# `myEmbedding` is later handed to the Cassandra vector store.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)
myEmbedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [12]:
# Vector store on the "qa_mini_demo" table, using myEmbedding for embeddings.
# No explicit session or keyspace is supplied.
# NOTE(review): presumably the None values make the store fall back to the
# connection set up by cassio.init() — confirm against the LangChain docs.
myCassandraVStore = Cassandra(
    table_name="qa_mini_demo",
    embedding=myEmbedding,
    keyspace=None,
    session=None,
)

In [13]:
# Number of headlines, taken from the start of the train split, to embed and
# store. Change this single value to load more or fewer.
N_HEADLINES = 50

print("Loading data from huggingface")
myDataset = load_dataset("Biddls/Onion_News", split="train")
# Slice the rows before picking the column, so that only the first
# N_HEADLINES entries are read instead of the entire "text" column.
headlines = myDataset[:N_HEADLINES]["text"]

print("\nGenerating embeddings and storing in AstraDB")
# NOTE(review): re-running this cell calls add_texts again; depending on how
# the store assigns row ids this may insert the same headlines twice — confirm.
myCassandraVStore.add_texts(headlines)

print("Inserted %i headlines.\n" % len(headlines))

# Wrapper used by the question-answering cell to query the store through an LLM.
vectorIndex = VectorStoreIndexWrapper(vectorstore=myCassandraVStore)

Loading data from huggingface


Downloading readme:   0%|          | 0.00/463 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]


Generating embeddings and storing in AstraDB
Inserted 5 headlines.



In [16]:
# Interactive question-answering loop: keeps asking for a question until the
# user types 'quit'. For every question it prints the LLM answer obtained
# through vectorIndex, followed by the four closest stored documents with
# their similarity scores.
prompt = "\nEnter your question (or type 'quit' to exit): "
while True:
    # Surrounding whitespace is dropped so that e.g. "quit " still exits.
    query_text = input(prompt).strip()
    # Every prompt after the first one uses the follow-up wording.
    prompt = "\nWhat's your next question (or type 'quit' to exit): "

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Nothing was asked: skip the LLM and vector-store calls and ask again.
        continue

    print("QUESTION: \"%s\"" % query_text)
    answer = vectorIndex.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("DOCUMENTS BY RELEVANCE:")
    for doc, score in myCassandraVStore.similarity_search_with_score(query_text, k=4):
        # Only the first 60 characters of each document are shown.
        print("  %0.4f \"%s ...\"" % (score, doc.page_content[:60]))


Enter your question (or type 'quit' to exit): Was there a diploma scheme? Tell me more
QUESTION: "Was there a diploma scheme? Tell me more"
ANSWER: "Yes, federal agents have arrested 25 suspects accused of selling fake nursing degrees to thousands of students who then used the bogus diplomas to take licensing exams in several states, including Florida, New York, New Jersey, and Texas."

DOCUMENTS BY RELEVANCE:
  0.8986 "25 Arrested In Fake Nursing School Diploma Scheme #~# Federa ..."
  0.8621 "Kamala Harris Asks Communications Assistant If She Can Take  ..."
  0.8557 "U.S. Officials Call For Correct Amount Of Violence #~# WASHI ..."
  0.8509 "Relaxed Marie Kondo Now Says She Perfectly Happy Living In W ..."

What's your next question (or type 'quit' to exit): quit
