# CassIO main RAG demo - C*Summit 2023

Contents:
1. setup
2. create vector store with CassIO
3. populate store
4. vector search
5. RAG
6. cleanup

Prerequisites:
- OpenAI API Key
- Docker installed (plus `jq`, used in Setup A to extract the container's IP address)
- Python 3.8+

## 1. Setup

### Setup A: start Cassandra

In a terminal/console, run `docker run --name my-cassandra -d cassandra:5.0-alpha2`, then wait a couple of minutes for the node to start.

Verify the status with `docker exec -it my-cassandra nodetool status` (wait until the output shows a line starting with `UN`, meaning the node is Up and Normal).

Get contact point with `docker inspect my-cassandra | jq -r '.[].NetworkSettings.Networks.bridge.IPAddress'`:

In [None]:
! docker inspect my-cassandra | jq -r '.[].NetworkSettings.Networks.bridge.IPAddress'

Then adjust and execute this cell:

In [None]:
# IP address of the Cassandra container. Replace the value with the address
# printed by the `docker inspect` command above before running this cell.
CONTACT_POINT = "172.17.0.2"

### Setup B: dependencies

In [None]:
!pip install --quiet "cassio>=0.1.3" "openai>=1.0.0" datasets

### Setup C: provision DB & create session

In [None]:
from cassandra.cluster import Cluster

# Keyspace that hosts the tables created by this demo.
KEYSPACE = "cassio_demo"

# Open a driver session against the contact point configured above.
cluster = Cluster([CONTACT_POINT])
session = cluster.connect()

# IF NOT EXISTS keeps this cell safe to re-run against the same database.
create_keyspace_cql = (
    f"CREATE KEYSPACE IF NOT EXISTS {KEYSPACE} "
    "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1};"
)
session.execute(create_keyspace_cql)

### Setup D: global CassIO init

In [None]:
import cassio

# Register the session and keyspace with CassIO globally; the cleanup cell
# later reads them back through `cassio.config.resolve_session()` and
# `cassio.config.resolve_keyspace()`.
cassio.init(session=session, keyspace=KEYSPACE)

### Setup E: OpenAI

In [None]:
import os
from getpass import getpass

# Ask for the key only when the environment lacks a non-empty one; getpass
# reads the secret without echoing it.
if os.environ.get("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API Key: ")

In [None]:
import openai

# Client shared by the embedding calls and the chat-completion call below.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Embedding model used throughout; its vectors are 1536 floats long, which is
# the dimension the vector table is created with.
embedding_model_name = "text-embedding-ada-002"

## 2. Create vector store

In [None]:
# Vector table for the demo. Rows written below carry a row id, a text body,
# an embedding vector and a metadata dict.
# No session/keyspace is passed here — presumably they are taken from the
# earlier `cassio.init(...)` call; confirm against the CassIO docs.
cassio_v_store = cassio.table.MetadataVectorCassandraTable(
    table="cassio_demo",
    vector_dimension=1536,  # must equal the length of the embedding vectors
)

## 3. Populate store

In [None]:
import json
import datasets

# Load the `datastax/entomology` dataset and keep only its "train" split.
ento_dataset = datasets.load_dataset("datastax/entomology")["train"]

def _shorten(dct): return {k: v if len(v) < 60 else v[:60]+"..." for k, v in dct.items()}

# Quick sanity check: report how many entries the split contains.
print(f"Loaded {len(ento_dataset)} entries")

In [None]:
# The first entry doubles as the preview here and as the first row written below.
species0 = ento_dataset[0]

print("Example entry:")
# Pretty-print the (shortened) entry as JSON, indented four extra spaces so
# that it sits visually under the heading line.
for json_line in json.dumps(_shorten(species0), indent=4).split("\n"):
    print(f"    {json_line}")

#### Write a row (with its vector)

In [None]:
# Embed the first description. The input is passed as a one-element list and
# the single resulting vector is a 1536-float array.
description0 = species0["description"]
embedding_response0 = openai_client.embeddings.create(
    model=embedding_model_name,
    input=[description0],
)
emb_vector0 = embedding_response0.data[0].embedding

# Store the text, its vector and the metadata under the entry's own id.
cassio_v_store.put(
    row_id=species0["id"],
    body_blob=description0,
    vector=emb_vector0,
    metadata={"name": species0["name"], "order": species0["order"]},
)

#### Write all remaining rows

In [None]:
# Column-wise view of every entry except the first one (already written above).
ids1, descriptions1, names1, orders1 = (
    ento_dataset[column][1:]
    for column in ("id", "description", "name", "order")
)

# One API call embeds all remaining descriptions as a single batch.
batch_response = openai_client.embeddings.create(
    model=embedding_model_name,
    input=descriptions1,
)
embs1 = batch_response.data

In [None]:
# Fire all inserts without waiting on each one: every `put_async` call
# returns a future, and the columns are walked in lock-step with the embeddings.
futures = [
    cassio_v_store.put_async(
        row_id=row_id,
        body_blob=description,
        vector=emb.embedding,
        metadata={"name": name, "order": order},
    )
    for row_id, description, name, order, emb in zip(
        ids1, descriptions1, names1, orders1, embs1
    )
]

# Block until every insert has completed; the results themselves are unused.
for future in futures:
    _ = future.result()

## 4. Vector search

In [None]:
query = "There was a dragonfly with fire-red wings in the woods"

# Embed the query with the same model used for the stored descriptions.
query_response = openai_client.embeddings.create(
    model=embedding_model_name,
    input=query,
)
query_vector = query_response.data[0].embedding

# Fetch the six nearest entries under the "cos" metric.
results = cassio_v_store.metric_ann_search(query_vector, n=6, metric="cos")

# One block per hit: distance, name, order and the first 60 characters of text.
for res in results:
    res_metadata = res["metadata"]
    excerpt = res["body_blob"][:60]
    print(f"[{res['distance']:.3f}] {res_metadata['name']} ({res_metadata['order']})\n        ==> '{excerpt}...'\n")

#### Metadata filtering

In [None]:
query = "Behold: blue beetley bug boldly buzzing!"

# Embed the query with the same model used for the stored descriptions.
query_response = openai_client.embeddings.create(
    model=embedding_model_name,
    input=query,
)
query_vector = query_response.data[0].embedding

# Same search as before, but restricted to rows whose "order" metadata matches.
order_filter = {"order": "Coleoptera"}
results = cassio_v_store.metric_ann_search(query_vector, n=3, metadata=order_filter, metric="cos")

# One block per hit: distance, name, order and the first 60 characters of text.
for res in results:
    res_metadata = res["metadata"]
    excerpt = res["body_blob"][:60]
    print(f"[{res['distance']:.3f}] {res_metadata['name']} ({res_metadata['order']})\n        ==> '{excerpt}...'\n")

## 5. RAG, aka 'field entomologist AI-assisted aide'

In [None]:
# Chat model that writes the final identification.
completion_model_name = "gpt-3.5-turbo"

# Prompt skeleton for the RAG step. It has two `str.format` placeholders:
#   {observation} - the user's free-text field observation;
#   {candidates}  - the retrieved textbook entries, one paragraph per candidate.
# Both are filled in by `suggest_observed_species`.
PROMPT_TEMPLATE = """You are an expert entomologist tasked with helping specimen identification on the field.
You are given relevant excerpts from an invertebrate textbook along with my field observation.
Your task is to compare my observation with the textbook excerpts and come to an identification,
explaining why you came to that conclusion and giving the degree of certainity.
Only use the information provided in the user observation to come to your conclusion!
Be sure to provide, in your verdict, the species' Order together with the full Latin name.
KEEP IT SHORT!!!

USER OBSERVATION: {observation}

TEXTBOOK CANDIDATE MATCHES:
{candidates}

YOUR EXPLAINED IDENTIFICATION:"""

In [None]:
def suggest_observed_species(observation, order=None, n=3, debug=False):
    """
    Identify an observed specimen with retrieval-augmented generation.

    The observation is embedded, the closest entries are fetched from the
    vector store (optionally restricted to a single order) and handed,
    together with the observation, to the chat model via PROMPT_TEMPLATE.

    Args:
        observation: free-text description of what was seen.
        order: when not None, only entries whose "order" metadata equals
            this value are considered as candidates.
        n: number of candidate entries to retrieve.
        debug: when True, print the complete prompt before calling the model.

    Returns:
        The model's answer with double quotes removed and surrounding
        whitespace stripped; an empty string if the model returned no text.
    """
    query_vector = openai_client.embeddings.create(
        input=observation,
        model=embedding_model_name,
    ).data[0].embedding
    # An empty filter means "search across all orders".
    metadata = {} if order is None else {"order": order}
    matches = cassio_v_store.metric_ann_search(
        query_vector,
        n=n,
        metadata=metadata,
        metric="cos",
    )
    # NOTE: the continuation line of the f-string below is part of the string,
    # so its leading spaces end up in the prompt; kept as-is on purpose.
    prompt = PROMPT_TEMPLATE.format(
        observation=observation,
        candidates="\n".join([
            f"""Candidate species {i+1}: '{doc['metadata']['name']}' (order: {doc['metadata']['order']})
            Description: {doc['body_blob']}\n"""
            for i, doc in enumerate(matches)
        ]),
    )
    if debug:
        print('-' * 60)
        print('PROMPT:')
        print(prompt)
        print('-' * 60)
    response = openai_client.chat.completions.create(
        model=completion_model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=320,
    )
    # `content` is optional in the API response; fall back to an empty string
    # rather than failing with AttributeError on None.
    answer = response.choices[0].message.content or ""
    return answer.replace('"', '').strip()

In [None]:
# Unfiltered query: candidates may come from any order.
observation_meadow = """
    I found an elongated brown bug with small wings,
    dark elitra and sturdy antennae in a meadow.
"""
print(suggest_observed_species(observation_meadow))

In [None]:
# Unfiltered query phrased as a question, to see how the model handles doubt.
observation_leaf = """
    What looked like a leaf was in fact moving! It startled me greatly.
    But I'm not sure it's an insect, I did not see antennae. What was it?
"""
print(suggest_observed_species(observation_leaf))

In [None]:
# Restrict the candidates to a single order and print the prompt that is sent
# to the model (debug output appears before the answer).
lepidoptera_answer = suggest_observed_species(
    "There was a large butterfly with erratic flight, but I could not glimpse the wing pattern clearly",
    debug=True,
    order="Lepidoptera",
)
print(lepidoptera_answer)

## 6. Cleanup

In [None]:
# Read back the session and keyspace registered with CassIO during setup.
c_session = cassio.config.resolve_session()
c_keyspace = cassio.config.resolve_keyspace()

# Remove the demo table only; the keyspace itself is left in place.
# IF EXISTS makes the cell safe to run more than once.
drop_table_cql = f"DROP TABLE IF EXISTS {c_keyspace}.cassio_demo;"
c_session.execute(drop_table_cql)