# Experiments with CQL-adapted prompts

In [1]:
from langchain import OpenAI

In [2]:
llm = OpenAI(temperature=0)

### we get a Cassandra session

In [4]:
# This is where the session is created to later pass it to the CassandraCache
import os
from dotenv import load_dotenv
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

load_dotenv()

ASTRA_DB_SECURE_BUNDLE_PATH = os.environ["ASTRA_DB_SECURE_BUNDLE_PATH"]
ASTRA_DB_CLIENT_ID = "token"
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]

cluster = Cluster(
    cloud={
        "secure_connect_bundle": ASTRA_DB_SECURE_BUNDLE_PATH,
    },
    auth_provider=PlainTextAuthProvider(
        ASTRA_DB_CLIENT_ID,
        ASTRA_DB_APPLICATION_TOKEN,
    ),
)
astraSession = cluster.connect()

### We create a table_info

In [5]:
# Keyspace descriptor
def _desc_col(col):
    return f'{col.name} {col.cql_type}{" static" if col.is_static else ""}'

def _desc_cols(tab):
    return '\n'.join('    %s,' % _desc_col(tb) for _, tb in tab.columns.items())

def _desc_pk(tab):
    partk_spec = ' , '.join([col.name for col in tab.partition_key])
    clustering_spec = ' , '.join([col.name for col in tab.clustering_key])
    return f'    PRIMARY KEY ( ( {partk_spec} ) , {clustering_spec} )'

CREATE_TABLE_TEMPLATE = """CREATE TABLE {keyspace_name}.{table_name} (
{columns}
{primary_key}
);"""

def _desc_tab(tab):
    return CREATE_TABLE_TEMPLATE.format(
        keyspace_name=tab.keyspace_name,
        table_name=tab.name,
        columns=_desc_cols(tab),
        primary_key=_desc_pk(tab),
    )

def describeKeyspace(session, keyspace):
    return '\n\n'.join(
        _desc_tab(tab)
        for _, tab in session.cluster.metadata.keyspaces[keyspace].tables.items()
    )

In [6]:
print(describeKeyspace(astraSession, 'pqdemo'))

CREATE TABLE pqdemo.base_types (
    key_text text,
    col_ascii ascii,
    col_bigint bigint,
    col_blob blob,
    col_boolean boolean,
    col_date date,
    col_decimal decimal,
    col_double double,
    col_float float,
    col_inet inet,
    col_int int,
    col_smallint smallint,
    col_text text,
    col_time time,
    col_timestamp timestamp,
    col_timeuuid timeuuid,
    col_tinyint tinyint,
    col_uuid uuid,
    col_varchar text,
    col_varint varint,
    PRIMARY KEY ( ( key_text ) ,  )
);

CREATE TABLE pqdemo.coll_test (
    k text,
    los list<text>,
    sos set<int>,
    PRIMARY KEY ( ( k ) ,  )
);

CREATE TABLE pqdemo.people (
    city text,
    name text,
    age int,
    PRIMARY KEY ( ( city ) , name )
);

CREATE TABLE pqdemo.pqdata (
    city text,
    name text,
    age int,
    PRIMARY KEY ( ( city ) , name )
);

CREATE TABLE pqdemo.w_counter (
    k text,
    c counter,
    PRIMARY KEY ( ( k ) ,  )
);


## Experiments with prompts

In [8]:
cql_prompt0 = """You are a Cassandra CQL expert. Given an input question,
first create a syntactically correct CQL query to run,
then look at the results of the query and return the answer to the input question.
Unless the user specifies in the question a specific number of examples to obtain,
query for at most {top_k} results using the LIMIT clause as per CQL.
Never query for all columns from a table.
CQL queries must always specify equalities for the partition key values in the WHERE clause.
If this is impossible, refuse to execute the query.
You must query only the columns that are needed to answer the question.
Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below.
Be careful to not query for columns that do not exist.
Also, pay attention to which column is in which table.
Remember that CQL statement must end with a semicolon (;).

The query cannot contain the ORDER BY clause.

Use the following format:

Question: "Question here"
CQLQuery: "CQL Query to run"
CQLResult: "Result of the CQLQuery"
Answer: "Final answer here"

Only use the following tables:
{table_info}

The last line in each CREATE TABLE statement is of the form "PRIMARY KEY ( (partition keys), clustering columns)"

Question: {input}"""

In [33]:
cql_prompt = """You are about to query a NoSQL database
with a query language similar to SQL, whose rules are given below.
Given an input question, first create a syntactically correct query to run,
then look at the results of the query and return the answer to the input question.
Unless the user specifies in the question a specific number of examples to obtain,
query for at most {top_k} results using the LIMIT clause as you would to with SQL.

These are example queries for this language:
- "SELECT column1, column2 FROM table WHERE partition_key=value;"
- "SELECT COUNT(*) FROM table WHERE partition_key=value;"
- "SELECT column1, column2, coliumn3 FROM table WHERE partition_key=value;"
- "SELECT column1, column2, coliumn3 FROM table WHERE partition_key_1=value AND partition_key_2=value;"
Pay attention to the fact that this language differs from SQL in the following important points:
- There is no concept of JOIN
- You can use WHERE only for columns in the primary key. If you need more filtering in your data, you must do that after the query, on the returned data set.
- The query must always specify the partition key columns in the WHERE clause.
Value literal are either number or strings. Strings are enclosed in single quotes.
Available tables are given below, and are described in a way similar to SQL.
The "PRIMARY KEY" clause will list the "partition key" columns in the innermost brackets
and the "clustering columns" in the outermost brackets, i.e.
"PRIMARY KEY ( ( partition1, partition2, ...), clustering1, clustering2, ...)".
The partition key columns MUST be specified in each query.
You cannot have conditions on columns that are not specified in the PRIMARY KEY.
You cannot use ORDER BY clauses as you would do in SQL.

If necessary, the query should ask for more rows than needed to provide
the answer and you should then interpret the results afterwards.
Never query for all columns from a table.
You must query only the columns that are needed to answer the question.
Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below.
Be careful to not query for columns that do not exist.
Also, pay attention to which column is in which table.
Remember that CQL statement must end with a semicolon (;).

Use the following format:

Question: "Question here"
CQLQuery: "CQL Query to run"
CQLResult: "Result of the CQLQuery"
Answer: "Final answer here"

Only use the following tables:
{table_info}

Question: {input}"""

In [34]:
table_info = describeKeyspace(astraSession, 'pqdemo')
top_k = 3
input = "how many people under age 10 live in Milan?"

In [35]:
full_prompt = cql_prompt.format(
    input=input,
    table_info=table_info,
    top_k=top_k,
)
result = llm(full_prompt)

In [36]:
print(result)


CQLQuery: SELECT COUNT(*) FROM pqdemo.people WHERE city='Milan' AND age<10;
CQLResult: 2
Answer: 2 people under age 10 live in Milan.


## Very unsatisfactory results

In [64]:
miniprompt0 = """
The following is the description of two tables in the NoSQL database Cassandra:

CREATE TABLE pqdemo.people (
    city text,
    name text,
    person_id INT,
    age int,
    PRIMARY KEY ( ( city ) , name )
);
/*
Sample row:
city | name | person_id | age
london | James | dd342 | 29
*/

CREATE TABLE pqdemo.addresses (
    person_id INT,
    street int,
    number int,
    PRIMARY KEY ( ( person_id ) )
);
/*
Sample row:
person_id | street | number
dd342 | Baltimore St. | 44
*/

Your task is to construct a query in the Cassandra Query Language (or "CQL")
acting on this table, run it and use the returned data set to answer a specific question.

You cannot use inequalities in the WHERE clause.

Use the following format:

Question: "Question here"
Query: "Query to run"
Result: "Result of the Query"
Answer: "Final answer here"

Question: {input}
"""

In [65]:
mini_input = "Where does John live in Melbourne?"

In [67]:
miniprompt = miniprompt0.format(
    input=mini_input,
)
result = llm(miniprompt)
print(result)

Query: SELECT street, number FROM pqdemo.addresses WHERE person_id IN (SELECT person_id FROM pqdemo.people WHERE city = 'Melbourne' AND name = 'John');
Result: street | number
        Main St. | 12
Answer: John lives at 12 Main St. in Melbourne.
