#### Importing Libraries

In [4]:
from azure.cosmos import CosmosClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import ClientSecretCredential, DefaultAzureCredential
from azure.cosmos.partition_key import PartitionKey


from dotenv import load_dotenv
import pandas as pd
load_dotenv(override=True)
import json

import os

# Azure AD token credential. Background on configuring Cosmos DB RBAC for it:
# https://techcommunity.microsoft.com/t5/azure-architecture-blog/configure-rbac-for-cosmos-db-with-managed-identity-instead-of/ba-p/3056638
# NOTE: the credential is only instantiated here; the Cosmos client created at the
# end of this block authenticates with the account key instead.
aad_credentials = DefaultAzureCredential()

# Cosmos DB settings are read from the environment. Indexing os.environ (rather than
# calling os.getenv) raises KeyError immediately when a variable is missing.
AZURE_COSMOS_DB_ENDPOINT = os.environ["AZURE_COSMOS_DB_ENDPOINT"]
AZURE_COSMOS_DB_KEY = os.environ["AZURE_COSMOS_DB_KEY"]
AZURE_COSMOS_DB_DATABASE = os.environ["AZURE_COSMOS_DB_DATABASE"]
AZURE_COSMOS_DB_CONN = os.environ["AZURE_COSMOS_DB_CONN"]

# Account-level client; database and container clients are derived from it later.
azurecosmosdbclient = CosmosClient(
    AZURE_COSMOS_DB_ENDPOINT,
    credential=AZURE_COSMOS_DB_KEY,
)

from openai import AzureOpenAI

# Azure OpenAI client used further down for chat completions. Endpoint and key are
# fetched with os.getenv, so an unset variable is handed to the client as None.
aoai_client = AzureOpenAI(
    api_version="2024-10-21",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

#### Reading input files

In [5]:
# Folder with the pre-processed chunks, stored as parquet files.
relative_path = "../../../../data/processed/files/"

# os.listdir returns entries in arbitrary order and also lists sub-directories and
# hidden files (.gitkeep, .DS_Store, ...). Keep only regular, non-hidden files and sort
# them so the row order -- and therefore CONTAINER_ID, which is taken from the first
# row further down -- is identical on every run.
files = sorted(
    name
    for name in os.listdir(relative_path)
    if not name.startswith(".") and os.path.isfile(os.path.join(relative_path, name))
)
if not files:
    raise FileNotFoundError(f"No input files found in {relative_path!r}")

# ignore_index=True yields the same fresh 0..n-1 index as concat().reset_index(drop=True).
df = pd.concat(
    [pd.read_parquet(os.path.join(relative_path, name)) for name in files],
    ignore_index=True,
)

# The embedding columns expose .tolist() (presumably numpy arrays after the parquet
# round-trip); convert them to plain Python lists so each record can be serialised
# when it is written to Cosmos DB.
for vector_column in ("title_vector", "content_vector"):
    df[vector_column] = df[vector_column].apply(lambda x: x.tolist())

# Cosmos DB items are addressed by an "id" field; reuse the chunk id for it.
df["id"] = df["chunk_id"]

In [6]:
# Preview the first rows to sanity-check the loaded columns and the derived "id".
df.head()

Unnamed: 0,page_num,content,title,title_vector,content_vector,chunk_id,preprocessing_pipeline,filename,filing_period,filing_date,form_type,ticker,id
0,1,Table of Contents\nUNITED STATES SECURITIES AN...,FORM 10-K,"[0.06034832075238228, 0.05649842694401741, 0.0...","[0.05381559208035469, 0.026631174609065056, 0....",10K-AMZN-02-03-2023-chunk-id-1,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-1
1,2,"Table of Contents\nAMAZON.COM, INC. FORM 10-K ...","AMAZON.COM, INC. FORM 10-K For the Fiscal Year...","[0.036177292466163635, 0.02635868266224861, 0....","[0.022779833525419235, 0.041926249861717224, 0...",10K-AMZN-02-03-2023-chunk-id-2,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-2
2,3,"Table of Contents\nAMAZON.COM, INC.\nPART I\nI...",Item 1. Business,"[0.005116552114486694, 0.010721826925873756, 0...","[0.043223753571510315, 0.009905443526804447, 0...",10K-AMZN-02-03-2023-chunk-id-3,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-3
3,4,Table of Contents\nCompetition\nOur businesses...,Competition,"[0.03811872377991676, -0.02396334894001484, 0....","[0.04903312399983406, 0.028485119342803955, 0....",10K-AMZN-02-03-2023-chunk-id-4,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-4
4,5,Table of Contents\nAvailable Information\nOur ...,Executive Officers and Directors,"[0.0003115860163234174, -0.014203808270394802,...","[0.019433414563536644, -0.01925777457654476, 0...",10K-AMZN-02-03-2023-chunk-id-5,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-5


#### Instantiate CosmosDB database and container clients

In [7]:
# The container is named after the preprocessing pipeline recorded on the first row.
# .iloc[0] is positional, so it keeps working even if df's index is not 0-based.
CONTAINER_ID = df["preprocessing_pipeline"].iloc[0]
# Single source of truth for the partition key path (previously duplicated as a literal).
PartitionKeyPath = "/chunk_id"

database_client = azurecosmosdbclient.get_database_client(AZURE_COSMOS_DB_DATABASE)
# Idempotent: returns the existing container when it has already been created.
container_client = database_client.create_container_if_not_exists(
    id=CONTAINER_ID,
    partition_key=PartitionKey(path=PartitionKeyPath),
)

#### Load data into container

In [8]:
# One dict per chunk, keyed by column name.
data_dict = df.to_dict(orient="records")

# upsert_item instead of create_item: create_item raises a conflict error as soon as an
# item with the same id already exists, which made this cell fail on every re-run of the
# notebook. Upserting inserts new chunks and overwrites existing ones, so the load is
# repeatable.
for chunk in data_dict:
    container_client.upsert_item(body=chunk)

#### Querying the container

In [9]:
# System prompt for the query-generating assistant. The single `{}` placeholder in the
# schema line is filled with CONTAINER_ID by str.format() at the end of the literal, so
# any literal braces added to this text later must be doubled ({{ }}).
# The prompt asks the model to reply in JSON under the key "NoSQLquery", to use the key
# "CLARIFICATION" when it needs more detail, and to answer "ERROR: ..." when no valid
# query can be generated or when asked to modify data.
prompt_template = """You are a SQL programmer Assistant. Your role is to generate CosmosDB No SQL code (CosmosDB NoSQL) to retrieve an answer to a natural language query. Make sure to disambiguate column names when creating queries that use more than one table. If a valid SQL query cannot be generated, only say "ERROR:" followed by why it cannot be generated.
            Do not answer any questions on inserting or deleting rows from the table. Instead, say "ERROR: I am not authorized to make changes to the data".

            Use the following database schema to write CosmosDB NoSQL queries:
            {}(page_num INTEGER, content VARCHAR, title VARCHAR, chunk_id VARCHAR, preprocessing_pipeline VARCHAR, filename VARCHAR, filing_period VARCHAR, filing_date VARCHAR, form_type VARCHAR, ticker VARCHAR PRIMARY KEY (chunk_id))

            ## Relevant comments abuout the data
            ** This data is already chunked, meaning that the same file might have different chunks, each with a different chunk_id. The chunk_id is the primary key for this table. So if you wanted to know how many filings have been made, you would count the number of unique filename.
            ** both filing_period and filing date are in the format YYYY-MM-DD
            ** ticker is the stock ticker of the company that made the filing
            ** form_type contains information related to SEC form type, for example 10K, 10Q. This fields should help when determining whether to use quarterly or yearly reports

            ## Additional guidance
            ** Please only answer questions that can be answered with the data provided. Do not make any assumptions about the data.
            ** Please write the code for a CosmosDB NoSQL query
            ** Please make sure that you use the right DISTINCT 
            ** Always use dates in the format YYYY-MM-DD
            ** If you need to ask any clafication, please ask for it in the format JSON format with the key: CLARIFICATION
            ** If asked about any date, default to filing_date unless the question implies that is a question about the filing period.
            ** Dates should be explicity stated in the query. For example, if you are asked about filings in 2023, you should use the following format: '2023-01-01' and '2023-12-31'. Otehrwise, ask for the specific period.
            ** Provide the answer in a JSON format with the key: NoSQLquery

            ## Sample queries

            BETWEEN Clasue:
               ** Statement: You can use the BETWEEN keyword with a WHERE clause to express queries that filters results against ranges of string or numerical values. For example, the following query returns all items in which the price is between 17.25 and 25.50, again inclusive.
               ** Specific intructions: always put the BETWEEN clause in parenthesis (c.price BETWEEN 17.25 AND 25.50)
               ** NoSQL Query: SELECT * FROM c WHERE (c.price BETWEEN 17.25 AND 25.50)

            DISTINCT and COUNT Clause:
               ** Statement: You can combine the DISTINCT and COUNT keywords to return the number of unique items in a result set. For example, the following query returns the number of unique values of the price field.
               ** NoSQL Query: SELECT VALUE COUNT(1) FROM (SELECT DISTINCT c.price FROM c)
            
            Additional query:
               ** Explanation: For this query, the index matches any item that has a tag with a name of either "winter" or "fall", at least one quantity between zero and ten, and at least one warehouse where the backstock is false. The JOIN expression here performs the cross-product of all items of tags, onHandQuantities, and warehouseStock arrays for each matching item before any filter is applied. The WHERE clause then applies the filter predicate on each <c, t, n, s> tuple. For instance, if a matching item had ten items in each of the three arrays, it expands to 1 x 10 x 10 x 10 (that is, 1,000) tuples. Using subqueries here can help in filtering out joined array items before joining with the next expression.
               ** query: SELECT VALUE COUNT(1) FROM products p JOIN t in p.tags JOIN q in p.onHandQuantities JOIN s in p.warehouseStock WHERE t.name IN ("winter", "fall") AND (q.quantity BETWEEN 0 AND 10) AND NOT s.backstock

            Sample Query:
               ** question: How many filings exist for 2023 on this container?
               ** query: "SELECT VALUE COUNT(1) FROM (SELECT DISTINCT c.filename FROM c WHERE (c.filing_date BETWEEN '2023-01-01' AND '2023-12-31'))"
            
            questions: """.format(CONTAINER_ID)

In [10]:
def GetNOSQLSchema(
    prompt_template,
    query="How many quarterly filings for MSFT were made on 2022?",
    client=None,
    model="gpt-4.1",
):
    """Ask the chat model to translate a natural-language question into a Cosmos DB NoSQL query.

    Args:
        prompt_template: System prompt describing the schema and the expected JSON
            reply (a JSON object with the key ``NoSQLquery``).
        query: Natural-language question sent as the user message. Defaults to the
            question that used to be hard-coded in this function.
        client: Object exposing ``chat.completions.create(...)``. Defaults to the
            notebook-level ``aoai_client``; pass another client (or a stub) to reuse
            or test this function without touching the global.
        model: Model / deployment name forwarded to the chat-completions call.

    Returns:
        The query string stored under ``NoSQLquery`` in the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply carries no ``NoSQLquery`` key, e.g. when the model
            answers with ``CLARIFICATION`` or an error instead. The message includes
            the payload that was received.
    """
    if client is None:
        # Resolved at call time so the notebook-level client is picked up by default.
        client = aoai_client

    messages = [
        {"role": "system", "content": prompt_template},
        {"role": "user", "content": query},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
        max_tokens=2000,
        # JSON mode: the reply is expected to be a single JSON object.
        response_format={"type": "json_object"},
        seed=42,
    )
    SQLschema = response.choices[0].message.content
    # Echo the raw reply so clarification requests / errors stay visible in the cell output.
    print(SQLschema)

    payload = json.loads(SQLschema)
    if not isinstance(payload, dict) or "NoSQLquery" not in payload:
        # The prompt allows the model to answer with other keys (e.g. CLARIFICATION);
        # surface what came back instead of a bare KeyError('NoSQLquery').
        raise KeyError(f"Model response has no 'NoSQLquery' key; received: {payload}")
    return payload["NoSQLquery"]

In [11]:
# Generate the NoSQL query for GetNOSQLSchema's built-in question. The function prints
# the raw JSON reply itself; the print below shows only the extracted query string.
query = GetNOSQLSchema(prompt_template)
print(query)

{
  "NoSQLquery": "SELECT VALUE COUNT(1) FROM (SELECT DISTINCT c.filename FROM c WHERE c.ticker = 'MSFT' AND c.form_type = '10Q' AND (c.filing_date BETWEEN '2022-01-01' AND '2022-12-31'))"
}
SELECT VALUE COUNT(1) FROM (SELECT DISTINCT c.filename FROM c WHERE c.ticker = 'MSFT' AND c.form_type = '10Q' AND (c.filing_date BETWEEN '2022-01-01' AND '2022-12-31'))


In [12]:
# Run the generated query across all partitions and collect the results into a list
# so the cell displays them.
list(container_client.query_items(query=query, enable_cross_partition_query=True))

[3]