In [163]:
from azure.cosmos import CosmosClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import ClientSecretCredential, DefaultAzureCredential
from azure.cosmos.partition_key import PartitionKey


from dotenv import load_dotenv
import pandas as pd
load_dotenv(override=True)
import json

import os

# Using DefaultAzureCredential (recommended)
# https://techcommunity.microsoft.com/t5/azure-architecture-blog/configure-rbac-for-cosmos-db-with-managed-identity-instead-of/ba-p/3056638#:~:text=Create%20custom%20roles%20MyReadOnlyRole%20and%20MyReadWriteRole%20with%20both,definition%20create%20-a%20%24accountName%20-g%20%24resourceGroupName%20-b%20%40role-definition-ro.json
aad_credentials = DefaultAzureCredential()

AZURE_COSMOS_DB_ENDPOINT=os.environ['AZURE_COSMOS_DB_ENDPOINT']
AZURE_COSMOS_DB_KEY= os.environ['AZURE_COSMOS_DB_KEY']
AZURE_COSMOS_DB_DATABASE= os.environ['AZURE_COSMOS_DB_DATABASE']
AZURE_COSMOS_DB_CONN= os.environ['AZURE_COSMOS_DB_CONN']
azurecosmosdbclient = CosmosClient(AZURE_COSMOS_DB_ENDPOINT, credential=aad_credentials)

relative_path = "../../../../data/processed/files/"
files = os.listdir(relative_path)

sample_data = pd.read_parquet(relative_path + files[0])
sample_data['title_vector'] = sample_data['title_vector'].apply(lambda x: x.tolist())
sample_data['content_vector'] = sample_data['content_vector'].apply(lambda x: x.tolist())
sample_data['id'] = sample_data['chunk_id']

CONTAINER_ID = sample_data['preprocessing_pipeline'][0]
PartitionKeyPath = "/chunk_id"

from openai import AzureOpenAI
aoai_client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version="2024-07-01-preview"
)

In [105]:
database_client = azurecosmosdbclient.get_database_client(AZURE_COSMOS_DB_DATABASE)
container_client = database_client.create_container_if_not_exists(id=CONTAINER_ID, partition_key=PartitionKey(path='/chunk_id'))

In [106]:
data_dict = sample_data.to_dict(orient='records')

In [159]:

from io import StringIO

buffer = StringIO()
sample_data.info(verbose=False,memory_usage=False,buf=buffer)

In [160]:
df_info = buffer.getvalue()

In [122]:
json.dumps(sample_data.drop(columns = ['title_vector','content_vector','content','title']).sample(4).to_dict(orient='records'))

'[{"page_num": 67, "chunk_id": "10K-AMZN-02-03-2023-chunk-id-67", "preprocessing_pipeline": "DI_Text_HTML_PageSplitter", "filename": "10K-AMZN-02-03-2023", "filing_period": "2022-12-31", "filing_date": "2023-03-02", "form_type": "10K", "ticker": "AMZN", "id": "10K-AMZN-02-03-2023-chunk-id-67"}, {"page_num": 39, "chunk_id": "10K-AMZN-02-03-2023-chunk-id-39", "preprocessing_pipeline": "DI_Text_HTML_PageSplitter", "filename": "10K-AMZN-02-03-2023", "filing_period": "2022-12-31", "filing_date": "2023-03-02", "form_type": "10K", "ticker": "AMZN", "id": "10K-AMZN-02-03-2023-chunk-id-39"}, {"page_num": 70, "chunk_id": "10K-AMZN-02-03-2023-chunk-id-70", "preprocessing_pipeline": "DI_Text_HTML_PageSplitter", "filename": "10K-AMZN-02-03-2023", "filing_period": "2022-12-31", "filing_date": "2023-03-02", "form_type": "10K", "ticker": "AMZN", "id": "10K-AMZN-02-03-2023-chunk-id-70"}, {"page_num": 57, "chunk_id": "10K-AMZN-02-03-2023-chunk-id-57", "preprocessing_pipeline": "DI_Text_HTML_PageSplitter

In [175]:
def GetSQLSchema(content):
    query = "Can you generate a SQL schema from the following pandas dataframe table? I dont want to create the table, just give me the schema, the table is secdemotable. dataframe info: " + content
    messages = [{"role":"system","content":"DB assistant that helps creating schemas from pandas dataframes to SQL."}, 
               {"role":"user","content":query}]

    response = aoai_client.chat.completions.create(model="gpt4o",  
                                        messages = messages, 
                                        temperature=0,  
                                        max_tokens=2000,
                                        seed = 42)
    SQLschema = response.choices[0].message.content

    return SQLschema

In [176]:
sqlschema = GetSQLSchema(df_info)

In [177]:
print(sqlschema)

Certainly! Based on the provided information, here's a SQL schema for the table `secdemotable`:

```sql
CREATE TABLE secdemotable (
    page_num TEXT,
    column_2 TEXT,
    column_3 TEXT,
    column_4 TEXT,
    column_5 TEXT,
    column_6 TEXT,
    column_7 TEXT,
    column_8 TEXT,
    column_9 TEXT,
    column_10 TEXT,
    column_11 TEXT,
    column_12 TEXT,
    id BIGINT
);
```

Note: Replace `column_2`, `column_3`, etc., with the actual column names from your DataFrame. The `id` column is assumed to be of type `BIGINT` since it is `int64` in pandas.


In [110]:
container_client.create_item(body = data_dict[1])

{'page_num': 2,
 'content': 'Table of Contents\nAMAZON.COM, INC. FORM 10-K For the Fiscal Year Ended December 31, 2022\nINDEX\n<table><tr><th></th><th></th><th>Page</th></tr><tr><td></td><td>PART I</td><td></td></tr><tr><td>Item 1.</td><td>Business</td><td>3</td></tr><tr><td>Item 1A.</td><td>Risk Factors</td><td>6</td></tr><tr><td>Item 1B.</td><td>Unresolved Staff Comments</td><td>16</td></tr><tr><td>Item 2.</td><td>Properties</td><td>17</td></tr><tr><td>Item 3.</td><td>Legal Proceedings</td><td>17</td></tr><tr><td>Item 4.</td><td>Mine Safety Disclosures</td><td>17</td></tr><tr><td></td><td>PART II</td><td></td></tr><tr><td>Item 5.</td><td>Market for the Registrant&#x27;s Common Stock, Related Shareholder Matters, and Issuer Purchases of Equity Securities</td><td>18</td></tr><tr><td>Item 6.</td><td>Reserved</td><td>18</td></tr><tr><td>Item 7.</td><td>Management&#x27;s Discussion and Analysis of Financial Condition and Results of Operations</td><td>19</td></tr><tr><td>Item 7A.</td><td>Q