# Vanna Ai Exploration 


## Generate dataframe with random data

In [12]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import sqlite3
import os
#pip install vanna
# import vanna
# from vanna.remote import VannaDefault
#vanna_api_key = os.getenv('Vanna_API_Key')

In [13]:
#print(f"The value of 'MY_VARIABLE' is: {Vanna_API_Key}")

In [14]:
# Generate random patient IDs
patient_ids = [f'P{str(i).zfill(4)}' for i in range(1, 101)]

# Generate random service dates within the past two weeks
today = datetime.today()
service_dates = [today - timedelta(days=random.randint(0, 14)) for _ in range(100)]

# Sample list of chronic ICD-10 codes and their descriptions
icd_10_codes = [
    ('E11', 'Type 2 diabetes mellitus'),
    ('I10', 'Essential (primary) hypertension'),
    ('J45', 'Asthma'),
    ('M54', 'Dorsalgia (back pain)'),
    ('K50', 'Crohn’s disease'),
    ('K51', 'Ulcerative colitis')
]

# Generate random ICD-10 codes and descriptions
icd_10_data = [random.choice(icd_10_codes) for _ in range(100)]
icd_10_codes, icd_10_descriptions = zip(*icd_10_data)

# Create the DataFrame
df = pd.DataFrame({
    'patient_id': patient_ids,
    'service_date': service_dates,
    'icd_10_code': icd_10_codes,
    'icd_10_description': icd_10_descriptions
})

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,patient_id,service_date,icd_10_code,icd_10_description
0,P0001,2025-01-04 21:47:53.075805,I10,Essential (primary) hypertension
1,P0002,2025-01-05 21:47:53.075805,K50,Crohn’s disease
2,P0003,2025-01-09 21:47:53.075805,K51,Ulcerative colitis
3,P0004,2025-01-09 21:47:53.075805,K51,Ulcerative colitis
4,P0005,2025-01-14 21:47:53.075805,I10,Essential (primary) hypertension


# Load data to sqlite db

In [15]:

# Connect to a SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('chronic_disease_example.db')

# Save the DataFrame to the SQLite database
df.to_sql('patients', conn, if_exists='replace', index=False)

# Function to query the SQLite database using SQL
def query_dataframe(sql_query):
    return pd.read_sql_query(sql_query, conn)

# Example usage
sql_query = """
SELECT icd_10_description, COUNT(*) as number_of_patients 
FROM patients 
GROUP BY icd_10_description
--select * from patients
"""
result = query_dataframe(sql_query)
print(result)

# Close the connection
conn.close()


                 icd_10_description  number_of_patients
0                            Asthma                  11
1                   Crohn’s disease                  14
2             Dorsalgia (back pain)                  19
3  Essential (primary) hypertension                  20
4          Type 2 diabetes mellitus                  19
5                Ulcerative colitis                  17


# Use Vanna Ai (sqlite)

In [16]:
#https://vanna.ai/docs/sqlite-openai-vanna-vannadb/

Convert the DataFrame to a SQL table using SQLite and define a function query_dataframe that takes a plain English query,
 converts it to SQL using Vanna AI, and returns the results.

You can input any query in plain English,
 and the function will convert it to SQL and return the results.
 Make sure to replace query with your desired question.

In [19]:
import vanna
from vanna.remote import VannaDefault
vanna_api_key = os.getenv('Vanna_API_Key')
vanna_model_name = "disease"
vn = VannaDefault(model=vanna_model_name, api_key=vanna_api_key)

In [20]:
vn.connect_to_sqlite('chronic_disease_example.db')

In [21]:
df_ddl = vn.run_sql("SELECT type, sql FROM sqlite_master WHERE sql is not null")

for ddl in df_ddl['sql'].to_list():
  vn.train(ddl=ddl)

Adding ddl: CREATE TABLE "patients" (
"patient_id" TEXT,
  "service_date" TIMESTAMP,
  "icd_10_code" TEXT,
  "icd_10_description" TEXT
)


In [None]:
# # The following are methods for adding training data. Make sure you modify the examples to match your database.

# # DDL statements are powerful because they specify table names, colume names, types, and potentially relationships
# vn.train(ddl="""
#     CREATE TABLE IF NOT EXISTS my-table (
#         id INT PRIMARY KEY,
#         name VARCHAR(100),
#         age INT
#     )
# """)

# # Sometimes you may want to add documentation about your business terminology or definitions.
vn.train(documentation="Our business provides insight into disease incidence and prevelance.")

# # You can also add SQL queries to your training data. This is useful if you have some queries already laying around. You can just copy and paste those from your editor to begin generating new SQL.
vn.train(sql="SELECT icd_10_description, COUNT(*) as number_of_patients FROM patients GROUP BY icd_10_description")

In [22]:
# At any time you can inspect what training data the package is able to reference
training_data = vn.get_training_data()
training_data

Unnamed: 0,id,training_data_type,question,content
0,457448-ddl,ddl,,"CREATE TABLE ""patients"" (\n""patient_id"" TEXT,\..."
1,2819555-doc,documentation,,Our business provides insight into disease inc...
2,866745-sql,sql,What icd 10 description has the largest number...,"SELECT icd_10_description, COUNT(*) as number_..."
3,866744-sql,sql,What are the number of patients for each uniqu...,"SELECT icd_10_description, COUNT(*) as number_..."
4,463433-ddl,ddl,,"CREATE TABLE ""patients"" (\n""patient_id"" TEXT,\..."
5,463450-ddl,ddl,,"CREATE TABLE ""patients"" (\n""patient_id"" TEXT,\..."


In [None]:
# # You can remove training data if there's obsolete/incorrect information. 
#vn.remove_training_data(id='459809-ddl')
#457448

# ```## Asking the AI
# Whenever you ask a new question, it will find the 10 most relevant pieces of training data and use it as part of the LLM prompt to generate the SQL.
# ```python
# vn.ask(question=...)

In [None]:
from vanna.flask import VannaFlaskApp
app = VannaFlaskApp(vn)
app.run()

Your app is running at:
http://localhost:8084
 * Serving Flask app 'vanna.flask'
 * Debug mode: on
None
None
None
None
None


# Use Vanna (Postgres)

In [20]:
import vanna
from vanna.remote import VannaDefault
import os
vanna_api_key = os.getenv('Vanna_API_Key')
#api_key = # Your API key from https://vanna.ai/account/profile 
#vanna_model_name = # Your model name from https://vanna.ai/account/profile 
#vn.create_model (model = "titanic_model_demo", db_type = "Postgres")
vanna_model_name = 'demo_titanic1'
vn = VannaDefault(model=vanna_model_name, api_key=vanna_api_key)


In [21]:
vn.connect_to_postgres(host='localhost', dbname='example_data_db', user='postgres', password='pgpw', port='5432')

In [22]:
# The information schema query may need some tweaking depending on your database. This is a good starting point.
df_information_schema = vn.run_sql("SELECT * FROM INFORMATION_SCHEMA.COLUMNS")

# This will break up the information schema into bite-sized chunks that can be referenced by the LLM
plan = vn.get_training_plan_generic(df_information_schema)
plan

# If you like the plan, then uncomment this and run it to train
vn.train(plan=plan)

In [None]:
# # The following are methods for adding training data. Make sure you modify the examples to match your database.

# # DDL statements are powerful because they specify table names, colume names, types, and potentially relationships
# vn.train(ddl="""
#     CREATE TABLE IF NOT EXISTS dt_titanic (
#     passengerid text COLLATE pg_catalog."default",
#     survived text COLLATE pg_catalog."default",
#     pclass text COLLATE pg_catalog."default",
#     name text COLLATE pg_catalog."default",
#     sex text COLLATE pg_catalog."default",
#     age text COLLATE pg_catalog."default",
#     sibsp text COLLATE pg_catalog."default",
#     parch text COLLATE pg_catalog."default",
#     ticket text COLLATE pg_catalog."default",
#     fare text COLLATE pg_catalog."default",
#     cabin text COLLATE pg_catalog."default",
#     embarked text COLLATE pg_catalog."default"

#     )
# """)

# # Sometimes you may want to add documentation about your business terminology or definitions.
# vn.train(documentation="Our business is in the business of learning")

# # You can also add SQL queries to your training data. This is useful if you have some queries already laying around. You can just copy and paste those from your editor to begin generating new SQL.
# #vn.train(sql="SELECT * FROM my-table WHERE name = 'John Doe'")

In [24]:
# At any time you can inspect what training data the package is able to reference
training_data = vn.get_training_data()
training_data

Unnamed: 0,id,training_data_type,question,content
0,2842203-doc,documentation,,The following columns are in the collation_cha...
1,2842062-doc,documentation,,The following columns are in the pg_authid tab...
2,2842094-doc,documentation,,The following columns are in the pg_tables tab...
3,2842061-doc,documentation,,The following columns are in the pg_tablespace...
4,2842059-doc,documentation,,The following columns are in the pg_database t...


In [None]:
# You can remove training data if there's obsolete/incorrect information. 
vn.remove_training_data(id='1-ddl')

```## Asking the AI
Whenever you ask a new question, it will find the 10 most relevant pieces of training data and use it as part of the LLM prompt to generate the SQL.
```python
vn.ask(question=...)

In [None]:
# most_recent_sql = vn.generate_sql(most_recent_question, allow_llm_to_see_data=True)

In [None]:
from vanna.flask import VannaFlaskApp
app = VannaFlaskApp(vn)
app.run()

Your app is running at:
http://localhost:8084
 * Serving Flask app 'vanna.flask'
 * Debug mode: on
None
None
None
None
None
None
None
None
None
None


In [None]:
# #https://github.com/vanna-ai/vanna-streamlit-simple/tree/main

# # Import python packages
# import streamlit as st

# # ======= BEGIN VANNA SETUP =======

# from vanna.remote import VannaDefault

# vanna_model_name=st.secrets.vanna.vanna_model_name
# vanna_api_key = st.secrets.vanna.vanna_api_key
# vn = VannaDefault(model=vanna_model_name, api_key=vanna_api_key)

# # vn.connect_to...(YOUR_DATABASE_CREDENTIALS)
# # example using suprabase
# supra_host=st.secrets.supra.supra_host
# supra_user=st.secrets.supra.supra_user
# supra_password=st.secrets.supra.supra_password

# vn.connect_to_postgres(host=supra_host, dbname='postgres', user=supra_user, password=supra_password, port=5432)

# # ======= END VANNA SETUP =======

# my_question = st.session_state.get("my_question", default=None)

# if my_question is None:
#     my_question = st.text_input(
#         "Ask me a question about your data",
#         key="my_question",
#     )
# else:
#     st.text(my_question)
    
#     sql = vn.generate_sql(my_question)

#     st.text(sql)

#     df = vn.run_sql(sql)    
        
#     st.dataframe(df, use_container_width=True)

#     code = vn.generate_plotly_code(question=my_question, sql=sql, df=df)

#     fig = vn.get_plotly_figure(plotly_code=code, df=df)

#     st.plotly_chart(fig, use_container_width=True)