In [20]:
from langchain_openai import AzureChatOpenAI
import snowflake.connector
from dotenv import load_dotenv
import os
import time
from datetime import datetime
from sqlalchemy import create_engine
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from tqdm import tqdm
import backoff
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
import pandas as pd
import numpy as np
from typing import List, Dict
import json
from concurrent.futures import ThreadPoolExecutor
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from io import StringIO
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

True

In [21]:
required_vars = {
    "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
    "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
    "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
    "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
    "SNOWFLAKE_SCHEMA": os.environ.get("SNOWFLAKE_SCHEMA")
}

print(required_vars["SNOWFLAKE_SCHEMA"])

TEST2


In [14]:
connection_string = (
    f"snowflake://{required_vars['SNOWFLAKE_USER']}:"
    f"{required_vars['SNOWFLAKE_PASSWORD']}@"
    f"{required_vars['SNOWFLAKE_ACCOUNT']}/"
    f"{required_vars['SNOWFLAKE_DATABASE']}/"
    f"{required_vars['SNOWFLAKE_SCHEMA']}?warehouse="
    f"{required_vars['SNOWFLAKE_WAREHOUSE']}"
)

engine = create_engine(connection_string)
print("Connected to Snowflake")

Connected to Snowflake


In [15]:
query = f"""
    SELECT 
        c.TABLE_NAME, c.COLUMN_NAME, c.DATA_TYPE, c.IS_NULLABLE, c.CHARACTER_MAXIMUM_LENGTH
    FROM {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.COLUMNS c
    JOIN {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.TABLES t 
        ON c.TABLE_NAME = t.TABLE_NAME
    WHERE t.TABLE_TYPE = 'BASE TABLE' 
    AND c.TABLE_SCHEMA = '{required_vars['SNOWFLAKE_SCHEMA']}'
"""

conn = engine.connect()
metadata = pd.read_sql(query, conn.connection)
metadata.columns = [col.lower() for col in metadata.columns]

print("\nAvailable tables:", metadata['table_name'].unique())
print(metadata.to_string)


Available tables: ['PATIENT_ADMISSIONS']
<bound method DataFrame.to_string of             table_name         column_name data_type is_nullable  \
0   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
1   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
2   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
3   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
4   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
5   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
6   PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
7   PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
8   PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
9   PATIENT_ADMISSIONS                 AGE    NUMBER         YES   
10  PATIENT_ADMISSIONS                 AGE    NUMBER         YES   
11  PATIENT_ADMISSIONS                 AGE    NUMBER         YES   
12  PATIENT_ADMISSIONS         ROOM_N

In [16]:
for i in metadata['table_name'].unique():
    print(i)

PATIENT_ADMISSIONS


In [17]:
table_name = metadata['table_name'].unique()[0]  # Get first table only

print(f"\nRetrieving data from table: {table_name}")

query = f"SELECT * FROM {required_vars['SNOWFLAKE_DATABASE']}.{required_vars['SNOWFLAKE_SCHEMA']}.{table_name}"
conn = engine.connect()
df = pd.read_sql(query, conn.connection)
print(f"Retrieved {len(df)} rows")

df.head(2)


Retrieving data from table: PATIENT_ADMISSIONS
Retrieved 11139 rows


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,JASmINe aGuIlaR,82,Male,AB+,Asthma,2020-07-01,Daniel Ferguson,Sons Rich and,Cigna,50119.22,316,Elective,2020-07-14,Aspirin,Abnormal
1,ChRISTopher BerG,58,Female,AB-,Cancer,2021-05-23,Heather Day,Padilla-Walker,UnitedHealthcare,19784.63,249,Elective,2021-06-22,Paracetamol,Inconclusive


In [18]:
model = AzureChatOpenAI(
    azure_endpoint=required_vars["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=required_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
    openai_api_version=required_vars["AZURE_OPENAI_API_VERSION"],
    openai_api_key=required_vars["AZURE_OPENAI_API_KEY"],
)

In [19]:
chunk_size = 1500
offset = 0
analysis_results = []


while True:
    # Fetch data chunk
    query = f"""
    SELECT *
    FROM {table_name}
    LIMIT {chunk_size}
    OFFSET {offset}
    """
    chunk = pd.read_sql(query, conn)
    
    if chunk.empty:
        break

    data_quality_prompt = f"""
    Analyze this data sample ({len(chunk)} total rows, showing first 100):

    Table Data:
    {chunk.to_string()}

    Table Metadata:
    {metadata[['column_name', 'data_type']].to_string()}

    Please provide a comprehensive analysis focusing on:
    1. Data quality issues
    2. Pattern anomalies
    3. Potential sensitive data fields
    4. Suggested improvements

    Format your response as JSON with these keys:
    - data_quality_issues
    - recommended_solutions
    - sql_queries
    - sensitive_data_recommendations
    """

    system_prompt_quality = """You are a specialized data analyst expert in Snowflake databases.
    Analyze the provided data sample focusing on data quality and patterns.
    Keep responses focused and brief. Ensure JSON format."""

    messages_quality = [
        {"role": "system", "content": system_prompt_quality},
        {"role": "user", "content": data_quality_prompt}
    ]

    print("Generating data quality insights...")
    quality_response = model.invoke(messages_quality).content.replace("plaintext", "").replace("json", "").replace("```", "").strip()
    print("\nData quality insights generated")

    analysis_results.append({
        'chunk': offset // chunk_size + 1,
        'analysis': quality_response
    })
    
    offset += chunk_size
    
    if len(chunk) < chunk_size:
        break

# Print results
for result in analysis_results:
    print(f"\nAnalysis for chunk {result['chunk']}:")
    print(result['analysis'])



Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated
Generating data quality insights...

Data quality insights generated

Analysis for chunk 1:
{
  "data_quality_issues": [
    {
      "description": "Inconsistent casing in 'name' and 'doctor' fields.",
      "rows_affected": [0, 1, 3, 4, 9, 10]
    },
    {
      "description": "Inconsistent gender values (e.g., 'Male' and 'Female' for individuals with typically opposite gendered names).",
      "rows_affected": [0, 1, 10, 15]
    },
    {
      "description": "Negative or abnormally low 'billing_amount'.",
    