In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

# Step 1: Configure Snowflake connection parameters
SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_dfBASE = os.environ.get("SNOWFLAKE_dfBASE")
SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

print("Snowflake configuration loaded")

# Step 2: Create Snowflake connection
connection_string = (
    f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
    f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_dfBASE}/{SNOWFLAKE_SCHEMA}"
    f"?warehouse={SNOWFLAKE_WAREHOUSE}"
)
engine = create_engine(connection_string)
print("Snowflake connection established")

# Step 3: Get list of tables from Snowflake schema
query = f"""
SELECT 
    t.TABLE_NAME
FROM {SNOWFLAKE_dfBASE}.INFORMATION_SCHEMA.TABLES t 
WHERE t.TABLE_TYPE = 'BASE TABLE' 
AND t.TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA}'
"""
conn = engine.connect()
tables_df = pd.read_sql(query, conn.connection)
table_names = tables_df['TABLE_NAME'].tolist()
print(f"Found {len(table_names)} tables in schema")

Snowflake configuration loaded
Snowflake connection established
Found 3 tables in schema


In [None]:
# Step 4: Set the table to analyze
table_to_analyze = "PATIENT_ADMISSIONS"  # Replace with your table name

# Step 5: Get table df
query = f"SELECT * FROM {SNOWFLAKE_dfBASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
df = pd.read_sql(query, conn.connection)
df

Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,allen,45,M,abc,perfectly fine,2025-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2025-02-25,Lisinopril,Normal
1,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.28,328,Urgent,2024-02-02,Paracetamol,Normal
2,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.33,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
3,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.10,205,Emergency,2022-10-07,Aspirin,Normal
4,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78,450,Elective,2020-12-18,Ibuprofen,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55496,eLIZABeTH jaCkSOn,42,Female,O+,Asthma,2020-08-16,Joshua Jarvis,Jones-Thompson,Blue Cross,2650.71,417,Elective,2020-09-15,Penicillin,Abnormal
55497,KYle pEREz,61,Female,AB-,Obesity,2020-01-23,Taylor Sullivan,Tucker-Moyer,Cigna,31457.80,316,Elective,2020-02-01,Aspirin,Normal
55498,HEATher WaNG,38,Female,B+,Hypertension,2020-07-13,Joe Jacobs DVM,"and Mahoney Johnson Vasquez,",UnitedHealthcare,27620.76,347,Urgent,2020-08-10,Ibuprofen,Abnormal
55499,JENniFER JOneS,43,Male,O-,Arthritis,2019-05-25,Kimberly Curry,"Jackson Todd and Castro,",Medicare,32451.09,321,Elective,2019-05-31,Ibuprofen,Abnormal


In [5]:

from sklearn.model_selection import StratifiedShuffleSplit

# Check if 'target' column exists, if not, create a dummy one
if 'target' not in df.columns:
    print("No 'target' column found. Creating a dummy one for demonstration purposes.")
    df['target'] = np.random.choice(['A', 'B', 'C'], size=len(df), p=[0.4, 0.3, 0.3])

# Map target to numerical values if necessary
target_map = {'A': 0, 'B': 1, 'C': 2}  # Example mapping
df['target_num'] = df['target'].map(target_map)

# Define features (X) and target (y)
X = df.drop(['target', 'target_num'], axis=1)
y = df['target_num']

# Perform stratified sampling
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

for train_index, test_index in sss.split(X, y):
    X_sampled, y_sampled = X.iloc[test_index], y.iloc[test_index]
    sampled_indices = df.index[test_index]  # Store the indices of the sampled df
    
    # Create a sampled df frame
    sampled_df = df.loc[sampled_indices].copy()
    sampled_df['target_num'] = sampled_df['target'].map(target_map)

# Print basic info about the sampled df
print(f"Sampled df shape: {sampled_df.shape}")
print(sampled_df)

Sampled df shape: (5551, 17)
                     NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION  \
12158       keVin JeNKiNS   50  Female        AB+           Obesity   
24924        aNnE sTEwaRT   21    Male         B+         Arthritis   
19210    AShlEY aLexandER   34    Male        AB-         Arthritis   
17538          maRk SmItH   70  Female         O-            Cancer   
26399      rACHEL BenneTT   45    Male         B+            Asthma   
...                   ...  ...     ...        ...               ...   
6691   kImbeRLY rOdRIgUEz   44  Female        AB+            Asthma   
50786    dAnIel rOdRIGUEz   40  Female        AB+            Asthma   
30061    joshUa MADdox md   73  Female        AB+            Asthma   
19992       joHn THOMpsOn   66  Female        AB-            Asthma   
33049     rEgiNAlD fISHer   82  Female         O-      Hypertension   

      DATE_OF_ADMISSION            DOCTOR                         HOSPITAL  \
12158        2019-12-13      John Jackso

In [28]:
### optimized for shuffle

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Check if 'target' column exists, if not, create a dummy one
if 'target' not in df.columns:
    print("No 'target' column found. Creating a dummy one for demonstration purposes.")
    df['target'] = np.random.choice(['A', 'B', 'C'], size=len(df), p=[0.4, 0.3, 0.3])

# Map target to numerical values if necessary
target_map = {'A': 0, 'B': 1, 'C': 2}  # Example mapping
df['target_num'] = df['target'].map(target_map)

# Define features (X) and target (y)
X = df.drop(['target', 'target_num'], axis=1)
y = df['target_num']

# Perform stratified sampling
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)

best_sample = None
best_criteria = df['target'].value_counts().std()  # Define your criteria here (e.g., class balance)
print(best_criteria)

for train_index, test_index in sss.split(X, y):
    sampled_indices = df.index[test_index]
    sampled_df = df.loc[sampled_indices].copy()
    
    # Calculate your criteria here (e.g., class balance)
    criteria = sampled_df['target'].value_counts().std()  # Example criteria
    print(criteria)
    
    if best_criteria is None or criteria < best_criteria:
        best_sample = sampled_df
        best_criteria = criteria

print("Best Sample:")
print(best_sample)



3171.2950561771027
316.69596355705784
316.69596355705784
316.69596355705784
316.69596355705784
316.69596355705784
Best Sample:
                     NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION  \
12158       keVin JeNKiNS   50  Female        AB+           Obesity   
24924        aNnE sTEwaRT   21    Male         B+         Arthritis   
19210    AShlEY aLexandER   34    Male        AB-         Arthritis   
17538          maRk SmItH   70  Female         O-            Cancer   
26399      rACHEL BenneTT   45    Male         B+            Asthma   
...                   ...  ...     ...        ...               ...   
6691   kImbeRLY rOdRIgUEz   44  Female        AB+            Asthma   
50786    dAnIel rOdRIGUEz   40  Female        AB+            Asthma   
30061    joshUa MADdox md   73  Female        AB+            Asthma   
19992       joHn THOMpsOn   66  Female        AB-            Asthma   
33049     rEgiNAlD fISHer   82  Female         O-      Hypertension   

      DATE_OF_ADMISS

In [24]:
required_vars = {
    "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
    "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
    "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
    "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
    "SNOWFLAKE_SCHEMA": os.environ.get("SNOWFLAKE_SCHEMA")
}

print(required_vars["SNOWFLAKE_SCHEMA"])

TEST3


In [25]:
from langchain_openai import AzureChatOpenAI


model = AzureChatOpenAI(
    azure_endpoint=required_vars["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=required_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
    openai_api_version=required_vars["AZURE_OPENAI_API_VERSION"],
    openai_api_key=required_vars["AZURE_OPENAI_API_KEY"],
)

In [None]:
DQ_checks_Prompt = '''
        
'''