In [1]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

# Step 1: Configure Snowflake connection parameters
SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

print("Snowflake configuration loaded")

# Step 2: Create Snowflake connection
connection_string = (
    f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
    f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
    f"?warehouse={SNOWFLAKE_WAREHOUSE}"
)
engine = create_engine(connection_string)
print("Snowflake connection established")

# Step 3: Get list of tables from Snowflake schema
query = f"""
SELECT 
    t.TABLE_NAME
FROM {SNOWFLAKE_DATABASE}.INFORMATION_SCHEMA.TABLES t 
WHERE t.TABLE_TYPE = 'BASE TABLE' 
AND t.TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA}'
"""
conn = engine.connect()
tables_df = pd.read_sql(query, conn.connection)
table_names = tables_df['TABLE_NAME'].tolist()
print(f"Found {len(table_names)} tables in schema")

# Step 4: Set the table to analyze
table_to_analyze = "PATIENT_ADMISSIONS"  # Replace with your table name

# Step 5: Get table data
query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
df = pd.read_sql(query, conn.connection)
print(f"Loaded {len(df)} rows from {table_to_analyze}")

# Step 6: Prepare data for anomaly detection
print("Preparing data for anomaly detection")

# Make a copy of the dataframe for encoding
df_encoded = df.copy()

# Identify categorical columns (object and category dtypes)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Found {len(categorical_cols)} categorical columns")

# Encode categorical columns - THIS IS THE FIX
if categorical_cols:
    # Create encoder with correct parameters BEFORE using it
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # Then fit and transform
    df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])

# Fill any remaining NaNs with 0
df_encoded = df_encoded.fillna(0)

print(f"Data encoded with {df_encoded.shape[1]} features")

# Display sample of encoded data
print("Sample of encoded data:")
display(df_encoded.head(2))

# Step 7: Train anomaly detection model
print("Training Isolation Forest model")
model = IsolationForest(
    contamination=0.001,
    random_state=42,
    n_estimators=100
)

model.fit(df_encoded)

# Step 8: Predict anomalies
print("Detecting anomalies")
anomaly_scores = model.decision_function(df_encoded)
anomaly_labels = model.predict(df_encoded)

# Step 9: Add scores and anomaly flags to original data
df['anomaly_score'] = anomaly_scores
df['is_anomaly'] = anomaly_labels == -1

# Get anomalies and sort by score
anomalies = df[df['is_anomaly']].sort_values('anomaly_score')

print(f"Found {len(anomalies)} anomalous records out of {len(df)} total records")

display(anomalies)


if len(anomalies) > 0:
    print("\nSample of anomalous records (top 5 most anomalous):")
    print(anomalies.head())
    
    # Step 10: Save anomalies to CSV
    output_file = f"{table_to_analyze}_anomalies.csv"
    anomalies.to_csv(output_file, index=False)
    print(f"Saved anomalies to {output_file}")
else:
    print("No anomalies detected in this table")

# Close connection when done
conn.close()
engine.dispose()
print("Connection closed")

Snowflake configuration loaded
Snowflake connection established
Found 3 tables in schema
Loaded 55501 rows from PATIENT_ADMISSIONS
Preparing data for anomaly detection
Found 12 categorical columns
Data encoded with 15 features
Sample of encoded data:


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,26448.0,45,1.0,8.0,6.0,1827.0,11860.0,4811.0,3.0,1500.75,203,1.0,1856.0,3.0,2.0
1,3068.0,30,2.0,5.0,2.0,1729.0,26613.0,29934.0,1.0,18856.28,328,2.0,1730.0,4.0,2.0


Training Isolation Forest model
Detecting anomalies
Found 56 anomalous records out of 55501 total records


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS,anomaly_score,is_anomaly
3306,ALBerT wALteRs,80,Female,A+,Arthritis,2019-05-11,Shelby Walters,"Cox Stone, and Merritt",Medicare,4699.31,108,Elective,2019-06-09,Penicillin,Abnormal,-0.020288,True
47586,ANGeLA haWkiNs,67,Female,A+,Hypertension,2024-04-10,Anthony Green,Ferguson-West,UnitedHealthcare,7991.89,497,Urgent,2024-04-16,Paracetamol,Abnormal,-0.019144,True
13322,ChRIsTina rOBertS,50,Male,O-,Arthritis,2024-03-08,Christine Graham,"Lopez Jordan, Clark and",UnitedHealthcare,5377.57,491,Elective,2024-03-31,Paracetamol,Abnormal,-0.015631,True
12542,pATrICk gUTiERrez dvM,21,Female,O-,Obesity,2023-12-05,William Lynch,Johnson Inc,UnitedHealthcare,49577.49,438,Elective,2023-12-20,Aspirin,Abnormal,-0.015357,True
50603,cYNthia BrOWn,21,Female,A-,Obesity,2019-05-16,Matthew Stewart,Booker-Mitchell,Aetna,45800.39,490,Elective,2019-05-30,Penicillin,Abnormal,-0.014084,True
23995,cYNthia BrOWn,21,Female,A-,Obesity,2019-05-16,Matthew Stewart,Booker-Mitchell,Aetna,45800.39,490,Elective,2019-05-30,Penicillin,Abnormal,-0.014084,True
23577,ricHaRD BArKER,81,Male,O-,Diabetes,2024-04-02,Troy Choi,"Bray, Wilson Alvarez and",Medicare,45620.19,489,Elective,2024-04-23,Aspirin,Abnormal,-0.013013,True
2572,jonATHAn grEen,81,Female,A+,Arthritis,2024-05-02,Jason Hart,"Nelson, Rush and Moore",Aetna,1518.44,102,Urgent,2024-05-13,Lipitor,Abnormal,-0.011995,True
43311,nANCy WHiTeHEad,22,Male,A+,Arthritis,2019-12-15,Eric Johnson,Walls LLC,Aetna,48582.64,125,Urgent,2019-12-16,Paracetamol,Abnormal,-0.010901,True
4886,craIG AndErSON,84,Male,A+,Obesity,2019-06-16,Catherine Soto,Tran LLC,UnitedHealthcare,3182.16,138,Urgent,2019-07-02,Aspirin,Abnormal,-0.008595,True



Sample of anomalous records (top 5 most anomalous):
                        NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION  \
3306          ALBerT wALteRs   80  Female         A+         Arthritis   
47586         ANGeLA haWkiNs   67  Female         A+      Hypertension   
13322      ChRIsTina rOBertS   50    Male         O-         Arthritis   
12542  pATrICk gUTiERrez dvM   21  Female         O-           Obesity   
50603          cYNthia BrOWn   21  Female         A-           Obesity   

      DATE_OF_ADMISSION            DOCTOR                 HOSPITAL  \
3306         2019-05-11    Shelby Walters   Cox Stone, and Merritt   
47586        2024-04-10     Anthony Green            Ferguson-West   
13322        2024-03-08  Christine Graham  Lopez Jordan, Clark and   
12542        2023-12-05     William Lynch              Johnson Inc   
50603        2019-05-16   Matthew Stewart          Booker-Mitchell   

      INSURANCE_PROVIDER  BILLING_AMOUNT  ROOM_NUMBER ADMISSION_TYPE  \
3306     

## GRID SEARCH CV

In [4]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
from sklearn.metrics import make_scorer
warnings.filterwarnings("ignore")
load_dotenv(override=True)

class AnomalyDetector:
    def __init__(self, contamination=0.001):
        self.contamination = contamination
        self.models = {
            'isolation_forest': None,
            'local_outlier_factor': None,
            'robust_covariance': None
        }
        self.best_params = {}
        
    def prepare_data(self, df):
        """Prepare data for anomaly detection"""
        df_encoded = df.copy()
        
        # Encode categorical columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        if categorical_cols:
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])
        
        # Scale numerical features
        scaler = StandardScaler()
        df_encoded = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)
        display(df_encoded.head())
        return df_encoded.fillna(0)

    def grid_search_isolation_forest(self, X):
        """Perform grid search for IsolationForest"""
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_samples': [0.3, 0.5, 0.9],
            'contamination': ['auto', 0.0002, 0.0005],
            'max_features': [0.5, 0.7, 1.0]
        }
        
        base_model = IsolationForest(random_state=42)
        
        # Custom scorer for anomaly detection
        scorer = make_scorer(lambda y_true, y_pred: -np.mean(np.square(y_pred)), greater_is_better=False)
        
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=3,
            scoring=scorer,
            n_jobs=-1
        )
        
        grid_search.fit(X)
        self.best_params['isolation_forest'] = grid_search.best_params_
        return grid_search.best_estimator_

    def detect_anomalies(self, df):
        """Detect anomalies using multiple methods"""
        X = self.prepare_data(df)
        
        # Initialize results dictionary
        results = {
            'scores': {},
            'labels': {},
            'ensemble_score': None,
            'ensemble_label': None
        }
        
        # 1. Isolation Forest with Grid Search
        print("Training Isolation Forest with Grid Search...")
        self.models['isolation_forest'] = self.grid_search_isolation_forest(X)
        results['scores']['isolation_forest'] = self.models['isolation_forest'].decision_function(X)
        results['labels']['isolation_forest'] = self.models['isolation_forest'].predict(X)
        
        # 2. Local Outlier Factor
        print("Training Local Outlier Factor...")
        self.models['local_outlier_factor'] = LocalOutlierFactor(
            contamination=self.contamination,
            n_neighbors=20,
            n_jobs=-1
        )
        results['labels']['lof'] = self.models['local_outlier_factor'].fit_predict(X)
        results['scores']['lof'] = self.models['local_outlier_factor'].negative_outlier_factor_
        
        # 3. Robust Covariance (Elliptic Envelope)
        print("Training Robust Covariance...")
        self.models['robust_covariance'] = EllipticEnvelope(
            contamination=self.contamination,
            random_state=42
        )
        results['labels']['robust_covariance'] = self.models['robust_covariance'].fit_predict(X)
        results['scores']['robust_covariance'] = self.models['robust_covariance'].decision_function(X)
        
        # Ensemble scoring
        results['ensemble_score'] = np.mean([
            MinMaxScaler().fit_transform(results['scores'][model].reshape(-1, 1)).flatten()
            for model in results['scores']
        ], axis=0)
        
        # Ensemble labels (majority voting)
        results['ensemble_label'] = np.mean([
            results['labels'][model] == -1 for model in results['labels']
        ], axis=0) >= 0.5
        
        return results

def main():
    # Snowflake connection setup
    SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
    SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
    SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
    SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
    SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
    SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

    connection_string = (
        f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
        f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
        f"?warehouse={SNOWFLAKE_WAREHOUSE}"
    )
    
    engine = create_engine(connection_string)
    print("Snowflake connection established")

    # Get table to analyze
    table_to_analyze = "PATIENT_ADMISSIONS" # PATIENT_ADMISSIONS:55k rows, CUSTOMER_ADDRESS: 50 million rows
    
    # Load data
    query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    
    print(f"Loaded {len(df)} rows from {table_to_analyze}")
    
    # Initialize and run anomaly detection
    detector = AnomalyDetector(contamination=0.0001)
    results = detector.detect_anomalies(df)
    
    # Add results to original dataframe
    df['ensemble_anomaly_score'] = results['ensemble_score']
    df['is_anomaly'] = results['ensemble_label']
    
    # Get anomalies and sort by ensemble score
    anomalies = df[df['is_anomaly']].sort_values('ensemble_anomaly_score')
    
    print(f"\nFound {len(anomalies)} anomalous records out of {len(df)} total records")
    
    if len(anomalies) > 0:
        print("\nTop 5 most anomalous records:")
        print(anomalies.head())
        
        # Save anomalies to CSV
        output_file = f"{table_to_analyze}_anomalies_enhanced.csv"
        anomalies.to_csv(output_file, index=False)
        print(f"\nSaved anomalies to {output_file}")
        
        # Print best parameters from grid search
        print("\nBest parameters for Isolation Forest:")
        print(detector.best_params['isolation_forest'])
    else:
        print("No anomalies detected in this table")
    
    engine.dispose()
    print("Connection closed")
    

if __name__ == "__main__":
    main()

Snowflake connection established
Loaded 55514 rows from PATIENT_ADMISSIONS


Unnamed: 0,name,age,gender,blood_type,medical_condition,date_of_admission,doctor,hospital,insurance_provider,billing_amount,room_number,admission_type,discharge_date,medication,test_results
0,1.643691,0.173699,-0.000648,1.187255,0.520391,1.739453,-0.722685,-1.440965,0.075958,-1.655559,0.033469,-3.463716,1.768408,0.467559,1.868911
1,1.643898,0.426211,-2.000433,0.510697,0.13001,1.741344,-0.731963,-1.335862,0.882984,-1.669665,0.866367,-2.664328,1.770299,-0.530905,2.668549
2,1.407503,0.931236,-0.000648,-0.50414,1.691535,1.743235,-0.744883,-0.936059,-1.538096,-1.620349,1.742645,0.533223,1.77219,1.133202,-0.530005
3,-0.395605,-0.331327,-2.000433,-1.180699,-0.650753,1.745126,-0.721731,1.09751,1.286498,-1.578156,2.601571,-1.86494,1.774081,1.798846,-2.129282
4,1.643829,0.173699,-0.000648,1.187255,0.520391,1.739453,-0.722685,-1.440965,0.075958,-1.655559,0.033469,-3.463716,1.768408,0.467559,1.868911


Training Isolation Forest with Grid Search...
Training Local Outlier Factor...
Training Robust Covariance...

Found 7 anomalous records out of 55514 total records

Top 5 most anomalous records:
      name  age gender blood_type    medical_condition date_of_admission  \
9   test_2   45      F         AA              perfect        2008-02-20   
13  test_5   40   Male             high blood pressure        2000-02-20   
8   test_1  450      M        abc       perfectly fine        2025-02-20   
11  test_4  400      F         AA  high blood pressure        2025-02-20   
10  test_3  450      M        XYZ         Hypertension        2025-02-20   

       doctor       hospital insurance_provider  billing_amount  room_number  \
9   Dr. Smith  City Hospital    HealthCare Inc.         1500.75          203   
13  Dr. Smith  City Hospital    HealthCare Inc.         1500.75            2   
8   Dr. Smith  City Hospital    HealthCare Inc.         1500.75          203   
11  Dr. Smith  City Hospital 

## line by line

In [51]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
from sklearn.metrics import make_scorer

# Suppress warnings
warnings.filterwarnings("ignore")

# Load environment variables
load_dotenv(override=True)

True

In [52]:
required_vars = {
    "AZURE_OPENAI_ENDPOINT": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "AZURE_OPENAI_4o_DEPLOYMENT_NAME": os.environ.get("AZURE_OPENAI_4o_DEPLOYMENT_NAME"),
    "AZURE_OPENAI_API_VERSION": os.environ.get("AZURE_OPENAI_API_VERSION"),
    "AZURE_OPENAI_API_KEY": os.environ.get("AZURE_OPENAI_API_KEY"),
    "SNOWFLAKE_USER": os.environ.get("SNOWFLAKE_USER"),
    "SNOWFLAKE_PASSWORD": os.environ.get("SNOWFLAKE_PASSWORD"),
    "SNOWFLAKE_ACCOUNT": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "SNOWFLAKE_WAREHOUSE": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "SNOWFLAKE_DATABASE": os.environ.get("SNOWFLAKE_DATABASE"),
    "SNOWFLAKE_SCHEMA": os.environ.get("SNOWFLAKE_SCHEMA")
}

print(required_vars["SNOWFLAKE_SCHEMA"])

TEST2


In [53]:
connection_string = (
    f"snowflake://{required_vars['SNOWFLAKE_USER']}:"
    f"{required_vars['SNOWFLAKE_PASSWORD']}@"
    f"{required_vars['SNOWFLAKE_ACCOUNT']}/"
    f"{required_vars['SNOWFLAKE_DATABASE']}/"
    f"{required_vars['SNOWFLAKE_SCHEMA']}?warehouse="
    f"{required_vars['SNOWFLAKE_WAREHOUSE']}"
)

engine = create_engine(connection_string)
print("Connected to Snowflake")

Connected to Snowflake


In [54]:
query = f"""
    SELECT 
        c.TABLE_NAME, c.COLUMN_NAME, c.DATA_TYPE, c.IS_NULLABLE, c.CHARACTER_MAXIMUM_LENGTH
    FROM {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.COLUMNS c
    JOIN {required_vars['SNOWFLAKE_DATABASE']}.INFORMATION_SCHEMA.TABLES t 
        ON c.TABLE_NAME = t.TABLE_NAME
    WHERE t.TABLE_TYPE = 'BASE TABLE' 
    AND c.TABLE_SCHEMA = '{required_vars['SNOWFLAKE_SCHEMA']}'
"""

conn = engine.connect()
metadata = pd.read_sql(query, conn.connection)
metadata.columns = [col.lower() for col in metadata.columns]

print("\nAvailable tables:", metadata['table_name'].unique())
print(metadata.to_string)


Available tables: ['PATIENT_ADMISSIONS' 'CUSTOMER_ADDRESS']
<bound method DataFrame.to_string of             table_name         column_name data_type is_nullable  \
0   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
1   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
2   PATIENT_ADMISSIONS      DISCHARGE_DATE      DATE         YES   
3   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
4   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
5   PATIENT_ADMISSIONS   DATE_OF_ADMISSION      DATE         YES   
6     CUSTOMER_ADDRESS          CA_COUNTRY      TEXT         YES   
7     CUSTOMER_ADDRESS      CA_STREET_NAME      TEXT         YES   
8     CUSTOMER_ADDRESS            CA_STATE      TEXT         YES   
9   PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
10  PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
11  PATIENT_ADMISSIONS  INSURANCE_PROVIDER      TEXT         YES   
12    CUSTOMER_ADD

In [55]:
for i in metadata['table_name'].unique():
    print(i)

PATIENT_ADMISSIONS
CUSTOMER_ADDRESS


In [58]:
table_name = metadata['table_name'].unique()[0]  # Get first table only

print(f"\nRetrieving data from table: {table_name}")

query = f"SELECT * FROM {required_vars['SNOWFLAKE_DATABASE']}.{required_vars['SNOWFLAKE_SCHEMA']}.{table_name}"
conn = engine.connect()
df = pd.read_sql(query, conn.connection)
print(f"Retrieved {len(df)} rows")

df['NAME'] = df['NAME'].str.title()

df.head(20)


Retrieving data from table: PATIENT_ADMISSIONS
Retrieved 55514 rows


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,Test_5@@@###,55,M,O+,Diabetes,2025-02-21,Dr. Sm1th_!!,C!ty H0sp!tal,HealthCare Inc.,2000.5,305,!!!@##ER_ADMISSION###,2025-02-27,Metformin,Stable
1,Test_6_Error_Case,60,F,B-,Chronic pain1234,2025-02-22,Dr. John Doe,City Hospital,No Coverage!!,1800.0,401,0utp@tient123,2025-02-28,Ibuprofen-500MG!,Unkn0wn??
2,Random_Text_123,70,M,AB+,Severe cold & fever!!!,2025-02-23,Dr. A.I. Bot,General Hosp1tal@@,###None###,2500.99,502,UnexpectedInput<>?,2025-03-01,Paracetamol & Zinc,Critical but stable
3,Patient_999$$$,45,F,A-,Asthma_&_COPD???,2025-02-24,Dr. Strange_M.D.,Unknown Facility 007,Self-Pay!!,3100.75,601,Admissi0nTypeError123,2025-03-02,Ventolin Inhaler+,###Inconclusive###
4,Test_68@@@###,55,M,O+,Diabetes,2025-02-21,Dr. Sm1th_!!,C!ty H0sp!tal,HealthCare Inc.,2000.5,305,!!!@##ER_ADMISSION###,2025-02-27,Metformin,Stable
5,Test_9_Error_Case,60,F,B-,Chronic pain1234,2025-02-22,Dr. John Doe,City Hospital,No Coverage!!,1800.0,401,0utp@tient123,2025-02-28,Ibuprofen-500MG!,Unkn0wn??
6,Random_Text_2344,70,M,AB+,Severe cold & fever!!!,2025-02-23,Dr. A.I. Bot,General Hosp1tal@@,###None###,2500.99,502,UnexpectedInput<>?,2025-03-01,Paracetamol & Zinc,Critical but stable
7,Patient_000$$$,45,F,A-,Asthma_&_COPD???,2025-02-24,Dr. Strange_M.D.,Unknown Facility 007,Self-Pay!!,3100.75,601,Admissi0nTypeError123,2025-03-02,Ventolin Inhaler+,###Inconclusive###
8,Test_1,450,M,abc,perfectly fine,2025-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2025-02-25,,Normal
9,Test_2,45,F,AA,perfect,2008-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2040-02-25,Lisinopril,critical


In [59]:
model = AzureChatOpenAI(
    azure_endpoint=required_vars["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=required_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
    openai_api_version=required_vars["AZURE_OPENAI_API_VERSION"],
    openai_api_key=required_vars["AZURE_OPENAI_API_KEY"],
)

In [60]:
df_encoded = df.copy()

In [61]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Encode categorical columns
if categorical_cols:
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])

# Scale features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)
X = X.fillna(0)

In [79]:
display(df_encoded.head())

Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,37298.0,55,2.0,9.0,5.0,1830.0,11862.0,3644.0,4.0,2000.5,305,0.0,1858.0,6.0,5.0
1,37301.0,60,0.0,7.0,4.0,1831.0,11755.0,4812.0,6.0,1800.0,401,1.0,1859.0,3.0,6.0
2,31359.0,70,2.0,4.0,8.0,1832.0,11606.0,9255.0,0.0,2500.99,502,5.0,1860.0,8.0,2.0
3,30273.0,45,0.0,2.0,2.0,1833.0,11873.0,31854.0,7.0,3100.75,601,2.0,1861.0,10.0,0.0
4,37300.0,55,2.0,9.0,5.0,1830.0,11862.0,3644.0,4.0,2000.5,305,0.0,1858.0,6.0,5.0


In [62]:
# grid search cv
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': [0.3, 0.5, 0.9],
    'contamination': ['auto', 0.0002, 0.0005],
    'max_features': [0.5, 0.7, 1.0]
}

In [63]:
base_model = IsolationForest(random_state=42)
scorer = make_scorer(lambda y_true, y_pred: -np.mean(np.square(y_pred)), greater_is_better=False)


# y pred is prdictions made by the model and y_true is the actual target variable
# This code creates a scoring system that:

# Takes predictions from a model
# Squares them
# Finds their average
# Makes that average negative
# Considers lower scores better thats why greater_is_better=False




In [64]:
print("Running Isolation Forest grid search...")
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, cv=3, scoring=scorer, n_jobs=-1)

# uses all combintions from grid given above
# cv=3: Uses 3-fold cross-validation (splits data into 3 parts for validation)
# n_jobs=-1: Uses all available CPU cores to speed up the search

grid_search.fit(X)


Running Isolation Forest grid search...


In [65]:
best_if_model = grid_search.best_estimator_

if_scores = best_if_model.decision_function(X)
# More negative scores mean more normal data
# More positive scores mean more likely to be anomalies

if_labels = best_if_model.predict(X)
# predict() returns binary labels: usually 1 for anomalies and -1 for normal data
# These are like the final "yes/no" decisions about whether each point is an anomaly

In [66]:
print("Running Local Outlier Factor...")
lof = LocalOutlierFactor(contamination=0.0001, n_neighbors=20, n_jobs=-1)
# Expects 0.01% of data points to be anomalies

lof_labels = lof.fit_predict(X)
# Returns labels: -1 for anomalies, 1 for normal points


lof_scores = lof.negative_outlier_factor_
# Gets the anomaly scores for each data point
# More negative values indicate stronger anomalies
# Values closer to zero indicate normal points

Running Local Outlier Factor...


In [67]:
print("Running Robust Covariance...")
robust_cov = EllipticEnvelope(contamination=0.0001, random_state=42)
robust_cov.fit(X)
rc_labels = robust_cov.predict(X)
rc_scores = robust_cov.decision_function(X)

Running Robust Covariance...


In [68]:
print("Calculating ensemble scores...")
scores_list = [
    MinMaxScaler().fit_transform(if_scores.reshape(-1, 1)).flatten(),
    MinMaxScaler().fit_transform(lof_scores.reshape(-1, 1)).flatten(),
    MinMaxScaler().fit_transform(rc_scores.reshape(-1, 1)).flatten()
]
ensemble_score = np.mean(scores_list, axis=0)

Calculating ensemble scores...


In [69]:
# Calculate ensemble labels
labels_list = [if_labels == -1, lof_labels == -1, rc_labels == -1]
ensemble_label = np.mean(labels_list, axis=0) >= 0.5

In [70]:
df['ensemble_anomaly_score'] = ensemble_score
df['is_anomaly'] = ensemble_label


In [71]:
anomalies = df[df['is_anomaly']].sort_values('ensemble_anomaly_score')

print(f"\nFound {len(anomalies)} anomalies in {len(df)} records")


Found 7 anomalies in 55514 records


In [72]:
if len(anomalies) > 0:
    print("\nTop 5 anomalies:")
    display(anomalies.head(100))
    
    # Save to CSV
    output_file = f"{table_to_analyze}_anomalies_enhanced.csv"
    anomalies.to_csv(output_file, index=False)
    print(f"\nSaved to {output_file}")
    
    print("\nBest Isolation Forest parameters:")
    print(grid_search.best_params_)
else:
    print("No anomalies found")


Top 5 anomalies:


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS,ensemble_anomaly_score,is_anomaly
9,Test_2,45,F,AA,perfect,2008-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2040-02-25,Lisinopril,critical,0.332119,True
13,Test_5,40,Male,,high blood pressure,2000-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,2,Emergency,2025-02-26,Lisinopril,file not found,0.342057,True
8,Test_1,450,M,abc,perfectly fine,2025-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2025-02-25,,Normal,0.354104,True
11,Test_4,400,F,AA,high blood pressure,2025-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,some random text data,2025-02-26,Lisinopril,critical,0.377766,True
10,Test_3,450,M,XYZ,Hypertension,2025-02-20,Dr. Smith,City Hospital,HealthCare Inc.,1500.75,203,Emergency,2025-02-25,Lisinopril,Normal,0.409103,True
3,Patient_999$$$,45,F,A-,Asthma_&_COPD???,2025-02-24,Dr. Strange_M.D.,Unknown Facility 007,Self-Pay!!,3100.75,601,Admissi0nTypeError123,2025-03-02,Ventolin Inhaler+,###Inconclusive###,0.717151,True
4,Test_68@@@###,55,M,O+,Diabetes,2025-02-21,Dr. Sm1th_!!,C!ty H0sp!tal,HealthCare Inc.,2000.5,305,!!!@##ER_ADMISSION###,2025-02-27,Metformin,Stable,0.72429,True



Saved to PATIENT_ADMISSIONS_anomalies_enhanced.csv

Best Isolation Forest parameters:
{'contamination': 'auto', 'max_features': 0.5, 'max_samples': 0.3, 'n_estimators': 100}


In [73]:
# conn closee
# engine.dispose()

In [77]:


# Only proceed if anomalies were found
if len(anomalies) > 0:
    # Get metadata for the table
    metadata_query = f"""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = '{table_to_analyze}'
    """
    metadata = pd.read_sql(metadata_query, engine)

    # table sample:
    data_sample = df.sample(n=5).to_string()


    # Initialize Azure OpenAI
    model = AzureChatOpenAI(
        azure_endpoint=required_vars["AZURE_OPENAI_ENDPOINT"],
        azure_deployment=required_vars["AZURE_OPENAI_4o_DEPLOYMENT_NAME"],
        openai_api_version=required_vars["AZURE_OPENAI_API_VERSION"],
        openai_api_key=required_vars["AZURE_OPENAI_API_KEY"],
    )

    # Simple stats for context
    stats = {}
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number):
            stats[col] = {
                'mean': df[col].mean(),
                'std': df[col].std()
            }

    # Simplified prompt
    analysis_prompt = f"""
    Table: {table_to_analyze}
    Records: {len(df)}
    Anomalies Found: {len(anomalies)}

    Anomaly Sample:
    {anomalies.head(10).to_string()}

    Column Statistics:
    {json.dumps(stats, indent=2)}

    Please analyze these anomalies and provide:
    1. Verify which rows are genuine anomalies vs false positives
    2. Specific data quality issues found
    3. Root cause analysis for each genuine anomaly
    4. SQL queries to find similar issues
    5. Compliance and sensitive data concerns + masking techniques like PII, PHI, PCI, HIPAA, GDPR etc. based on table metadata and table sample records as below:
    {data_sample}

    Strictly follow:
    - Provide specific solution based on the issues mentioned
    - Don't add any extra lines other than solutions
    - Ensure steps are applied to every column
    - Don't mix up solutions for different tables
    - Provide specific issues with wrong values
    - Don't use brackets
    - List all inconsistencies with their values
    - Include all discrepancy values in examples

    Expected JSON format:
    {{
        "verified_anomalies": [
            {{
                "row_id": "id",
                "is_genuine": true/false,
                "reason": "explanation"
            }}
        ],
        "quality_issues": [
            {{
                "issue": "description",
                "columns": ["affected columns"],
                "fix": "suggested fix"
            }}
        ],
        "sql_checks": [
            {{
                "purpose": "what to check",
                "query": "SQL query"
            }}
        ],
        "sensitive_data_compliance_suggestions": [
            {{
                "column": "column name",
                "compliance_standard": ["applicable standards"],
                "masking_technique": "suggested technique",
                "justification": "why this field needs masking"
            }}
        ]
    }}
    """

    # System prompt
    system_prompt = """You are a specialized data quality expert.
    Analyze the anomalies focusing on genuine issues versus false positives.
    Provide specific, actionable insights in the required JSON format.
    Be thorough but concise in your analysis."""

    # Get LLM response
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": analysis_prompt}
    ]

    print("Analyzing anomalies...")
    response = model.invoke(messages).content
    
    # Parse response
    response_clean = response.replace("```json", "").replace("```", "").strip()
    analysis_results = json.loads(response_clean)
    
    # Save JSON
    output_file = f"{table_to_analyze}_analysis.json"
    with open(output_file, 'w') as f:
        json.dump(analysis_results, f, indent=2)
    
    # Convert to Excel
    verified_df = pd.DataFrame(analysis_results["verified_anomalies"])
    issues_df = pd.DataFrame(analysis_results["quality_issues"])
    sql_df = pd.DataFrame(analysis_results["sql_checks"])
    sensitive_compliance_df = pd.DataFrame(analysis_results["sensitive_data_compliance_suggestions"])

    # Save Excel
    excel_output = f"{table_to_analyze}_analysis.xlsx"
    with pd.ExcelWriter(excel_output) as writer:
        verified_df.to_excel(writer, sheet_name='Verified_Anomalies', index=False)
        issues_df.to_excel(writer, sheet_name='Quality_Issues', index=False)
        sql_df.to_excel(writer, sheet_name='SQL_Checks', index=False)
        sensitive_compliance_df.to_excel(writer, sheet_name='sensitive_data_masking', index=False)

    # Print summary
    print(f"\nResults saved to {output_file} and {excel_output}")
    print(f"True anomalies found: {sum(1 for a in analysis_results['verified_anomalies'] if a['is_genuine'])}")
    print(f"Quality issues found: {len(analysis_results['quality_issues'])}")

Analyzing anomalies...

Results saved to PATIENT_ADMISSIONS_analysis.json and PATIENT_ADMISSIONS_analysis.xlsx
True anomalies found: 6
Quality issues found: 5


In [39]:
print(df.sample(n=5).to_string())

                  NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION DATE_OF_ADMISSION            DOCTOR                   HOSPITAL INSURANCE_PROVIDER  BILLING_AMOUNT  ROOM_NUMBER ADMISSION_TYPE DISCHARGE_DATE MEDICATION  TEST_RESULTS  ensemble_anomaly_score  is_anomaly
41735    SamueL MOlInA   26    Male        AB-            Cancer        2021-11-18     Joshua Howard                Case-Pierce              Cigna        49975.17          471         Urgent     2021-11-21    Lipitor      Abnormal                0.913786       False
17665  JEfFRey RoberTS   38    Male         B+          Diabetes        2022-01-08       David Davis                  PLC Garza           Medicare        50604.73          263         Urgent     2022-01-14    Aspirin  Inconclusive                0.914370       False
41343          LEe KIM   58    Male         O-            Cancer        2023-02-08  Kathryn Galloway                Inc Wilkins              Aetna        39310.61          342         Urgent     2023