In [5]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

# Step 1: Configure Snowflake connection parameters
SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

print("Snowflake configuration loaded")

# Step 2: Create Snowflake connection
connection_string = (
    f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
    f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
    f"?warehouse={SNOWFLAKE_WAREHOUSE}"
)
engine = create_engine(connection_string)
print("Snowflake connection established")

# Step 3: Get list of tables from Snowflake schema
query = f"""
SELECT 
    t.TABLE_NAME
FROM {SNOWFLAKE_DATABASE}.INFORMATION_SCHEMA.TABLES t 
WHERE t.TABLE_TYPE = 'BASE TABLE' 
AND t.TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA}'
"""
conn = engine.connect()
tables_df = pd.read_sql(query, conn.connection)
table_names = tables_df['TABLE_NAME'].tolist()
print(f"Found {len(table_names)} tables in schema")

# Step 4: Set the table to analyze
table_to_analyze = "PATIENT_ADMISSIONS"  # Replace with your table name

# Step 5: Get table data
query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
df = pd.read_sql(query, conn.connection)
print(f"Loaded {len(df)} rows from {table_to_analyze}")

# Step 6: Prepare data for anomaly detection
print("Preparing data for anomaly detection")

# Make a copy of the dataframe for encoding
df_encoded = df.copy()

# Identify categorical columns (object and category dtypes)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Found {len(categorical_cols)} categorical columns")

# Encode categorical columns - THIS IS THE FIX
if categorical_cols:
    # Create encoder with correct parameters BEFORE using it
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # Then fit and transform
    df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])

# Fill any remaining NaNs with 0
df_encoded = df_encoded.fillna(0)

print(f"Data encoded with {df_encoded.shape[1]} features")

# Display sample of encoded data
print("Sample of encoded data:")
display(df_encoded.head(2))

# Step 7: Train anomaly detection model
print("Training Isolation Forest model")
model = IsolationForest(
    contamination=0.001,
    random_state=42,
    n_estimators=100
)

model.fit(df_encoded)

# Step 8: Predict anomalies
print("Detecting anomalies")
anomaly_scores = model.decision_function(df_encoded)
anomaly_labels = model.predict(df_encoded)

# Step 9: Add scores and anomaly flags to original data
df['anomaly_score'] = anomaly_scores
df['is_anomaly'] = anomaly_labels == -1

# Get anomalies and sort by score
anomalies = df[df['is_anomaly']].sort_values('anomaly_score')

print(f"Found {len(anomalies)} anomalous records out of {len(df)} total records")

display(anomalies)


if len(anomalies) > 0:
    print("\nSample of anomalous records (top 5 most anomalous):")
    print(anomalies.head())
    
    # Step 10: Save anomalies to CSV
    output_file = f"{table_to_analyze}_anomalies.csv"
    anomalies.to_csv(output_file, index=False)
    print(f"Saved anomalies to {output_file}")
else:
    print("No anomalies detected in this table")

# Close connection when done
conn.close()
engine.dispose()
print("Connection closed")

Snowflake configuration loaded
Snowflake connection established
Found 1 tables in schema
Loaded 55510 rows from PATIENT_ADMISSIONS
Preparing data for anomaly detection
Found 12 categorical columns
Data encoded with 15 features
Sample of encoded data:


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,48729.0,55,2.0,9.0,5.0,1830.0,11862.0,3644.0,4.0,2000.5,305,0.0,1858.0,6.0,5.0
1,48731.0,60,0.0,7.0,4.0,1831.0,11755.0,4812.0,6.0,1800.0,401,1.0,1859.0,3.0,6.0


Training Isolation Forest model
Detecting anomalies
Found 56 anomalous records out of 55510 total records


Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS,anomaly_score,is_anomaly
3315,ALBerT wALteRs,80,Female,A+,Arthritis,2019-05-11,Shelby Walters,"Cox Stone, and Merritt",Medicare,4699.31,108,Elective,2019-06-09,Penicillin,Abnormal,-0.02957,True
35880,laUrEn MarTinEz DVM,85,Male,A+,Asthma,2024-04-20,Andrew Rogers,Martin-Hamilton,UnitedHealthcare,25339.95,498,Urgent,2024-04-28,Penicillin,Abnormal,-0.018371,True
20786,ALliSon peTers,74,Female,A-,Obesity,2020-03-24,Amanda Griffin,"Thompson Wiggins and Frazier,",UnitedHealthcare,1962.63,486,Urgent,2020-03-27,Penicillin,Abnormal,-0.014952,True
33136,Ann raNDall,85,Female,B-,Asthma,2019-05-23,Tiffany Bender,Fox-Rocha,UnitedHealthcare,23861.17,426,Urgent,2019-06-05,Penicillin,Abnormal,-0.01327,True
12551,pATrICk gUTiERrez dvM,21,Female,O-,Obesity,2023-12-05,William Lynch,Johnson Inc,UnitedHealthcare,49577.49,438,Elective,2023-12-20,Aspirin,Abnormal,-0.011181,True
785,BRian halL,51,Female,O+,Arthritis,2024-02-21,Ryan Rich,Hickman Group,UnitedHealthcare,5164.55,470,Urgent,2024-03-19,Paracetamol,Abnormal,-0.010733,True
20592,JEsSICa MITcHell,71,Female,A+,Arthritis,2024-04-08,Shawn Santos,Sons Cooper and,Medicare,47411.59,429,Urgent,2024-04-15,Penicillin,Abnormal,-0.010542,True
24004,cYNthia BrOWn,21,Female,A-,Obesity,2019-05-16,Matthew Stewart,Booker-Mitchell,Aetna,45800.39,490,Elective,2019-05-30,Penicillin,Abnormal,-0.010351,True
50612,cYNthia BrOWn,21,Female,A-,Obesity,2019-05-16,Matthew Stewart,Booker-Mitchell,Aetna,45800.39,490,Elective,2019-05-30,Penicillin,Abnormal,-0.010351,True
3065,lAURa WIllIAmS,33,Female,A+,Hypertension,2019-05-16,Vincent Ramirez,Hood-Reed,UnitedHealthcare,6094.12,168,Elective,2019-06-15,Penicillin,Normal,-0.009041,True



Sample of anomalous records (top 5 most anomalous):
                        NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION  \
3315          ALBerT wALteRs   80  Female         A+         Arthritis   
35880    laUrEn MarTinEz DVM   85    Male         A+            Asthma   
20786         ALliSon peTers   74  Female         A-           Obesity   
33136            Ann raNDall   85  Female         B-            Asthma   
12551  pATrICk gUTiERrez dvM   21  Female         O-           Obesity   

      DATE_OF_ADMISSION          DOCTOR                       HOSPITAL  \
3315         2019-05-11  Shelby Walters         Cox Stone, and Merritt   
35880        2024-04-20   Andrew Rogers                Martin-Hamilton   
20786        2020-03-24  Amanda Griffin  Thompson Wiggins and Frazier,   
33136        2019-05-23  Tiffany Bender                      Fox-Rocha   
12551        2023-12-05   William Lynch                    Johnson Inc   

      INSURANCE_PROVIDER  BILLING_AMOUNT  ROOM_NUMBER ADM

## GRID SEARCH CV

In [13]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
from sklearn.metrics import make_scorer
warnings.filterwarnings("ignore")
load_dotenv(override=True)

class AnomalyDetector:
    def __init__(self, contamination=0.001):
        self.contamination = contamination
        self.models = {
            'isolation_forest': None,
            'local_outlier_factor': None,
            'robust_covariance': None
        }
        self.best_params = {}
        
    def prepare_data(self, df):
        """Prepare data for anomaly detection"""
        df_encoded = df.copy()
        
        # Encode categorical columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        if categorical_cols:
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])
        
        # Scale numerical features
        scaler = StandardScaler()
        df_encoded = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)
        # display(df_encoded.head())
        return df_encoded.fillna(0)

    def grid_search_isolation_forest(self, X):
        """Perform grid search for IsolationForest"""
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_samples': [0.3, 0.5, 0.9],
            'contamination': ['auto', 0.0002, 0.0005],
            'max_features': [0.5, 0.7, 1.0]
        }
        
        base_model = IsolationForest(random_state=42)
        
        # Custom scorer for anomaly detection
        scorer = make_scorer(lambda y_true, y_pred: -np.mean(np.square(y_pred)), greater_is_better=False)
        
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=3,
            scoring=scorer,
            n_jobs=-1
        )
        
        grid_search.fit(X)
        self.best_params['isolation_forest'] = grid_search.best_params_
        return grid_search.best_estimator_

    def detect_anomalies(self, df):
        """Detect anomalies using multiple methods"""
        X = self.prepare_data(df)
        
        # Initialize results dictionary
        results = {
            'scores': {},
            'labels': {},
            'ensemble_score': None,
            'ensemble_label': None
        }
        
        # 1. Isolation Forest with Grid Search
        print("Training Isolation Forest with Grid Search...")
        self.models['isolation_forest'] = self.grid_search_isolation_forest(X)
        results['scores']['isolation_forest'] = self.models['isolation_forest'].decision_function(X)
        results['labels']['isolation_forest'] = self.models['isolation_forest'].predict(X)
        
        # 2. Local Outlier Factor
        print("Training Local Outlier Factor...")
        self.models['local_outlier_factor'] = LocalOutlierFactor(
            contamination=self.contamination,
            n_neighbors=20,
            n_jobs=-1
        )
        results['labels']['lof'] = self.models['local_outlier_factor'].fit_predict(X)
        results['scores']['lof'] = self.models['local_outlier_factor'].negative_outlier_factor_
        
        # 3. Robust Covariance (Elliptic Envelope)
        print("Training Robust Covariance...")
        self.models['robust_covariance'] = EllipticEnvelope(
            contamination=self.contamination,
            random_state=42
        )
        results['labels']['robust_covariance'] = self.models['robust_covariance'].fit_predict(X)
        results['scores']['robust_covariance'] = self.models['robust_covariance'].decision_function(X)
        
        # Ensemble scoring
        results['ensemble_score'] = np.mean([
            MinMaxScaler().fit_transform(results['scores'][model].reshape(-1, 1)).flatten()
            for model in results['scores']
        ], axis=0)
        
        # Ensemble labels (majority voting)
        results['ensemble_label'] = np.mean([
            results['labels'][model] == -1 for model in results['labels']
        ], axis=0) >= 0.5
        
        return results

def main():
    # Snowflake connection setup
    SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
    SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
    SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
    SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
    SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
    SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

    connection_string = (
        f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
        f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
        f"?warehouse={SNOWFLAKE_WAREHOUSE}"
    )
    
    engine = create_engine(connection_string)
    print("Snowflake connection established")

    # Get table to analyze
    table_to_analyze = "PATIENT_ADMISSIONS"
    
    # Load data
    query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    
    print(f"Loaded {len(df)} rows from {table_to_analyze}")
    
    # Initialize and run anomaly detection
    detector = AnomalyDetector(contamination=0.0001)
    results = detector.detect_anomalies(df)
    
    # Add results to original dataframe
    df['ensemble_anomaly_score'] = results['ensemble_score']
    df['is_anomaly'] = results['ensemble_label']
    
    # Get anomalies and sort by ensemble score
    anomalies = df[df['is_anomaly']].sort_values('ensemble_anomaly_score')
    
    print(f"\nFound {len(anomalies)} anomalous records out of {len(df)} total records")
    
    if len(anomalies) > 0:
        print("\nTop 5 most anomalous records:")
        print(anomalies.head())
        
        # Save anomalies to CSV
        output_file = f"{table_to_analyze}_anomalies_enhanced.csv"
        anomalies.to_csv(output_file, index=False)
        print(f"\nSaved anomalies to {output_file}")
        
        # Print best parameters from grid search
        print("\nBest parameters for Isolation Forest:")
        print(detector.best_params['isolation_forest'])
    else:
        print("No anomalies detected in this table")
    
    engine.dispose()
    print("Connection closed")

if __name__ == "__main__":
    main()

Snowflake connection established
Loaded 55514 rows from PATIENT_ADMISSIONS
Training Isolation Forest with Grid Search...
Training Local Outlier Factor...
Training Robust Covariance...

Found 7 anomalous records out of 55514 total records

Top 5 most anomalous records:
      name  age gender blood_type    medical_condition date_of_admission  \
9   test_2   45      F         AA              perfect        2008-02-20   
13  test_5   40   Male             high blood pressure        2000-02-20   
8   test_1  450      M        abc       perfectly fine        2025-02-20   
11  test_4  400      F         AA  high blood pressure        2025-02-20   
10  test_3  450      M        XYZ         Hypertension        2025-02-20   

       doctor       hospital insurance_provider  billing_amount  room_number  \
9   Dr. Smith  City Hospital    HealthCare Inc.         1500.75          203   
13  Dr. Smith  City Hospital    HealthCare Inc.         1500.75            2   
8   Dr. Smith  City Hospital    He

In [10]:
anomalies.to_csv("PATIENT_ADMISSIONS_anomalies_enhanced.csv", index=False)

In [15]:
display(df_encoded.head())

Unnamed: 0,NAME,AGE,GENDER,BLOOD_TYPE,MEDICAL_CONDITION,DATE_OF_ADMISSION,DOCTOR,HOSPITAL,INSURANCE_PROVIDER,BILLING_AMOUNT,ROOM_NUMBER,ADMISSION_TYPE,DISCHARGE_DATE,MEDICATION,TEST_RESULTS
0,48729.0,55,2.0,9.0,5.0,1830.0,11862.0,3644.0,4.0,2000.5,305,0.0,1858.0,6.0,5.0
1,48731.0,60,0.0,7.0,4.0,1831.0,11755.0,4812.0,6.0,1800.0,401,1.0,1859.0,3.0,6.0
2,45321.0,70,2.0,4.0,8.0,1832.0,11606.0,9255.0,0.0,2500.99,502,5.0,1860.0,8.0,2.0
3,19296.0,45,0.0,2.0,2.0,1833.0,11873.0,31854.0,7.0,3100.75,601,2.0,1861.0,10.0,0.0
4,48724.0,450,2.0,12.0,12.0,1829.0,11863.0,4812.0,4.0,1500.75,203,4.0,1856.0,0.0,4.0
