In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
from sklearn.metrics import make_scorer
warnings.filterwarnings("ignore")
load_dotenv(override=True)

class AnomalyDetector:
    def __init__(self, contamination=0.001):
        self.contamination = contamination
        self.models = {
            'isolation_forest': None,
            'local_outlier_factor': None,
            'robust_covariance': None
        }
        self.best_params = {}
        
    def prepare_data(self, df):
        """Prepare data for anomaly detection"""
        df_encoded = df.copy()
        
        # Encode categorical columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        if categorical_cols:
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            df_encoded[categorical_cols] = encoder.fit_transform(df[categorical_cols])
        
        # Scale numerical features
        scaler = StandardScaler()
        df_encoded = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)
        # display(df_encoded.head())
        return df_encoded.fillna(0)

    def grid_search_isolation_forest(self, X):
        """Perform grid search for IsolationForest"""
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_samples': [0.3, 0.5, 0.9],
            'contamination': ['auto', 0.0002, 0.0005],
            'max_features': [0.5, 0.7, 1.0]
        }
        
        base_model = IsolationForest(random_state=42)
        
        # Custom scorer for anomaly detection
        scorer = make_scorer(lambda y_true, y_pred: -np.mean(np.square(y_pred)), greater_is_better=False)
        
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=3,
            scoring=scorer,
            n_jobs=-1
        )
        
        grid_search.fit(X)
        self.best_params['isolation_forest'] = grid_search.best_params_
        return grid_search.best_estimator_

    def detect_anomalies(self, df):
        """Detect anomalies using multiple methods"""
        X = self.prepare_data(df)
        
        # Initialize results dictionary
        results = {
            'scores': {},
            'labels': {},
            'ensemble_score': None,
            'ensemble_label': None
        }
        
        # 1. Isolation Forest with Grid Search
        print("Training Isolation Forest with Grid Search...")
        self.models['isolation_forest'] = self.grid_search_isolation_forest(X)
        results['scores']['isolation_forest'] = self.models['isolation_forest'].decision_function(X)
        results['labels']['isolation_forest'] = self.models['isolation_forest'].predict(X)
        
        # 2. Local Outlier Factor
        print("Training Local Outlier Factor...")
        self.models['local_outlier_factor'] = LocalOutlierFactor(
            contamination=self.contamination,
            n_neighbors=20,
            n_jobs=-1
        )
        results['labels']['lof'] = self.models['local_outlier_factor'].fit_predict(X)
        results['scores']['lof'] = self.models['local_outlier_factor'].negative_outlier_factor_
        
        # 3. Robust Covariance (Elliptic Envelope)
        print("Training Robust Covariance...")
        self.models['robust_covariance'] = EllipticEnvelope(
            contamination=self.contamination,
            random_state=42
        )
        results['labels']['robust_covariance'] = self.models['robust_covariance'].fit_predict(X)
        results['scores']['robust_covariance'] = self.models['robust_covariance'].decision_function(X)
        
        # Ensemble scoring
        results['ensemble_score'] = np.mean([
            MinMaxScaler().fit_transform(results['scores'][model].reshape(-1, 1)).flatten()
            for model in results['scores']
        ], axis=0)
        
        # Ensemble labels (majority voting)
        results['ensemble_label'] = np.mean([
            results['labels'][model] == -1 for model in results['labels']
        ], axis=0) >= 0.5
        
        return results

def main():
    # Snowflake connection setup
    SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
    SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
    SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
    SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
    SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
    SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

    connection_string = (
        f"snowflake://{SNOWFLAKE_USER}:{SNOWFLAKE_PASSWORD}@"
        f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
        f"?warehouse={SNOWFLAKE_WAREHOUSE}"
    )
    
    engine = create_engine(connection_string)
    print("Snowflake connection established")

    # Get table to analyze
    table_to_analyze = "PATIENT_ADMISSIONS"
    
    # Load data
    query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
    with engine.connect() as conn:
        df = pd.read_sql(query, conn.connection)
    
    print(f"Loaded {len(df)} rows from {table_to_analyze}")
    
    # Initialize and run anomaly detection
    detector = AnomalyDetector(contamination=0.0001)
    results = detector.detect_anomalies(df)
    
    # Add results to original dataframe
    df['ensemble_anomaly_score'] = results['ensemble_score']
    df['is_anomaly'] = results['ensemble_label']
    
    # Get anomalies and sort by ensemble score
    anomalies = df[df['is_anomaly']].sort_values('ensemble_anomaly_score')
    
    print(f"\nFound {len(anomalies)} anomalous records out of {len(df)} total records")
    
    if len(anomalies) > 0:
        print("\nTop 5 most anomalous records:")
        print(anomalies.head())
        
        # Save anomalies to CSV
        output_file = f"{table_to_analyze}_anomalies_enhanced.csv"
        anomalies.to_csv(output_file, index=False)
        print(f"\nSaved anomalies to {output_file}")
        
        # Print best parameters from grid search
        print("\nBest parameters for Isolation Forest:")
        print(detector.best_params['isolation_forest'])
    else:
        print("No anomalies detected in this table")
    
    engine.dispose()
    print("Connection closed")

if __name__ == "__main__":
    main()

Snowflake connection established
Loaded 55501 rows from PATIENT_ADMISSIONS
Training Isolation Forest with Grid Search...
Training Local Outlier Factor...
Training Robust Covariance...

Found 11 anomalous records out of 55501 total records

Top 5 most anomalous records:
                    NAME  AGE  GENDER BLOOD_TYPE MEDICAL_CONDITION  \
3306      ALBerT wALteRs   80  Female         A+         Arthritis   
49585     mATtHeW cARTeR   19    Male         O-         Arthritis   
54671         jaDe BaKER   88  Female         O-           Obesity   
19729      BRAdlEY bLAiR   19    Male         O-         Arthritis   
28370  pHYLlIs florEs Md   77    Male         O-         Arthritis   

      DATE_OF_ADMISSION          DOCTOR                  HOSPITAL  \
3306         2019-05-11  Shelby Walters    Cox Stone, and Merritt   
49585        2023-06-20   Willie Carter            Fischer-Garner   
54671        2021-05-31      Todd Evans  and Gray Smith, Jennings   
19729        2020-01-18     Anita