In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder
import snowflake.connector
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from dotenv import load_dotenv
import os
import time
import warnings
warnings.filterwarnings("ignore")
load_dotenv(override=True)

# Step 1: Configure Snowflake connection parameters
# Imported here so this step stays self-contained; it is used below to
# percent-encode the credentials before they are embedded in the URL.
from urllib.parse import quote

SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA = os.environ.get("SNOWFLAKE_SCHEMA")

# os.environ.get returns None for unset variables. Stop here with a readable
# message instead of building a URL that contains the literal text "None".
_required_settings = {
    "SNOWFLAKE_USER": SNOWFLAKE_USER,
    "SNOWFLAKE_PASSWORD": SNOWFLAKE_PASSWORD,
    "SNOWFLAKE_ACCOUNT": SNOWFLAKE_ACCOUNT,
    "SNOWFLAKE_WAREHOUSE": SNOWFLAKE_WAREHOUSE,
    "SNOWFLAKE_DATABASE": SNOWFLAKE_DATABASE,
    "SNOWFLAKE_SCHEMA": SNOWFLAKE_SCHEMA,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing or empty Snowflake settings: " + ", ".join(_missing_settings)
    )

print("Snowflake configuration loaded")

# Step 2: Create Snowflake connection
# The user name and password are percent-encoded (safe='' also encodes "/")
# so characters such as "@", ":" or "/" in a credential cannot be mistaken
# for URL delimiters when the connection string is parsed.
connection_string = (
    f"snowflake://{quote(SNOWFLAKE_USER, safe='')}:"
    f"{quote(SNOWFLAKE_PASSWORD, safe='')}@"
    f"{SNOWFLAKE_ACCOUNT}/{SNOWFLAKE_DATABASE}/{SNOWFLAKE_SCHEMA}"
    f"?warehouse={SNOWFLAKE_WAREHOUSE}"
)
# NOTE: create_engine only builds the engine object; the first actual
# connection is opened later by engine.connect().
engine = create_engine(connection_string)
print("Snowflake connection established")

# Step 3: Get list of tables from Snowflake schema
query = f"""
SELECT
    t.TABLE_NAME
FROM {SNOWFLAKE_DATABASE}.INFORMATION_SCHEMA.TABLES t
WHERE t.TABLE_TYPE = 'BASE TABLE'
AND t.TABLE_SCHEMA = '{SNOWFLAKE_SCHEMA}'
"""
conn = engine.connect()
try:
    # pandas is given the underlying DBAPI connection (conn.connection)
    # rather than the SQLAlchemy wrapper.
    tables_df = pd.read_sql(query, conn.connection)
    table_names = tables_df['TABLE_NAME'].tolist()
    print(f"Found {len(table_names)} tables in schema")

    # Step 4: Set the table to analyze
    table_to_analyze = "PATIENT_ADMISSIONS"  # Replace with your table name

    # Non-fatal sanity check against the listing from Step 3: a typo in the
    # table name is easier to diagnose from this message than from the SQL
    # error raised by the SELECT below. The load is still attempted because
    # the listing filter is an exact, case-sensitive string match.
    if table_to_analyze not in table_names:
        print(
            f"Warning: {table_to_analyze} was not found in the table listing; "
            "attempting to load it anyway"
        )

    # Step 5: Get table data
    query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.{table_to_analyze}"
    df = pd.read_sql(query, conn.connection)
    print(f"Loaded {len(df)} rows from {table_to_analyze}")
except BaseException:
    # A failed or interrupted query would otherwise leave the connection and
    # the engine's pool open, because the script's own cleanup only runs at
    # the very end. Release them here and let the error propagate unchanged.
    conn.close()
    engine.dispose()
    raise

# Step 6: Prepare data for anomaly detection
print("Preparing data for anomaly detection")

# Work on a copy so the original dataframe keeps its raw values
df_encoded = df.copy()

# Identify categorical columns (object and category dtypes)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Found {len(categorical_cols)} categorical columns")

# Replace each categorical column with ordinal codes (0, 1, 2, ...)
if categorical_cols:
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoded_values = encoder.fit_transform(df[categorical_cols])
    # With its default settings OrdinalEncoder passes NaN inputs through as
    # NaN. Map them to -1 here; otherwise the fillna(0) below would give
    # missing values the same code as the first real category (code 0).
    encoded_values[np.isnan(encoded_values)] = -1
    df_encoded[categorical_cols] = encoded_values

# Datetime columns are neither categorical nor numeric, so convert them to
# seconds since the Unix epoch before the frame is handed to the model.
# NaT becomes NaN and is handled by the fillna below.
datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()
for column in datetime_cols:
    stamps = df_encoded[column]
    if stamps.dt.tz is not None:
        # tz_convert(None) converts to UTC and drops the timezone, which makes
        # the subtraction from a naive epoch timestamp valid.
        stamps = stamps.dt.tz_convert(None)
    df_encoded[column] = (stamps - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

# Fill any remaining NaNs with 0
df_encoded = df_encoded.fillna(0)

print(f"Data encoded with {df_encoded.shape[1]} features")

# Display sample of encoded data
print("Sample of encoded data:")
display(df_encoded.head(2))

# Step 7: Train anomaly detection model
print("Training Isolation Forest model")
# Build the forest (300 trees, contamination 0.01, fixed seed) and fit it on
# the encoded frame in one expression; fit() hands back the fitted estimator.
model = IsolationForest(
    n_estimators=300,
    contamination=0.01,
    random_state=42,
).fit(df_encoded)

# Step 8: Predict anomalies
print("Detecting anomalies")
# Labels and scores are computed independently from the same fitted model;
# a label of -1 is what the later steps treat as an anomaly.
anomaly_labels = model.predict(df_encoded)
anomaly_scores = model.decision_function(df_encoded)

# Step 9: Add scores and anomaly flags to original data
# (these two columns are written onto df itself, in this order)
df['anomaly_score'] = anomaly_scores
df['is_anomaly'] = (anomaly_labels == -1)

# Keep only the flagged rows, lowest (most anomalous) score first
anomalies = df.loc[df['is_anomaly']].sort_values(by='anomaly_score')

print(f"Found {len(anomalies)} anomalous records out of {len(df)} total records")

display(anomalies)


if anomalies.empty:
    print("No anomalies detected in this table")
else:
    print("\nSample of anomalous records (top 5 most anomalous):")
    print(anomalies.head(5))

    # Step 10: Save anomalies to CSV
    # The file is named after the analyzed table and written without the
    # dataframe index.
    output_file = f"{table_to_analyze}_anomalies.csv"
    anomalies.to_csv(output_file, index=False)
    print(f"Saved anomalies to {output_file}")

# Close connection when done
# try/finally ensures the engine's pool is disposed even if closing the
# connection raises; the confirmation is printed only when both succeed.
try:
    conn.close()
finally:
    engine.dispose()
print("Connection closed")