<a href="https://colab.research.google.com/github/ketanp23/scsd-ddm-class/blob/main/gmm_anomaly_detection_app_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import joblib
from flask import Flask, request, jsonify
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# --- Constants ---
# Directory that holds every persisted training artifact.
MODEL_DIR = "model_artifacts"

# Paths of the pickled GMM, scaler and threshold, all located inside MODEL_DIR.
MODEL_FILE, SCALER_FILE, THRESHOLD_FILE = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in ("gmm_model.pkl", "scaler.pkl", "threshold.pkl")
)

# Percentile of the normal-data scores used as the anomaly cut-off:
# anything scoring "weirder" than 99% of normal data gets flagged.
ANOMALY_THRESHOLD_PERCENTILE = 99.0

# --- 1. Dataset Generation ---

def generate_data(n_normal=1000, n_anomalies=50):
    """
    Generates a synthetic 2D dataset with two normal clusters and sparse anomalies.

    Normal points are drawn from two correlated Gaussians centred at (2, 3)
    and (-2, -3); anomalies are drawn uniformly from the square [-15, 15)^2.
    Sampling uses NumPy's global random state, so call ``np.random.seed``
    beforehand if reproducible output is needed.

    Args:
        n_normal (int): Total number of normal samples across both clusters.
            When odd, the second cluster receives the extra sample.
        n_anomalies (int): Number of anomalous samples.

    Returns:
        X (np.ndarray): Feature data (n_normal + n_anomalies, 2)
        y (np.ndarray): Labels (0 for normal, 1 for anomaly), shuffled in
            the same order as X.
    """
    print("Generating synthetic data...")

    # Split the normal budget so the two cluster sizes always add up to
    # n_normal. Using n_normal // 2 for both drops a row when n_normal is
    # odd, leaving X one row shorter than y.
    n_cluster1 = n_normal // 2
    n_cluster2 = n_normal - n_cluster1

    # Cluster 1 (Normal)
    cluster1 = np.random.multivariate_normal(
        mean=[2, 3],
        cov=[[1, 0.5], [0.5, 1]],
        size=n_cluster1
    )
    # Cluster 2 (Normal)
    cluster2 = np.random.multivariate_normal(
        mean=[-2, -3],
        cov=[[1, -0.3], [-0.3, 0.8]],
        size=n_cluster2
    )

    # Combine normal data
    X_normal = np.vstack([cluster1, cluster2])
    y_normal = np.zeros(n_normal)

    # Anomalies (sparse, from a wide uniform distribution)
    X_anomalies = np.random.uniform(low=-15, high=15, size=(n_anomalies, 2))
    y_anomalies = np.ones(n_anomalies)

    # Combine all data
    X = np.vstack([X_normal, X_anomalies])
    y = np.concatenate([y_normal, y_anomalies])

    # Shuffle features and labels with one shared permutation so that each
    # row keeps its label.
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]

    return X, y

# --- 2. Model Training ---

def train_and_save_model(X_train, y_train):
    """
    Trains the GMM anomaly detection model and saves artifacts.

    We follow a semi-supervised approach:
    1. Fit the scaler on ALL training rows (normal and anomalous alike).
    2. Train the GMM *only* on known "normal" data (y_train == 0).
    3. Determine the anomaly threshold based on the scores of this normal data.
    4. Save the scaler, model, and threshold under MODEL_DIR.

    Args:
        X_train: Training features, one row per sample.
        y_train: Training labels aligned with X_train (0 = normal).
            Any array-like is accepted.

    Returns:
        tuple: (gmm, scaler, threshold) - the fitted GaussianMixture, the
        fitted StandardScaler, and the threshold as a builtin float.

    Raises:
        ValueError: If the training set contains no sample labelled 0.
    """
    print("Starting model training...")

    # Ensure artifact directory exists
    os.makedirs(MODEL_DIR, exist_ok=True)

    # 1. Scale the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # 2. Isolate normal data for training
    # This is a key step for this type of anomaly detection.
    # np.asarray keeps the comparison element-wise even when y_train is a
    # plain list (list == 0 would evaluate to a single False).
    normal_mask = np.asarray(y_train) == 0
    X_normal_scaled = X_train_scaled[normal_mask]

    if len(X_normal_scaled) == 0:
        raise ValueError("No 'normal' data (y_train == 0) found in the training set.")

    print(f"Training GMM on {len(X_normal_scaled)} 'normal' samples...")

    # We choose 2 components, matching our data generation
    gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
    gmm.fit(X_normal_scaled)

    # 3. Determine the anomaly threshold
    # The anomaly score is the *negative* log-likelihood.
    # Higher score = more anomalous.
    normal_scores = -gmm.score_samples(X_normal_scaled)

    # Set the threshold at a high percentile of the "normal" scores.
    # Stored as a builtin float so the pickled artifact is a plain number.
    threshold = float(np.percentile(normal_scores, ANOMALY_THRESHOLD_PERCENTILE))

    # Report the percentile actually used instead of a hard-coded "99th",
    # so the message stays truthful if the constant is changed.
    print(
        f"Anomaly threshold ({ANOMALY_THRESHOLD_PERCENTILE:g}th percentile of normal data) "
        f"set to: {threshold:.4f}"
    )

    # 4. Save artifacts
    joblib.dump(gmm, MODEL_FILE)
    print(f"Saved model to {MODEL_FILE}")

    joblib.dump(scaler, SCALER_FILE)
    print(f"Saved scaler to {SCALER_FILE}")

    joblib.dump(threshold, THRESHOLD_FILE)
    print(f"Saved threshold to {THRESHOLD_FILE}")

    return gmm, scaler, threshold

# --- 3. Model Evaluation ---

def evaluate_model(gmm, scaler, threshold, X_test, y_test):
    """
    Evaluates the model on the held-out test set and prints a report.

    Args:
        gmm: Fitted mixture model exposing ``score_samples``.
        scaler: Fitted scaler exposing ``transform``.
        threshold: Anomaly-score cut-off; scores strictly above it are
            classified as anomalies.
        X_test: Test features, one row per sample.
        y_test: Ground-truth labels aligned with X_test (0 = normal,
            1 = anomaly).

    Returns:
        None. All results are written to stdout.
    """
    print("Evaluating model on test set...")

    # Scale test data
    X_test_scaled = scaler.transform(X_test)

    # Calculate anomaly scores (negative log-likelihood, higher = weirder)
    test_scores = -gmm.score_samples(X_test_scaled)

    # Classify as anomaly (1) or normal (0)
    y_pred = (test_scores > threshold).astype(int)

    # Labels may arrive as floats (0.0 / 1.0); integer labels make the
    # counts below print as whole numbers and match y_pred's dtype.
    y_true = np.asarray(y_test).astype(int)

    print("\n--- Model Evaluation Report ---")
    print(f"Test samples: {len(y_true)}")
    print(f"Predicted anomalies (score > {threshold:.4f}): {np.sum(y_pred)}")
    print(f"Actual anomalies in test set: {np.sum(y_true)}")

    # Print classification report
    # Note: '1' is the 'anomaly' class.
    # labels is pinned to both classes so the two target_names still line up
    # when a class is missing from the test split or the predictions;
    # zero_division=0 reports 0 for the resulting undefined metrics.
    print(classification_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=['Normal (0)', 'Anomaly (1)'],
        zero_division=0,
    ))
    print("---------------------------------")


# --- 4. Flask API Serving ---

app = Flask(__name__)

# Artifacts used by the API. They stay None when nothing has been trained
# yet, which the /predict endpoint reports as "model not loaded".
gmm_model = scaler_model = threshold_value = None

# Load all model artifacts into memory on startup. The three names are bound
# only once every file has been read, so a missing file leaves all of them None.
try:
    print("Loading model artifacts for API...")
    gmm_model, scaler_model, threshold_value = (
        joblib.load(artifact_path)
        for artifact_path in (MODEL_FILE, SCALER_FILE, THRESHOLD_FILE)
    )
    print("Model artifacts loaded successfully.")
except FileNotFoundError:
    for warning_line in (
        "\n*** WARNING: Model artifacts not found! ***",
        "Please run this script directly (python gmm_anomaly_detection_app.py)",
        "to train the model and create the files before starting the API.",
    ):
        print(warning_line)

@app.route('/')
def home():
    """Health check endpoint: returns a fixed JSON status message."""
    status = {"message": "GMM Anomaly Detection API is running!"}
    return jsonify(status)

def _as_feature_row(features):
    """Return *features* as a (1, 2) float array, or None if it is not a list of 2 finite numbers."""
    if not isinstance(features, list) or len(features) != 2:
        return None
    # bool is a subclass of int, so it is rejected explicitly; strings are
    # rejected here because np.array(..., dtype=float) would silently parse them.
    if any(isinstance(v, bool) or not isinstance(v, (int, float)) for v in features):
        return None
    try:
        row = np.array(features, dtype=float).reshape(1, -1)
    except (OverflowError, TypeError, ValueError):
        # e.g. an integer too large to be represented as a float
        return None
    # Python's JSON parser accepts NaN/Infinity literals; they cannot be scored.
    if not np.isfinite(row).all():
        return None
    return row


@app.route('/predict', methods=['POST'])
def predict_anomaly():
    """
    Main prediction endpoint.
    Expects JSON: {"features": [x, y]}

    Responses:
        200: {"is_anomaly", "anomaly_score", "threshold", "input_features"}
        400: body is not a JSON object, "features" is missing, or it is not
             a list of exactly 2 finite numbers.
        500: model artifacts are not loaded, or scoring failed unexpectedly.
    """
    # Identity checks: an unloaded artifact is exactly None. Truthiness is
    # not used because a legitimately loaded value could be falsy (e.g. a
    # threshold of 0.0).
    if gmm_model is None or scaler_model is None or threshold_value is None:
        return jsonify({"error": "Model is not loaded. Train the model by running the script."}), 500

    # silent=True yields None for a missing/invalid JSON body or a non-JSON
    # content type instead of raising, so client mistakes are answered with
    # 400 here rather than falling into the generic 500 handler below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    if 'features' not in data:
        return jsonify({"error": "Missing 'features' key in JSON payload."}), 400

    features = data['features']

    # 1. Validate and convert to a (1, 2) numpy array
    point = _as_feature_row(features)
    if point is None:
        return jsonify({"error": "Features must be a list of 2 numbers, e.g., [1.2, 3.4]"}), 400

    try:
        # 2. Scale using the *loaded* scaler
        point_scaled = scaler_model.transform(point)

        # 3. Calculate anomaly score (negative log-likelihood) as a builtin
        #    float so the JSON payload holds plain numbers.
        score = float(-gmm_model.score_samples(point_scaled)[0])
        threshold = float(threshold_value)

        # 4. Compare with threshold
        is_anomaly = score > threshold

        # 5. Return result
        return jsonify({
            "is_anomaly": is_anomaly,
            "anomaly_score": score,
            "threshold": threshold,
            "input_features": features
        })

    except Exception as e:
        # Top-level boundary for genuine server-side failures: record the
        # traceback in the app log and answer with a JSON 500.
        app.logger.exception("Prediction failed")
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

# --- 5. Main Execution (Train then Serve) ---

if __name__ == '__main__':
    # --- Part 1: Train and Evaluate ---

    # Build the synthetic dataset and hold out 20% of it for evaluation.
    # Stratifying on y keeps the anomaly ratio the same in both splits.
    X, y = generate_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit on the training split (labels are passed so that only the "normal"
    # rows are used for the GMM), persist the artifacts, then report on the
    # held-out split.
    gmm, scaler, threshold = train_and_save_model(X_train, y_train)
    evaluate_model(gmm, scaler, threshold, X_test, y_test)

    # --- Part 2: Start the API ---
    startup_banner = (
        "\n--- Starting Flask API Server ---",
        "The server is now running.",
        "You can send POST requests to http://127.0.0.1:5000/predict",
        "\nExample (Normal Point):",
        "curl -X POST -H \"Content-Type: application/json\" -d '{\"features\": [2, 2]}' http://127.0.0.1:5000/predict",
        "\nExample (Anomaly Point):",
        "curl -X POST -H \"Content-Type: application/json\" -d '{\"features\": [10, 10]}' http://127.0.0.1:5000/predict",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # Re-read the freshly written artifacts into the module-level names the
    # request handlers use, so the API serves the model that was just trained.
    gmm_model = joblib.load(MODEL_FILE)
    scaler_model = joblib.load(SCALER_FILE)
    threshold_value = joblib.load(THRESHOLD_FILE)
    print("\nGlobal API models reloaded with new training data.")

    # Serve on every interface, port 5000, with the debugger switched off.
    app.run(debug=False, port=5000, host='0.0.0.0')

Loading model artifacts for API...

Please run this script directly (python gmm_anomaly_detection_app.py)
to train the model and create the files before starting the API.
Generating synthetic data...
Starting model training...
Training GMM on 800 'normal' samples...
Anomaly threshold (99th percentile of normal data) set to: 4.4839
Saved model to model_artifacts/gmm_model.pkl
Saved scaler to model_artifacts/scaler.pkl
Saved threshold to model_artifacts/threshold.pkl
Evaluating model on test set...

--- Model Evaluation Report ---
Test samples: 210
Predicted anomalies (score > 4.4839): 11
Actual anomalies in test set: 10.0
              precision    recall  f1-score   support

  Normal (0)       1.00      0.99      1.00       200
 Anomaly (1)       0.91      1.00      0.95        10

    accuracy                           1.00       210
   macro avg       0.95      1.00      0.97       210
weighted avg       1.00      1.00      1.00       210

---------------------------------

--- Start

 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:Press CTRL+C to quit
