**Sample dataset downloaded from:**
[Kaggle – mlg-ulb/creditcardfraud](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data)

**Install required libraries**

In [None]:
!pip install -q kaggle
from google.colab import userdata
import os

# Copy the Kaggle credentials returned by google.colab's userdata.get() into the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables.
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')

In [None]:
# Folder that holds data.csv (read by the "Read the sample data" cell).
# NOTE(review): no drive.mount(...) call is visible in this part of the notebook —
# confirm Google Drive is mounted before this path is used.
gdrive_path="/content/drive/MyDrive/Anamolies/CATS_dataset"

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from typing import Dict, List, Optional, Any, Union
import os
import matplotlib.pyplot as plt

**Read the sample data**

In [None]:
# Load the raw records; the file path is gdrive_path with "/data.csv" appended.
df=pd.read_csv(gdrive_path+"/data.csv")

In [None]:
df.columns

Index(['timestamp', 'aimp', 'amud', 'arnd', 'asin1', 'asin2', 'adbr', 'adfl',
       'bed1', 'bed2', 'bfo1', 'bfo2', 'bso1', 'bso2', 'bso3', 'ced1', 'cfo1',
       'cso1', 'y', 'category'],
      dtype='object')

**Profiling**

In [None]:
class DataQualityProfiler:
    """
    Dataset-level DQ metrics for anomaly detection.

    Approach: column-wise metrics per day -> detect corrupt datasets
    (not individual accounts).

    The profiler keeps a private copy of the input frame, so the caller's
    DataFrame is never modified. ``profile_dataset_daily`` is the entry point;
    the ``calculate_*`` methods are the per-column building blocks it calls.
    """

    # Columns profiled when the caller does not pass ``numeric_cols``.
    DEFAULT_NUMERIC_COLS = ['aimp', 'amud', 'arnd', 'asin1', 'asin2', 'adbr', 'adfl',
                            'bed1', 'bed2', 'bfo1', 'bfo2', 'bso1', 'bso2', 'bso3',
                            'ced1', 'cfo1', 'cso1', 'y', 'category']

    # Quantile levels reported by calculate_quantiles().
    QUANTILE_LEVELS = (0.1, 0.25, 0.5, 0.75, 0.9)

    def __init__(self, df, numeric_cols=None):
        """
        Args:
            df: Raw records; copied so later timestamp conversion does not
                touch the caller's frame.
            numeric_cols: Columns to profile. Defaults to DEFAULT_NUMERIC_COLS.
        """
        print("Initializing DataQualityProfiler...")
        self.df = df.copy()
        self.numeric_cols = list(
            self.DEFAULT_NUMERIC_COLS if numeric_cols is None else numeric_cols
        )
        print(f"Initialized with {len(self.df)} records, {len(self.numeric_cols)} numeric columns")

    def prepare_timestamp(self, timestamp_col='timestamp'):
        """Convert timestamp to yyyy-mm-dd format.

        Replaces ``self.df[timestamp_col]`` with ``datetime.date`` values.

        Returns:
            Array of the unique dates, or None when the column is absent.
        """
        print(f"Converting {timestamp_col} to yyyy-mm-dd format...")

        if timestamp_col not in self.df.columns:
            # NOTE: despite the message, no synthetic dates are generated here;
            # the caller receives None and stops.
            print("No timestamp column found, creating synthetic dates")
            return None

        self.df[timestamp_col] = pd.to_datetime(self.df[timestamp_col]).dt.date
        unique_dates = self.df[timestamp_col].nunique()
        print(f"Converted timestamps - Found {unique_dates} unique dates")
        return self.df[timestamp_col].unique()

    def calculate_basic_metrics(self, series, col_name, date):
        """Calculate basic statistical metrics with logging.

        Returns a dict with Mean, Minimum, Maximum, StandardDeviation, Sum,
        count (rows including nulls), Completeness (non-null share) and
        missingCount.
        """
        print(f"Basic metrics for {col_name} on {date}")

        row_count = len(series)
        return {
            'Mean': series.mean(),
            'Minimum': series.min(),
            'Maximum': series.max(),
            'StandardDeviation': series.std(),
            'Sum': series.sum(),
            'count': row_count,
            # Guarded like the ratio metrics in calculate_distribution_metrics.
            'Completeness': series.notna().sum() / row_count if row_count > 0 else 0,
            'missingCount': series.isna().sum(),
        }

    def calculate_distribution_metrics(self, series, col_name, date):
        """Calculate distribution-based metrics with logging.

        All counts and ratios are computed on the non-null values. Several
        keys are deliberate aliases (e.g. ``zerocount``/``zeroCount``) and are
        all kept so downstream column names stay stable.
        """
        print(f"Distribution metrics for {col_name} on {date}")

        clean_series = series.dropna()
        n_clean = len(clean_series)

        # Histogram calculation: best effort, falls back to an all-zero
        # 10-bin string when the values cannot be binned.
        try:
            hist_counts, _ = np.histogram(clean_series, bins=10)
            histogram_str = ','.join(map(str, hist_counts))
        except Exception:
            histogram_str = "0,0,0,0,0,0,0,0,0,0"

        # Data type inference. Bool is tested first because pandas also
        # reports bool as numeric, which made the BOOLEAN branch unreachable.
        if pd.api.types.is_bool_dtype(series):
            data_type = "BOOLEAN"
        elif pd.api.types.is_numeric_dtype(series):
            data_type = "INTEGER" if pd.api.types.is_integer_dtype(series) else "DECIMAL"
        elif pd.api.types.is_datetime64_any_dtype(series):
            data_type = "TIMESTAMP"
        else:
            data_type = "STRING"

        zero_count = (clean_series == 0).sum()
        negative_count = (clean_series < 0).sum()
        distinct_count = clean_series.nunique()
        distinct_ratio = distinct_count / n_clean if n_clean > 0 else 0

        return {
            'zerocount': zero_count,
            'zeroCount': zero_count,
            'zerocountpercentage': (zero_count / n_clean) * 100 if n_clean > 0 else 0,
            'negativecount': negative_count,
            'negativeCount': negative_count,
            'negativecountpercentage': (negative_count / n_clean) * 100 if n_clean > 0 else 0,
            'CountDistinct': distinct_count,
            'Uniqueness': distinct_ratio,
            'UniqueValueRatio': distinct_ratio,
            'Distinctness': distinct_ratio,
            'Histogram': histogram_str,
            'DataType': data_type,
        }

    def calculate_quantiles(self, series, col_name, date):
        """Calculate quantiles with logging.

        Returns one ``ApproxQuantiles_<level>`` entry per level in
        QUANTILE_LEVELS; every entry is 0 when the column has no non-null
        values.
        """
        print(f"Quantile metrics for {col_name} on {date}")

        clean_series = series.dropna()
        if len(clean_series) == 0:
            return {f'ApproxQuantiles_{level}': 0 for level in self.QUANTILE_LEVELS}

        return {
            f'ApproxQuantiles_{level}': clean_series.quantile(level)
            for level in self.QUANTILE_LEVELS
        }

    def calculate_entropy(self, series, col_name, date):
        """Calculate Shannon entropy (bits) of the binned values with logging.

        Uses min(50, number of distinct values) equal-width bins. Returns 0
        for an empty column or when binning fails.
        """
        print(f"Entropy calculation for {col_name} on {date}")

        clean_series = series.dropna()
        if len(clean_series) == 0:
            return 0

        try:
            bins = min(50, clean_series.nunique())
            hist, _ = np.histogram(clean_series, bins=bins)
            hist = hist[hist > 0]  # empty bins contribute nothing and would give log2(0)
            probs = hist / hist.sum()
            entropy = -np.sum(probs * np.log2(probs))
            print(f"Entropy calculated: {entropy:.3f}")
            return entropy
        except Exception:
            print(f"Entropy calculation failed for {col_name}")
            return 0

    def calculate_correlation_matrix(self, day_data, date):
        """Calculate correlation matrix for the day with logging.

        Returns an empty DataFrame when fewer than two of the configured
        columns are present.
        """
        print(f"Correlation matrix for {date}")

        available_cols = [col for col in self.numeric_cols if col in day_data.columns]
        if len(available_cols) < 2:
            return pd.DataFrame()

        corr_matrix = day_data[available_cols].corr()
        print(f"Correlation matrix calculated for {len(available_cols)} columns")
        return corr_matrix

    def calculate_mutual_information(self, col1_data, col2_data, col1, col2, date):
        """Calculate mutual information (bits) between two columns with logging.

        Each column is cut into 10 equal-width bins over its own non-null
        values; only rows where BOTH columns are non-null contribute.
        Returns 0 when the calculation fails (for example, an empty column).
        """
        print(f"Mutual information: {col1} vs {col2} on {date}")

        try:
            x_binned = pd.cut(col1_data.dropna(), bins=10, labels=False)
            y_binned = pd.cut(col2_data.dropna(), bins=10, labels=False)

            # Pair the observations by row label. Truncating both series to a
            # common length by position would drop valid pairs whenever either
            # column contains a null.
            paired = pd.concat([x_binned, y_binned], axis=1, join='inner', keys=['x', 'y'])
            contingency = pd.crosstab(paired['x'], paired['y'])

            joint = contingency.to_numpy(dtype=float)
            total = joint.sum()

            mi = 0
            if total > 0:
                pxy = joint / total
                # Outer product of the marginals: p(x) * p(y) for every cell.
                expected = pxy.sum(axis=1, keepdims=True) * pxy.sum(axis=0, keepdims=True)
                occupied = pxy > 0
                mi = float(np.sum(pxy[occupied] * np.log2(pxy[occupied] / expected[occupied])))

            print(f"MI calculated: {mi:.3f}")
            return mi
        except Exception as e:
            print(f"MI calculation failed: {str(e)}")
            return 0

    def profile_dataset_daily(self, timestamp_col='timestamp', max_days=180):
        """
        Profile dataset column-wise per day for dataset-level anomaly detection.

        Args:
            timestamp_col: Column holding the record timestamp.
            max_days: Stop after this many profiled days (None = no limit).

        Returns:
            One row per date with aggregated column metrics; an empty
            DataFrame when the timestamp column is missing.
        """
        print("Starting dataset-level daily profiling...")

        # Prepare timestamps
        dates = self.prepare_timestamp(timestamp_col)

        if dates is None:
            print("No valid dates found")
            return pd.DataFrame()

        # Unparseable timestamps cannot be ordered and never match a day filter.
        dates = sorted(d for d in dates if pd.notna(d))

        features_list = []
        first_col = self.numeric_cols[0] if self.numeric_cols else None

        for date_idx, date in enumerate(dates):
            print(f"Processing date {date_idx + 1}/{len(dates)}: {date}")

            # Get all data for this date
            day_data = self.df[self.df[timestamp_col] == date]

            if len(day_data) == 0:
                print(f"No data for {date}")
                continue

            print(f"Processing {len(day_data)} records")

            # Completeness is measured over the profiled columns only. Counting
            # every column (timestamp included) against a denominator of
            # len(numeric_cols) pushed the ratio above 1. Expected columns that
            # are absent from the data still count as missing cells.
            present_cols = [col for col in self.numeric_cols if col in day_data.columns]
            non_null_cells = day_data[present_cols].notna().sum().sum()
            expected_cells = len(day_data) * len(self.numeric_cols)

            # Initialize daily features
            daily_features = {
                'date': str(date),
                'total_records': len(day_data),
                'dataset_completeness': non_null_cells / expected_cells if expected_cells else 0
            }

            # Calculate correlation matrix for the day
            corr_matrix = self.calculate_correlation_matrix(day_data, date)

            # Process each column
            for col in self.numeric_cols:
                if col not in day_data.columns:
                    print(f"Column {col} not found")
                    continue

                print(f"Processing column: {col}")
                series = day_data[col]

                # Basic metrics
                basic_metrics = self.calculate_basic_metrics(series, col, date)
                for metric, value in basic_metrics.items():
                    daily_features[f"{col}_{metric}"] = value

                # Distribution metrics
                dist_metrics = self.calculate_distribution_metrics(series, col, date)
                for metric, value in dist_metrics.items():
                    daily_features[f"{col}_{metric}"] = value

                # Quantiles
                quantile_metrics = self.calculate_quantiles(series, col, date)
                for metric, value in quantile_metrics.items():
                    daily_features[f"{col}_{metric}"] = value

                # Entropy
                daily_features[f"{col}_Entropy"] = self.calculate_entropy(series, col, date)

                # Size metrics
                daily_features[f"{col}_Size"] = len(series)
                if series.dtype == 'object':
                    lengths = series.astype(str).str.len()
                    daily_features[f"{col}_MaxLength"] = lengths.max()
                    daily_features[f"{col}_MinLength"] = lengths.min()
                else:
                    daily_features[f"{col}_MaxLength"] = 0
                    daily_features[f"{col}_MinLength"] = 0

                # Correlation with other columns
                if not corr_matrix.empty and col in corr_matrix.columns:
                    for other_col in self.numeric_cols[:5]:  # Limit correlations
                        if other_col != col and other_col in corr_matrix.columns:
                            daily_features[f"{col}_Correlation_{other_col}"] = corr_matrix.loc[col, other_col]

                # Mutual information with first column
                if col != first_col and first_col in day_data.columns:
                    mi_value = self.calculate_mutual_information(
                        day_data[col], day_data[first_col], col, first_col, date
                    )
                    daily_features[f"{col}_MutualInformation"] = mi_value

            features_list.append(daily_features)
            print(f"Completed {date} - Generated {len(daily_features)} features")

            # Limit for demo
            if max_days is not None and len(features_list) >= max_days:
                break

        result_df = pd.DataFrame(features_list)
        print(f"Profiling complete! Generated {result_df.shape[0]} days × {result_df.shape[1]} features")
        return result_df

**Z score calculation**

In [None]:
def prepare_features_for_model(df, zscore_threshold=3.0, target_cols=None, enhanced=False):
    """Build the dataset-level feature table used for anomaly detection.

    Pipeline: daily DQ profile -> rolling/z-score features -> null handling
    -> optional FeatureEngineer enhancement.

    NOTE(review): a later cell defines a function with this same name that
    also adds a lag-feature step; whichever cell ran last is the one in effect.

    Args:
        df: Raw records passed to DataQualityProfiler.
        zscore_threshold: Threshold given to FeatureEngineer (used only when
            ``enhanced`` is true).
        target_cols: Optional feature names forwarded to FeatureEngineer.
        enhanced: Run the FeatureEngineer step when true.

    Returns:
        Feature DataFrame with one row per day, or an empty DataFrame when
        profiling produced nothing.
    """
    banner = "=" * 60
    print("DATASET-LEVEL ANOMALY DETECTION FEATURE PREPARATION")
    print(banner)

    # The profiler is built before the step banner so its own log lines
    # appear in the same order as before.
    dq_profiler = DataQualityProfiler(df)

    print("\n STEP 1: Calculating daily DQ metrics...")
    daily_profile = dq_profiler.profile_dataset_daily()
    if daily_profile.empty:
        print("No features generated")
        return pd.DataFrame()

    print("\n  STEP 2: Creating rolling features...")
    features = create_rolling_features(daily_profile)

    print("\n STEP 3: Handling null values...")
    features = handle_nulls_for_ml(features)

    if enhanced:
        print(f"\n STEP 4: Enhanced feature engineering (threshold={zscore_threshold})...")
        features = FeatureEngineer(zscore_threshold=zscore_threshold).create_features(
            features,
            process_date_col='date',
            target_cols=target_cols,
        )

    n_days, n_features = features.shape
    print(f"\nFEATURE PREPARATION COMPLETE!")
    print(f" Final shape: {n_days} days × {n_features} features")
    print(f" Ready for dataset-level anomaly detection")

    return features

**Create Lag features**

In [None]:
def create_lag_features(feature_df, lag_features=None, base_columns_only=True):
    """Create lag features on base columns only (not derived features).

    For every numeric column and every lag ``k`` two columns are added:
    ``<col>_lag_<k>d`` (value ``k`` rows earlier, after sorting by ``date``)
    and ``<col>_diff_lag_<k>d`` (current value minus that lagged value).
    Nulls in every column whose name contains ``_lag_`` are filled with 0.

    Args:
        feature_df: Frame with a ``date`` column; it is not modified.
        lag_features: Lags in rows. Defaults to ``[1, 2, 3]``.
        base_columns_only: Skip columns whose name marks them as derived
            (rolling / z-score / anomaly / diff / ratio / trend features).

    Returns:
        New DataFrame sorted by ``date`` with the lag columns appended.
    """
    if lag_features is None:
        lag_features = [1, 2, 3]
    print(f" Creating lag features: {lag_features}")

    feature_df = feature_df.sort_values('date').reset_index(drop=True)

    if base_columns_only:
        # Exclude derived features - only base columns
        derived_suffixes = ['_rolling_', '_zscore_', '_anomaly_', '_diff_', '_ratio_', '_trend_', '_mean', '_std', '_min', '_max']
        base_columns = [col for col in feature_df.columns
                       if col != 'date' and not any(suffix in col for suffix in derived_suffixes)]
    else:
        base_columns = [col for col in feature_df.columns if col != 'date']

    numeric_base = [col for col in base_columns
                    if pd.api.types.is_numeric_dtype(feature_df[col])]

    # Collect every new column first and attach them with one concat.
    # Inserting thousands of columns one at a time fragments the frame and
    # floods the output with pandas PerformanceWarning messages.
    new_columns = {}
    for lag in lag_features:
        for col in numeric_base:
            lagged = feature_df[col].shift(lag)
            new_columns[f"{col}_lag_{lag}d"] = lagged
            new_columns[f"{col}_diff_lag_{lag}d"] = feature_df[col] - lagged

    lag_frame = pd.DataFrame(new_columns, index=feature_df.index)

    # A freshly computed column replaces an existing one with the same name.
    replaced = [col for col in lag_frame.columns if col in feature_df.columns]
    result_df = pd.concat([feature_df.drop(columns=replaced), lag_frame], axis=1)

    # Fill NaN values from shifting
    lag_cols = [col for col in result_df.columns if '_lag_' in col]
    if lag_cols:
        result_df[lag_cols] = result_df[lag_cols].fillna(0)

    new_features = len(result_df.columns) - len(feature_df.columns)
    print(f" Added {new_features} lag features")

    return result_df

**Rolling feature**

In [None]:
def create_rolling_features(feature_df, windows=None, zscore_threshold=3.0):
    """Create rolling window features for temporal analysis.

    For every ``float64``/``int64`` column and every window ``w`` (in rows,
    after sorting by ``date``) the following columns are produced:
    ``_rolling_<w>d_mean/_std/_min/_max``, ``_zscore_<w>d``,
    ``_anomaly_<w>d``, ``_trend_<w>d`` (least-squares slope over the window),
    ``_diff_<w>d`` and ``_ratio_<w>d``. The rolling window includes the
    current row. Rows that do not yet have a full window are NaN, so a window
    longer than the data yields all-NaN columns.

    Args:
        feature_df: Frame with a ``date`` column; it is not modified.
        windows: Window lengths in rows. Defaults to ``[7, 14, 30, 90]``.
        zscore_threshold: ``|z|`` above which the anomaly flag is 1.

    Returns:
        The sorted input columns followed by all rolling features.
    """
    if windows is None:
        windows = [7, 14, 30, 90]
    print(f"\n Creating rolling features with windows: {windows}")

    feature_df = feature_df.sort_values('date').reset_index(drop=True)

    # The dtype test does not depend on the window, so it is done once.
    rollable = [col for col in feature_df.columns
                if col not in ['date'] and feature_df[col].dtype in ['float64', 'int64']]

    # Use list to collect DataFrames, then concat once
    rolling_dfs = [feature_df]

    for window in windows:
        print(f"   Processing {window}-day rolling window...")
        window_features = {}
        positions = np.arange(window)

        def window_slope(values, positions=positions, window=window):
            # Slope of the least-squares line through one full window.
            return np.polyfit(positions, values, 1)[0] if len(values) == window else 0

        for feature in rollable:
            current_value = feature_df[feature]
            rolling_data = current_value.rolling(window)
            rolling_mean = rolling_data.mean()
            rolling_std = rolling_data.std()

            window_features[f"{feature}_rolling_{window}d_mean"] = rolling_mean
            window_features[f"{feature}_rolling_{window}d_std"] = rolling_std
            window_features[f"{feature}_rolling_{window}d_min"] = rolling_data.min()
            window_features[f"{feature}_rolling_{window}d_max"] = rolling_data.max()

            # Z-score features (current vs rolling window); the epsilon keeps
            # a constant window from dividing by zero.
            zscore = (current_value - rolling_mean) / (rolling_std + 1e-8)
            window_features[f"{feature}_zscore_{window}d"] = zscore

            # Anomaly flags based on z-score threshold (NaN z-scores flag as 0)
            window_features[f"{feature}_anomaly_{window}d"] = (
                zscore.abs() > zscore_threshold
            ).astype(int)

            # Trend features; raw=True hands polyfit a plain ndarray instead
            # of building a Series for every window.
            window_features[f"{feature}_trend_{window}d"] = rolling_data.apply(
                window_slope, raw=True
            )

            # Difference from rolling mean
            window_features[f"{feature}_diff_{window}d"] = current_value - rolling_mean

            # Ratio to rolling mean
            window_features[f"{feature}_ratio_{window}d"] = (
                current_value / (rolling_mean + 1e-8)
            )

        # Add window features as DataFrame
        rolling_dfs.append(pd.DataFrame(window_features))

    # Concatenate all at once
    rolling_features = pd.concat(rolling_dfs, axis=1)
    print(f" Rolling features created: {rolling_features.shape[1]} total features")
    return rolling_features


**Enhanced feature engineering**

In [None]:
class FeatureEngineer:
    """Enhanced feature engineering for LSTM anomaly detection.

    Works on a frame that already contains rolling-window columns named
    ``<feature>_zscore_<w>d`` / ``<feature>_anomaly_<w>d`` and adds combined
    scores, calendar features and per-target aggregates.
    """

    # At most this many base features receive combined scores.
    MAX_COMBINED_FEATURES = 10

    def __init__(self, zscore_threshold=3.0):
        """Store the ``|z|`` threshold used for the combined anomaly flag."""
        self.zscore_threshold = zscore_threshold
        print(f"[INIT] FeatureEngineer with threshold={zscore_threshold}")

    def create_features(self, profile_df, process_date_col='date', target_cols=None):
        """Create LSTM-optimized features.

        Args:
            profile_df: Feature frame; it is not modified.
            process_date_col: Date column used for the calendar features.
            target_cols: Optional feature-name prefixes (e.g. ``"aimp_Mean"``)
                for which anomaly flags are aggregated across windows.

        Returns:
            Copy of ``profile_df`` with the additional columns.
        """
        print(f"[FEATURES] Enhancing {profile_df.shape[1]} features for LSTM")

        result_df = profile_df.copy()

        # Find profile columns
        profile_cols = [col for col in profile_df.columns if any(
            suffix in col for suffix in ['_zscore_', '_anomaly_', '_diff_', '_ratio_']
        )]

        # Create combined anomaly scores
        result_df = self._create_combined_scores(result_df, profile_cols)

        # Add time features
        result_df = self._add_time_features(result_df, process_date_col)

        # Target-specific features
        if target_cols:
            result_df = self._create_target_features(result_df, target_cols)

        print(f"[COMPLETE] Added {result_df.shape[1] - profile_df.shape[1]} enhanced features")
        return result_df

    def _create_combined_scores(self, df, profile_cols):
        """Combined anomaly scores across windows.

        Adds ``<base>_max_zscore`` (largest ``|z|`` over all windows) and
        ``<base>_any_anomaly`` (1 when that maximum exceeds the threshold)
        for the first MAX_COMBINED_FEATURES base features.
        """
        result_df = df.copy()

        # Get base feature names. dict.fromkeys removes duplicates while
        # keeping first-seen column order; slicing a set would pick a
        # different subset from run to run because string hashing is
        # randomized per process.
        base_features = list(dict.fromkeys(
            col.split('_zscore_')[0] for col in profile_cols if '_zscore_' in col
        ))

        print(f"[COMBINED] Creating scores for {len(base_features)} features")

        for base in base_features[:self.MAX_COMBINED_FEATURES]:  # Limit for performance
            zscore_cols = [col for col in profile_cols if col.startswith(f"{base}_zscore_")]

            if len(zscore_cols) > 1:
                # Combined anomaly score (max absolute z-score across windows)
                result_df[f"{base}_max_zscore"] = df[zscore_cols].abs().max(axis=1)

                # Combined flag
                result_df[f"{base}_any_anomaly"] = (
                    result_df[f"{base}_max_zscore"] > self.zscore_threshold
                ).astype(int)

        return result_df

    def _add_time_features(self, df, date_col):
        """Calendar features for LSTM temporal patterns.

        Converts ``date_col`` to datetime and adds ``day_of_week``
        (Monday=0), ``is_month_end`` and ``is_quarter_end``. The frame is
        returned unchanged (as a copy) when the column is absent.
        """
        result_df = df.copy()

        if date_col in df.columns:
            result_df[date_col] = pd.to_datetime(result_df[date_col])
            result_df['day_of_week'] = result_df[date_col].dt.dayofweek
            result_df['is_month_end'] = result_df[date_col].dt.is_month_end.astype(int)
            result_df['is_quarter_end'] = result_df[date_col].dt.is_quarter_end.astype(int)

        return result_df

    def _create_target_features(self, df, target_cols):
        """Target-specific aggregated features.

        For each target adds ``<target>_total_anomalies`` (sum of its
        per-window anomaly flags) and ``<target>_any_anomaly`` (their max).
        """
        result_df = df.copy()

        for target in target_cols:
            # Prefix match: a substring test would also pick up other
            # features whose name merely ends with the target, e.g. target
            # "y" matching "category_anomaly_7d".
            prefix = f"{target}_anomaly_"
            anomaly_flags = [col for col in df.columns if col.startswith(prefix)]

            if anomaly_flags:
                result_df[f"{target}_total_anomalies"] = result_df[anomaly_flags].sum(axis=1)
                result_df[f"{target}_any_anomaly"] = result_df[anomaly_flags].max(axis=1)

        return result_df

def handle_nulls_for_ml(df):
    """Handle null values for LSTM/PyOD models.

    Steps: drop columns that are entirely null, drop columns that are more
    than 95% null, then fill the remaining nulls with 0 (numeric columns) or
    ``'unknown'`` (non-numeric columns other than ``date``).

    Args:
        df: Feature frame; it is not modified.

    Returns:
        New DataFrame without nulls in the handled columns.
    """
    print(f"   Input shape: {df.shape}")

    # Work on a copy: when no column is dropped, the fills below would
    # otherwise write straight into the caller's frame.
    df = df.copy()

    # 1. Drop columns where ALL values are null
    all_null_cols = df.columns[df.isnull().all()].tolist()
    if all_null_cols:
        print(f"    Dropping {len(all_null_cols)} all-null columns: {all_null_cols[:5]}...")
        df = df.drop(columns=all_null_cols)

    # 2. Drop columns with >95% nulls as most of the population is null
    high_null_cols = df.columns[df.isnull().mean() > 0.95].tolist()
    if high_null_cols:
        print(f"    Dropping {len(high_null_cols)} high-null columns (>95%): {high_null_cols[:5]}...")
        df = df.drop(columns=high_null_cols)

    # 3. Fill remaining nulls
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    non_numeric_cols = [col for col in df.columns if col not in numeric_cols and col != 'date']

    print(f"   Filling nulls in {len(numeric_cols)} numeric columns...")
    df[numeric_cols] = df[numeric_cols].fillna(0)  # Fill with 0 for metrics

    if non_numeric_cols:
        print(f"   Filling nulls in {len(non_numeric_cols)} non-numeric columns...")
        df[non_numeric_cols] = df[non_numeric_cols].fillna('unknown')

    print(f"   Final shape: {df.shape}")
    print(f"   Remaining nulls: {df.isnull().sum().sum()}")

    return df

**Feature engineering main function**

In [None]:
def prepare_features_for_model(df, zscore_threshold=3.0, target_cols=None, enhanced=False):
    """Build the dataset-level feature table used for anomaly detection.

    Pipeline: daily DQ profile -> rolling/z-score features -> lag features
    on the base columns -> null handling -> optional FeatureEngineer
    enhancement.

    Args:
        df: Raw records passed to DataQualityProfiler.
        zscore_threshold: Threshold given to FeatureEngineer (used only when
            ``enhanced`` is true).
        target_cols: Optional feature names forwarded to FeatureEngineer.
        enhanced: Run the FeatureEngineer step when true.

    Returns:
        Feature DataFrame with one row per day, or an empty DataFrame when
        profiling produced nothing.
    """
    banner = "=" * 60
    print(" DATASET-LEVEL ANOMALY DETECTION FEATURE PREPARATION")
    print(banner)

    # The profiler is built before the step banner so its own log lines
    # appear in the same order as before.
    dq_profiler = DataQualityProfiler(df)

    print("\n STEP 1: Calculating daily DQ metrics...")
    daily_profile = dq_profiler.profile_dataset_daily()
    if daily_profile.empty:
        print(" No features generated")
        return pd.DataFrame()

    print("\n STEP 2: Creating rolling features...")
    features = create_rolling_features(daily_profile)

    print("\n STEP 2.5: Creating lag features...")
    # Lags are taken on base columns only, not on the rolling features.
    features = create_lag_features(features, lag_features=[1, 2, 3], base_columns_only=True)

    print("\n STEP 3: Handling null values...")
    features = handle_nulls_for_ml(features)

    if enhanced:
        print(f"\n STEP 4: Enhanced feature engineering (threshold={zscore_threshold})...")
        features = FeatureEngineer(zscore_threshold=zscore_threshold).create_features(
            features,
            process_date_col='date',
            target_cols=target_cols,
        )

    n_days, n_features = features.shape
    print(f"\n FEATURE PREPARATION COMPLETE!")
    print(f" Final shape: {n_days} days × {n_features} features")
    print(f" Ready for dataset-level anomaly detection")

    return features

In [None]:
# USAGE:
# df = pd.read_csv('cats_data.csv')
# features = prepare_features_for_model(df)
#
# # For dataset-level anomaly detection
# X = features.drop(['date'], axis=1).values
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
#
# # Feed to your existing models
# anomalous_dates = your_model.predict(X_scaled)

In [None]:
# Run the pipeline with its defaults (enhanced=False, so the FeatureEngineer step is skipped).
features = prepare_features_for_model(df)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Mutual information: cfo1 vs aimp on 2023-01-26
MI calculated: 0.000
Processing column: cso1
Basic metrics for cso1 on 2023-01-26
Distribution metrics for cso1 on 2023-01-26
Quantile metrics for cso1 on 2023-01-26
Entropy calculation for cso1 on 2023-01-26
Entropy calculated: 5.388
Mutual information: cso1 vs aimp on 2023-01-26
MI calculated: 0.000
Processing column: y
Basic metrics for y on 2023-01-26
Distribution metrics for y on 2023-01-26
Quantile metrics for y on 2023-01-26
Entropy calculation for y on 2023-01-26
Entropy calculated: 0.229
Mutual information: y vs aimp on 2023-01-26
MI calculated: 0.000
Processing column: category
Basic metrics for category on 2023-01-26
Distribution metrics for category on 2023-01-26
Quantile metrics for category on 2023-01-26
Entropy calculation for category on 2023-01-26
Entropy calculated: 0.250
Mutual information: category vs aimp on 2023-01-26
MI calculated: 0.000
Completed 2023-

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}d"] = feature_df[col].shift(lag)
  result_df[f"{col}_diff_lag_{lag}d"] = feature_df[col] - result_df[f"{col}_lag_{lag}d"]
  result_df[f"{col}_lag_{lag}

 Added 3738 lag features

 STEP 3: Handling null values...
   Input shape: (58, 26828)
    Dropping 4984 all-null columns: ['total_records_rolling_90d_mean', 'total_records_rolling_90d_std', 'total_records_rolling_90d_min', 'total_records_rolling_90d_max', 'total_records_zscore_90d']...
   Filling nulls in 21805 numeric columns...
   Filling nulls in 38 non-numeric columns...
   Final shape: (58, 21844)
   Remaining nulls: 0

 FEATURE PREPARATION COMPLETE!
 Final shape: 58 days × 21844 features
 Ready for dataset-level anomaly detection


In [None]:
features['date'].unique()

array(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
       '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
       '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12',
       '2023-01-13', '2023-01-14', '2023-01-15', '2023-01-16',
       '2023-01-17', '2023-01-18', '2023-01-19', '2023-01-20',
       '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24',
       '2023-01-25', '2023-01-26', '2023-01-27', '2023-01-28',
       '2023-01-29', '2023-01-30', '2023-01-31', '2023-02-01',
       '2023-02-02', '2023-02-03', '2023-02-04', '2023-02-05',
       '2023-02-06', '2023-02-07', '2023-02-08', '2023-02-09',
       '2023-02-10', '2023-02-11', '2023-02-12', '2023-02-13',
       '2023-02-14', '2023-02-15', '2023-02-16', '2023-02-17',
       '2023-02-18', '2023-02-19', '2023-02-20', '2023-02-21',
       '2023-02-22', '2023-02-23', '2023-02-24', '2023-02-25',
       '2023-02-26', '2023-02-27'], dtype=object)

**Data massaging for Histogram columns**

In [None]:
import pandas as pd
import numpy as np

def extract_histogram_columns(df):
    """Return the labels of all columns of ``df`` that contain 'Histogram'.

    The number of matches is printed before the list is returned.
    """
    matches = []
    for label in df.columns:
        if 'Histogram' in label:
            matches.append(label)
    print(f"Found {len(matches)} histogram columns")
    return matches

def _parse_histogram_cell(value):
    """Turn one 'c1,c2,...' histogram string into an integer array.

    Missing or unparsable cells (for example the 'unknown' placeholder that
    null handling writes into string columns) become a single empty bin.
    """
    if pd.isna(value):
        return np.array([0])
    try:
        return np.array([int(part) for part in str(value).split(',')])
    except ValueError:
        return np.array([0])


def _histogram_entropy(counts):
    """Shannon entropy (bits) of the bin counts; 0 for an all-zero histogram."""
    total = counts.sum()
    if total <= 0:
        return 0
    probs = counts / total
    # The epsilon keeps empty bins from producing log2(0).
    return -np.sum(probs * np.log2(probs + 1e-12))


def transform_histogram_columns(df, histogram_columns):
    """
    Transform histogram string columns into numeric features

    For a column named ``<base>_Histogram`` the features ``<base>_hist_mean``,
    ``_hist_std``, ``_hist_sum``, ``_hist_max`` and ``_hist_entropy`` are
    computed from the comma-separated bin counts of each row.

    Args:
        df: DataFrame containing histogram columns (not modified)
        histogram_columns: List of histogram column names to transform

    Returns:
        DataFrame with new numeric features (original columns kept)
    """
    print(f"Transforming {len(histogram_columns)} histogram columns...")

    result_df = df.copy()

    for col in histogram_columns:
        base_name = col.replace('_Histogram', '')
        print(f"Processing {col} -> {base_name}_hist_*")

        try:
            # Parse histogram strings to arrays. Parsing is tolerant per
            # cell, so one bad value no longer discards the whole column.
            hist_arrays = df[col].apply(_parse_histogram_cell)

            # Extract features
            result_df[f"{base_name}_hist_mean"] = hist_arrays.apply(np.mean)
            result_df[f"{base_name}_hist_std"] = hist_arrays.apply(np.std)
            result_df[f"{base_name}_hist_sum"] = hist_arrays.apply(np.sum)
            result_df[f"{base_name}_hist_max"] = hist_arrays.apply(np.max)
            result_df[f"{base_name}_hist_entropy"] = hist_arrays.apply(_histogram_entropy)

        except Exception as e:
            # Best effort: report the column and continue with the others.
            print(f"Error transforming {col}: {e}")

    # Report what was really added rather than assuming five per column.
    created = len(result_df.columns) - len(df.columns)
    print(f"Created {created} new features")
    return result_df

**extract histogram feature**

In [None]:

# Method 1: Auto-extract histogram columns
histogram_cols = extract_histogram_columns(features)



Found 19 histogram columns


In [None]:
features[histogram_cols].head(2)

Unnamed: 0,aimp_Histogram,amud_Histogram,arnd_Histogram,asin1_Histogram,asin2_Histogram,adbr_Histogram,adfl_Histogram,bed1_Histogram,bed2_Histogram,bfo1_Histogram,bfo2_Histogram,bso1_Histogram,bso2_Histogram,bso3_Histogram,ced1_Histogram,cfo1_Histogram,cso1_Histogram,y_Histogram,category_Histogram
0,8553900000000861,"1014,7189,21355,24818,13680,7293,3963,5125,125...","36307,16980,12719,7956,3845,3206,1563,1698,132...","5009,5059,5167,5341,5604,5996,6594,7595,9624,3...","16093,7095,5800,5255,5034,6038,6306,6960,8514,...",432530000000043147,339400000000052460,26217233781819210911501318296271674917,65554939561233669996438163331910,"35766,10761,17484,9289,6507,2809,1339,1237,717...",124251507018137165462023221204168042351,1043159992345325023892076261786183441675,"6286,8736,7356,7053,7663,13380,12912,10971,718...","674,1370,5111,12523,15780,16136,17137,11979,48...","644,6736,16715,24449,18879,10295,5723,1968,560...","113,837,1615,3577,8959,14288,18164,27794,9310,...",1652474817799459163282201821218130592079,864000000,864000000
1,8553900000000861,"4913,2512,5244,15268,15067,8322,22404,7381,357...","1100,4349,9538,9910,9225,11727,9599,9328,10077...","6694,6532,6490,6558,6750,7099,7684,8704,10775,...","16082,7095,5800,5252,5047,6042,6303,6960,8514,...",457010000000040699,336100000000052790,24543239271850910726536322897961884811,64695903661354373125855424376237,"33038,7158,6296,6520,7205,6192,6801,4745,4256,...","2260,4317,8313,15863,16554,13085,12052,9286,38...","5510,1908,4906,15590,14628,10878,17305,10654,3...","6416,8806,7086,6931,7424,13243,13337,11082,735...","1930,6768,13521,15272,13037,11734,11289,7889,4...","1989,9963,14032,18919,17250,12626,7090,3045,10...","748,3126,5558,8520,8422,9707,10216,16443,19698...","1860,4017,6571,11412,15642,15028,12837,10501,6...",864000000,864000000


In [None]:
# Expand every histogram string column into numeric features, then report the
# column counts before and after.
# NOTE(review): the second label in the message lacks the word "After" and the
# f-string prints a stray "+" after the count; "transfomration" is misspelled.
transformed_df = transform_histogram_columns(features, histogram_cols)
print(f"Before transfomration number of col {len(features.columns)} \n  transfomration number of col {len(transformed_df.columns)}+ \
\n number of new columns {len(transformed_df.columns)-len(features.columns) }")

Transforming 19 histogram columns...
Processing aimp_Histogram -> aimp_hist_*
Processing amud_Histogram -> amud_hist_*
Processing arnd_Histogram -> arnd_hist_*
Processing asin1_Histogram -> asin1_hist_*
Processing asin2_Histogram -> asin2_hist_*
Processing adbr_Histogram -> adbr_hist_*
Processing adfl_Histogram -> adfl_hist_*
Processing bed1_Histogram -> bed1_hist_*
Processing bed2_Histogram -> bed2_hist_*
Processing bfo1_Histogram -> bfo1_hist_*
Processing bfo2_Histogram -> bfo2_hist_*
Processing bso1_Histogram -> bso1_hist_*
Processing bso2_Histogram -> bso2_hist_*
Processing bso3_Histogram -> bso3_hist_*
Processing ced1_Histogram -> ced1_hist_*
Processing cfo1_Histogram -> cfo1_hist_*
Processing cso1_Histogram -> cso1_hist_*
Processing y_Histogram -> y_hist_*
Processing category_Histogram -> category_hist_*
Created 95 new features
Before transfomration number of col 21844 
  transfomration number of col 21939+ 
 number of new columns 95


In [None]:
transformed_df.columns

Index(['date', 'total_records', 'dataset_completeness', 'aimp_Mean',
       'aimp_Minimum', 'aimp_Maximum', 'aimp_StandardDeviation', 'aimp_Sum',
       'aimp_count', 'aimp_Completeness',
       ...
       'y_hist_mean', 'y_hist_std', 'y_hist_sum', 'y_hist_max',
       'y_hist_entropy', 'category_hist_mean', 'category_hist_std',
       'category_hist_sum', 'category_hist_max', 'category_hist_entropy'],
      dtype='object', length=21939)

**Push the changes back to github**

In [None]:
import os
import sys

# Make modules in the current working directory importable from this notebook.
project_root = os.getcwd()
print(project_root)
if sys.path.count(project_root) == 0:
    # Front of the list so local modules win over installed ones.
    sys.path.insert(0, project_root)

print(f"✅ Added to sys.path: {project_root}")

/content
✅ Added to sys.path: /content


In [None]:
# Put the smart_dq_anamoly package directory on the module search path.
import sys
sys.path.append('/content/Innovation-Hub/smart_dq_anamoly')
# NOTE(review): importlib is not used in the cells visible here —
# presumably kept for reloading modules; confirm before removing.
import importlib


**Fix the datatype**

In [None]:
import pandas as pd
import numpy as np

def fix_dataframe_dtypes_enhanced(df):
    """
    Return a copy of ``df`` with its text columns coerced to numeric where possible.

    Every column except ``'date'`` is handled according to its dtype:

    * numeric columns are kept unchanged;
    * text columns (``object`` or pandas ``StringDtype``) with no non-null
      values are dropped;
    * text columns whose first 10 non-null values contain a comma are
      replaced by the float value of the first comma-separated token
      (``NaN`` when that token is not a number);
    * other text columns are converted with ``pd.to_numeric``; when that
      fails, columns with fewer than 10 distinct values are dropped and the
      rest are kept as strings;
    * any other dtype (for example datetime) is kept unchanged.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with converted columns and without dropped columns.
    """
    print("Fixing DataFrame data types (enhanced)...")

    def extract_first_numeric(x):
        # First comma-separated token as a float; NaN for missing or
        # non-numeric tokens.
        if pd.isna(x):
            return np.nan
        try:
            return float(str(x).split(',')[0])
        except ValueError:
            return np.nan

    fixed_df = df.copy()
    conversions = {
        'numeric': 0,
        'comma_separated_fixed': 0,
        'kept_string': 0,
        'kept_numeric': 0,
        'kept_other': 0,
        'dropped': 0,
    }
    dropped_columns = []

    for col in df.columns:
        if col == 'date':
            continue

        series = df[col]

        if pd.api.types.is_numeric_dtype(series):
            conversions['kept_numeric'] += 1
            continue

        # Text may be stored as plain object dtype or as a pandas StringDtype.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.StringDtype)
        )
        if not is_text:
            # e.g. datetime or categorical: left untouched, and not reported
            # as "already numeric".
            conversions['kept_other'] += 1
            continue

        # Sample the first few non-null values to detect the pattern.
        sample_values = series.dropna().head(10)

        if sample_values.empty:
            # Column holds no data at all.
            dropped_columns.append(col)
            conversions['dropped'] += 1
            continue

        if any(',' in str(val) for val in sample_values):
            fixed_df[col] = series.apply(extract_first_numeric)
            conversions['comma_separated_fixed'] += 1
            print(f"  {col}: comma-separated → numeric (using first value)")
            continue

        try:
            fixed_df[col] = pd.to_numeric(series, errors='raise')
        except (ValueError, TypeError):
            if series.nunique() < 10:  # Likely categorical
                dropped_columns.append(col)
                conversions['dropped'] += 1
                print(f"  {col}: dropped (categorical/low variance)")
            else:
                conversions['kept_string'] += 1
                print(f"  {col}: keeping as string")
        else:
            conversions['numeric'] += 1
            print(f"  {col}: object → numeric")

    # Drop everything in one pass instead of once per column.
    if dropped_columns:
        fixed_df = fixed_df.drop(columns=dropped_columns)

    print(f"Enhanced conversion summary:")
    print(f"  Converted to numeric: {conversions['numeric']}")
    print(f"  Fixed comma-separated: {conversions['comma_separated_fixed']}")
    print(f"  Already numeric: {conversions['kept_numeric']}")
    print(f"  Kept as string: {conversions['kept_string']}")
    print(f"  Dropped columns: {conversions['dropped']}")
    if conversions['kept_other']:
        print(f"  Kept other dtypes: {conversions['kept_other']}")

    if dropped_columns:
        print(f"  Dropped: {dropped_columns[:5]}{'...' if len(dropped_columns) > 5 else ''}")

    return fixed_df

# Usage
def create_clean_transformed_df(original_df):
    """Build the enhanced model features from ``original_df`` and fix their dtypes."""
    # Feature preparation first, dtype clean-up on its result.
    return fix_dataframe_dtypes_enhanced(
        prepare_features_for_model(original_df, enhanced=True)
    )

In [None]:
# Run the dtype clean-up on the histogram-expanded feature frame.
transformed_datatype_df = fix_dataframe_dtypes_enhanced(transformed_df)

Fixing DataFrame data types (enhanced)...
  aimp_Histogram: comma-separated → numeric (using first value)
  aimp_DataType: dropped (categorical/low variance)
  amud_Histogram: comma-separated → numeric (using first value)
  amud_DataType: dropped (categorical/low variance)
  arnd_Histogram: comma-separated → numeric (using first value)
  arnd_DataType: dropped (categorical/low variance)
  asin1_Histogram: comma-separated → numeric (using first value)
  asin1_DataType: dropped (categorical/low variance)
  asin2_Histogram: comma-separated → numeric (using first value)
  asin2_DataType: dropped (categorical/low variance)
  adbr_Histogram: comma-separated → numeric (using first value)
  adbr_DataType: dropped (categorical/low variance)
  adfl_Histogram: comma-separated → numeric (using first value)
  adfl_DataType: dropped (categorical/low variance)
  bed1_Histogram: comma-separated → numeric (using first value)
  bed1_DataType: dropped (categorical/low variance)
  bed2_Histogram: comma-se

In [None]:
# Number of columns left after the dtype clean-up.
len(transformed_datatype_df.columns)

21920

**Feature Selection**

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

class FeatureSelector:
    """
    Unsupervised feature selector built from filtering, PCA and model-based ranking.

    Selection process:
    1. Variance filtering (removes low-variance features)
    2. Correlation filtering (removes highly correlated features)
    3. Feature count: ``target_features`` when given, otherwise derived from
       the PCA explained-variance curve
    4. Ranking: per-feature autoencoder reconstruction error when
       ``use_autoencoder`` is True, otherwise Random Forest importances
       against the first principal component
    5. Final selection (top N features by score)

    Key Parameters:
    - target_features: Number of final features to select
    - use_autoencoder: Whether to use autoencoder for feature ranking
    - variance_threshold: Minimum variance threshold for filtering
    - correlation_threshold: Maximum correlation threshold for filtering
    """

    def __init__(
        self,
        target_features: Optional[int] = None,
        use_autoencoder: bool = True,
        variance_threshold: float = 0.01,
        correlation_threshold: float = 0.95,
        pca_variance_ratio: float = 0.95,
        random_state: int = 42
    ):
        """
        Initialize feature selector with configurable parameters.

        Args:
            target_features: Final number of features (None = auto-determine)
            use_autoencoder: Use autoencoder for feature ranking
            variance_threshold: Minimum variance for feature retention
            correlation_threshold: Maximum correlation for feature retention
            pca_variance_ratio: Cumulative explained variance PCA must reach
                when the feature count is auto-determined
            random_state: Random seed passed to PCA, the MLP and the forest
        """
        self.target_features = target_features
        self.use_autoencoder = use_autoencoder
        self.variance_threshold = variance_threshold
        self.correlation_threshold = correlation_threshold
        self.pca_variance_ratio = pca_variance_ratio
        self.random_state = random_state

        # Internal state
        self.selected_features_ = []
        self.feature_scores_ = {}
        self.selection_stats_ = {}
        self.scaler_ = StandardScaler()
        self.pca_ = None

        print("Production Feature Selector initialized")
        print(f"  Target features: {target_features or 'auto-determine'}")
        print(f"  Use autoencoder: {use_autoencoder}")
        print(f"  Variance threshold: {variance_threshold}")
        print(f"  Correlation threshold: {correlation_threshold}")

    def fit_transform(
        self,
        df: pd.DataFrame,
        target_columns: Optional[List[str]] = None,
        exclude_columns: Optional[List[str]] = None
    ) -> List[str]:
        """
        Run the full selection pipeline and return the chosen feature names.

        Despite the name, the dataframe itself is not transformed; callers
        subset ``df`` with the returned names.

        Args:
            df: Input dataframe with all features
            target_columns: Base columns to focus on (None = use all numeric)
            exclude_columns: Columns to exclude from selection
                (None or empty = ``['date']``)

        Returns:
            List of selected feature names for downstream models
        """
        print(f"\nStarting feature selection on dataframe: {df.shape}")

        # Step 1: Get candidate features
        candidate_features = self._extract_candidate_features(
            df, target_columns, exclude_columns or ['date']
        )
        print(f"Candidate features extracted: {len(candidate_features)}")

        # Step 2: Variance filtering
        variance_filtered = self._apply_variance_filtering(df, candidate_features)
        print(f"After variance filtering: {len(variance_filtered)}")

        # Step 3: Correlation filtering
        correlation_filtered = self._apply_correlation_filtering(df, variance_filtered)
        print(f"After correlation filtering: {len(correlation_filtered)}")

        # Step 4: PCA analysis for optimal feature count
        optimal_count = self._determine_optimal_feature_count(df, correlation_filtered)
        print(f"Optimal feature count determined: {optimal_count}")

        # Step 5: Feature importance ranking
        if self.use_autoencoder:
            feature_scores = self._rank_features_autoencoder(df, correlation_filtered)
            print("Feature ranking completed using autoencoder")
        else:
            feature_scores = self._rank_features_statistical(df, correlation_filtered)
            print("Feature ranking completed using statistical methods")

        # Step 6: Final feature selection
        final_features = self._select_top_features(
            correlation_filtered, feature_scores, optimal_count
        )

        # Store results
        self.selected_features_ = final_features
        self.feature_scores_ = feature_scores
        self._store_selection_statistics(candidate_features, final_features)

        # Guard the ratio: there may be no candidate features at all.
        reduction_ratio = (
            len(final_features) / len(candidate_features)
            if candidate_features else 0.0
        )
        print(f"Feature selection complete: {len(final_features)} features selected")
        print(f"Reduction ratio: {reduction_ratio:.3f}")

        return final_features

    def _extract_candidate_features(
        self,
        df: pd.DataFrame,
        target_columns: Optional[List[str]],
        exclude_columns: List[str]
    ) -> List[str]:
        """
        Collect the numeric columns eligible for selection.

        With ``target_columns`` given, only numeric columns named
        ``"<target>_..."`` are kept. The result follows the dataframe's
        column order, so repeated runs see the same candidate order.
        """
        excluded = set(exclude_columns)
        numeric_columns = [
            col for col in df.select_dtypes(include=[np.number]).columns.tolist()
            if col not in excluded
        ]

        if target_columns:
            prefixes = tuple(f"{target_col}_" for target_col in target_columns)
            # dict.fromkeys de-duplicates while keeping order; a set would
            # shuffle the features between interpreter runs.
            candidate_features = list(dict.fromkeys(
                col for col in numeric_columns if col.startswith(prefixes)
            ))
            print(f"Target-based selection for: {target_columns}")
        else:
            candidate_features = numeric_columns
            print("Using all numeric features")

        return candidate_features

    def _apply_variance_filtering(
        self,
        df: pd.DataFrame,
        features: List[str]
    ) -> List[str]:
        """
        Drop features whose variance does not exceed ``variance_threshold``.

        Missing values are filled with 0 first. On any failure the input
        list is returned unchanged (best-effort step).
        """
        try:
            feature_data = df[features].fillna(0)

            variance_selector = VarianceThreshold(threshold=self.variance_threshold)
            variance_selector.fit(feature_data)

            selected_mask = variance_selector.get_support()
            filtered_features = [
                name for name, keep in zip(features, selected_mask) if keep
            ]

            removed_count = len(features) - len(filtered_features)
            print(f"  Removed {removed_count} low-variance features")

            return filtered_features

        except Exception as e:
            # Deliberately broad: filtering is optional, selection continues.
            print(f"  Variance filtering failed: {e}, keeping all features")
            return features

    def _apply_correlation_filtering(
        self,
        df: pd.DataFrame,
        features: List[str]
    ) -> List[str]:
        """
        Drop features that are highly correlated with an earlier feature.

        A feature is removed when its absolute correlation with any feature
        that precedes it in ``features`` exceeds ``correlation_threshold``.
        On any failure the input list is returned unchanged.
        """
        try:
            feature_data = df[features].fillna(0)
            correlation_matrix = feature_data.corr().abs()

            # Strict upper triangle: each pair is inspected once, and only
            # the later feature of a correlated pair is dropped.
            upper_triangle = correlation_matrix.where(
                np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
            )

            exceeds = (upper_triangle > self.correlation_threshold).any(axis=0)
            to_drop = {column for column, flagged in exceeds.items() if flagged}

            filtered_features = [f for f in features if f not in to_drop]

            removed_count = len(features) - len(filtered_features)
            print(f"  Removed {removed_count} highly correlated features")

            return filtered_features

        except Exception as e:
            # Deliberately broad: filtering is optional, selection continues.
            print(f"  Correlation filtering failed: {e}, keeping all features")
            return features

    def _determine_optimal_feature_count(
        self,
        df: pd.DataFrame,
        features: List[str]
    ) -> int:
        """
        Decide how many features to keep.

        Returns ``target_features`` (capped at the number available) when it
        is set. Otherwise the count is the number of PCA components needed
        to reach ``pca_variance_ratio`` cumulative explained variance,
        limited to a sensible range. The result is always a plain ``int``
        no larger than ``len(features)``.
        """
        n_features = len(features)
        if n_features == 0:
            return 0

        # If target_features specified, use it
        if self.target_features is not None:
            return min(self.target_features, n_features)

        try:
            feature_data = df[features].fillna(0)
            scaled_data = self.scaler_.fit_transform(feature_data)

            self.pca_ = PCA(random_state=self.random_state)
            self.pca_.fit(scaled_data)

            # First component count whose cumulative variance reaches the
            # target; all components if rounding keeps it just below.
            cumulative_variance = np.cumsum(self.pca_.explained_variance_ratio_)
            reached = cumulative_variance >= self.pca_variance_ratio
            if reached.any():
                optimal_components = int(np.argmax(reached)) + 1
            else:
                optimal_components = len(cumulative_variance)

            # Upper bound: half the features or 500, but never below 1.
            max_features = max(1, min(n_features // 2, 500))
            # Lower bound: the larger of 10 and 5% of the features, kept
            # within the upper bound so the clip range is always valid.
            min_features = min(max(10, n_features // 20), max_features)

            optimal_count = int(np.clip(optimal_components, min_features, max_features))

            print(f"  PCA analysis: {cumulative_variance[optimal_components-1]:.3f} variance with {optimal_count} components")

            return optimal_count

        except Exception as e:
            print(f"  PCA analysis failed: {e}, using heuristic")
            # Fallback heuristic: 10% of features, kept between 50 and 200
            # and never more than the number of features available.
            return min(n_features, int(np.clip(n_features // 10, 50, 200)))

    def _rank_features_autoencoder(
        self,
        df: pd.DataFrame,
        features: List[str]
    ) -> Dict[str, float]:
        """
        Score features by autoencoder reconstruction error.

        A single-hidden-layer ``MLPRegressor`` is trained to reproduce the
        standardized features; each feature's score is its mean squared
        reconstruction error (higher error is treated as more important).
        Falls back to ``_rank_features_statistical`` on any failure.
        """
        try:
            feature_data = df[features].fillna(0)
            scaled_data = self.scaler_.fit_transform(feature_data)

            # Bottleneck width: a quarter of the inputs, kept within 5..100.
            n_features = scaled_data.shape[1]
            encoding_dim = max(min(n_features // 4, 100), 5)

            print(f"  Training autoencoder: {n_features} -> {encoding_dim} -> {n_features}")

            autoencoder = MLPRegressor(
                hidden_layer_sizes=(encoding_dim,),
                max_iter=300,
                early_stopping=True,
                validation_fraction=0.1,
                random_state=self.random_state,
                learning_rate_init=0.001
            )

            # Input and target are the same matrix: the net learns to
            # reconstruct its own input.
            autoencoder.fit(scaled_data, scaled_data)

            reconstructed = autoencoder.predict(scaled_data)
            reconstruction_errors = np.mean((scaled_data - reconstructed) ** 2, axis=0)

            feature_scores = {
                name: float(error)
                for name, error in zip(features, reconstruction_errors)
            }

            print(f"  Autoencoder training completed successfully")
            return feature_scores

        except Exception as e:
            print(f"  Autoencoder training failed: {e}, using fallback")
            return self._rank_features_statistical(df, features)

    def _rank_features_statistical(
        self,
        df: pd.DataFrame,
        features: List[str]
    ) -> Dict[str, float]:
        """
        Score features with Random Forest importances.

        There is no real target, so the first principal component of the
        standardized features serves as a synthetic one. Returns a uniform
        score of 1.0 per feature on any failure.
        """
        try:
            feature_data = df[features].fillna(0)

            # Create synthetic target using first principal component
            scaled_data = self.scaler_.fit_transform(feature_data)
            pca_temp = PCA(n_components=1, random_state=self.random_state)
            synthetic_target = pca_temp.fit_transform(scaled_data).flatten()

            rf = RandomForestRegressor(
                n_estimators=100,
                random_state=self.random_state,
                n_jobs=-1
            )
            rf.fit(feature_data, synthetic_target)

            feature_scores = {
                name: float(importance)
                for name, importance in zip(features, rf.feature_importances_)
            }

            print(f"  Statistical ranking completed using Random Forest")
            return feature_scores

        except Exception as e:
            print(f"  Statistical ranking failed: {e}, using uniform scores")
            return {feature: 1.0 for feature in features}

    def _select_top_features(
        self,
        features: List[str],
        feature_scores: Dict[str, float],
        target_count: int
    ) -> List[str]:
        """
        Return the ``target_count`` highest-scoring features.

        Features without a score count as 0. The sort is stable, so equally
        scored features keep their order from ``features``.
        """
        sorted_features = sorted(
            features,
            key=lambda x: feature_scores.get(x, 0),
            reverse=True
        )

        selected_features = sorted_features[:target_count]

        print(f"  Selected top {len(selected_features)} features")

        return selected_features

    def _store_selection_statistics(
        self,
        original_features: List[str],
        selected_features: List[str]
    ) -> None:
        """Record counts and settings of the last run in ``selection_stats_``."""
        original_count = len(original_features)
        selected_count = len(selected_features)

        self.selection_stats_ = {
            'original_count': original_count,
            'selected_count': selected_count,
            # 0.0 instead of a ZeroDivisionError when nothing was eligible.
            'reduction_ratio': (
                selected_count / original_count if original_count else 0.0
            ),
            'use_autoencoder': self.use_autoencoder,
            'variance_threshold': self.variance_threshold,
            'correlation_threshold': self.correlation_threshold
        }

    def get_selected_features(self) -> List[str]:
        """Get list of selected features."""
        return self.selected_features_

    def get_feature_scores(self) -> Dict[str, float]:
        """Get feature importance scores."""
        return self.feature_scores_

    def get_selection_stats(self) -> Dict:
        """Get selection statistics."""
        return self.selection_stats_

    def create_feature_report(self) -> pd.DataFrame:
        """
        Build a per-feature report for the last selection.

        Returns:
            DataFrame with columns ``rank``, ``feature_name``,
            ``base_column`` (text before the first underscore),
            ``metric_type`` (text after the last underscore, or ``'base'``
            when the name has none) and ``importance_score``. Empty when no
            features have been selected yet.
        """
        if not self.selected_features_:
            print("No features selected yet. Run fit_transform first.")
            return pd.DataFrame()

        report_data = []
        for rank, feature in enumerate(self.selected_features_, start=1):
            has_parts = '_' in feature
            report_data.append({
                'rank': rank,
                'feature_name': feature,
                'base_column': feature.split('_')[0] if has_parts else feature,
                'metric_type': feature.split('_')[-1] if has_parts else 'base',
                'importance_score': self.feature_scores_.get(feature, 0)
            })

        return pd.DataFrame(report_data)


# **Feature calling**
**Modes**

# FeatureSelector Usage Examples

## Mode 1: Intelligent Auto-Selection with Deep Learning

```python
# Feature count is derived automatically from PCA explained variance;
# the autoencoder ranks the candidate features
print("Mode 1: Intelligent auto-selection with autoencoder")

selector = FeatureSelector(
    target_features=None,        # Auto-determine the count (PCA explained variance)
    use_autoencoder=True         # Rank features by autoencoder reconstruction error
)

selected_features = selector.fit_transform(
    df=your_dataframe,
    target_columns=['aimp', 'amud', 'arnd']  # Focus on specific business metrics
)

print(f"Auto-selected {len(selected_features)} optimal features")
```

**Best for:** Production systems where you want AI to optimize feature selection automatically.

---

## Mode 2: Fixed Count with Statistical Selection

```python
# Selects exactly N features using statistical importance ranking
print("Mode 2: Fixed feature count with statistical methods")

selector = FeatureSelector(
    target_features=100,         # Exactly 100 most important features
    use_autoencoder=False        # Use statistical ranking only
)

selected_features = selector.fit_transform(
    df=your_dataframe,
    target_columns=['aimp', 'amud', 'arnd']
)

print(f"Selected top {len(selected_features)} features by statistical importance")
```

**Best for:** When you need predictable feature count for memory/performance constraints.

---

## Mode 3: Maximum Features with Deep Learning Enhancement

```python
# Uses all available features enhanced by autoencoder insights
print("Mode 3: All features with autoencoder enhancement")

selector = FeatureSelector(
    target_features=200,         # Use up to 200 features (or all available)
    use_autoencoder=True         # Enhance with deep learning insights
)

selected_features = selector.fit_transform(
    df=your_dataframe,
    target_columns=None          # Analyze all numeric columns
)

print(f"Enhanced selection: {len(selected_features)} features")
```

**Best for:** High-accuracy models where you want maximum feature coverage with AI optimization.

---

## Results Analysis & Implementation

```python
# Create filtered dataset for anomaly detection
filtered_df = your_dataframe[['date'] + selected_features]
print(f"Final dataset shape: {filtered_df.shape}")

# Get detailed feature importance analysis
feature_report = selector.create_feature_report()
print("\nTop 10 most important features:")
print(feature_report[['feature_name', 'importance_score', 'metric_type']].head(10))

# Use with anomaly detector
detector = EnhancedDatasetQualityAnomalyDetector()
results = detector.detect_dataset_anomalies(
    df=filtered_df,
    target_dates=['2023-02-27']
)
```

---

## Quick Selection Guide

| Use Case | Mode | Target Features | Autoencoder |
|----------|------|----------------|-------------|
| **Production AI** | Mode 1 | `None` | `True` |
| **Resource-constrained** | Mode 2 | `50-100` | `False` |
| **Maximum accuracy** | Mode 3 | `200+` | `True` |
| **Fast prototyping** | Mode 2 | `30-50` | `False` |

---

## Performance Expectations

- **Mode 1**: Optimal balance, ~60-120 features typically
- **Mode 2**: Predictable count, fastest execution
- **Mode 3**: Highest accuracy, longer processing time

**method1**

In [None]:
# Method 1: no fixed feature count (the selector derives it from PCA
# explained variance) with autoencoder-based ranking, limited to the
# aimp / amud / arnd derived columns.
method1_selector = FeatureSelector(target_features=None, use_autoencoder=True)

method1_selected_features = method1_selector.fit_transform(
    transformed_datatype_df,
    target_columns=['aimp', 'amud', 'arnd'],
)

print(f" Numberof features before {transformed_datatype_df.shape[1]} After {len(method1_selected_features)} ")


Production Feature Selector initialized
  Target features: auto-determine
  Use autoencoder: True
  Variance threshold: 0.01
  Correlation threshold: 0.95

Starting feature selection on dataframe: (58, 21920)
Target-based selection for: ['aimp', 'amud', 'arnd']
Candidate features extracted: 3343
  Removed 1701 low-variance features
After variance filtering: 1642
  Removed 1203 highly correlated features
After correlation filtering: 439
  PCA analysis: 0.952 variance with 41 components
Optimal feature count determined: 41
  Training autoencoder: 439 -> 100 -> 439
  Autoencoder training completed successfully
Feature ranking completed using autoencoder
  Selected top 41 features
Feature selection complete: 41 features selected
Reduction ratio: 0.012
 Numberof features before 21920 After 41 


**method 2**

In [None]:
# Method 2: fixed budget of 100 features (fewer if not enough survive
# filtering), ranked with the Random Forest based statistical scorer.
print("Mode 2: Fixed feature count with statistical methods")

method2_selector = FeatureSelector(target_features=100, use_autoencoder=False)

selected2_features = method2_selector.fit_transform(
    transformed_datatype_df,
    target_columns=['aimp', 'amud', 'arnd'],
)

print(f" Numberof features before {transformed_datatype_df.shape[1]} After {len(selected2_features)} ")

Mode 2: Fixed feature count with statistical methods
Production Feature Selector initialized
  Target features: 100
  Use autoencoder: False
  Variance threshold: 0.01
  Correlation threshold: 0.95

Starting feature selection on dataframe: (58, 21920)
Target-based selection for: ['aimp', 'amud', 'arnd']
Candidate features extracted: 3343
  Removed 1701 low-variance features
After variance filtering: 1642
  Removed 1203 highly correlated features
After correlation filtering: 439
Optimal feature count determined: 100
  Statistical ranking completed using Random Forest
Feature ranking completed using statistical methods
  Selected top 100 features
Feature selection complete: 100 features selected
Reduction ratio: 0.030
 Numberof features before 21920 After 100 


**Method3**

In [None]:
# Method 3: every numeric column is a candidate; keep up to 200 features
# ranked by autoencoder reconstruction error.
print("Mode 3: All features with autoencoder enhancement")

method3_selector = FeatureSelector(target_features=200, use_autoencoder=True)

M3selected_features = method3_selector.fit_transform(
    transformed_datatype_df,
    target_columns=None,
)

print(f" Numberof features before {transformed_datatype_df.shape[1]} After {len(M3selected_features)} ")

Mode 3: All features with autoencoder enhancement
Production Feature Selector initialized
  Target features: 200
  Use autoencoder: True
  Variance threshold: 0.01
  Correlation threshold: 0.95

Starting feature selection on dataframe: (58, 21920)
Using all numeric features
Candidate features extracted: 21919
  Removed 10771 low-variance features
After variance filtering: 11148
  Removed 8647 highly correlated features
After correlation filtering: 2501
Optimal feature count determined: 200
  Training autoencoder: 2501 -> 100 -> 2501
  Autoencoder training completed successfully
Feature ranking completed using autoencoder
  Selected top 200 features
Feature selection complete: 200 features selected
Reduction ratio: 0.009
 Numberof features before 21920 After 200 


**Method4**

In [None]:
# Method 4: same settings as method 3 (up to 200 features, autoencoder
# ranking) but candidates are limited to the aimp / amud / arnd derived
# columns rather than all numeric columns.
print("Mode 4: All features with autoencoder enhancement")

method4_selector = FeatureSelector(target_features=200, use_autoencoder=True)

# NOTE: the result is stored under the method-3 name, replacing any
# earlier value of M3selected_features.
M3selected_features = method4_selector.fit_transform(
    transformed_datatype_df,
    target_columns=['aimp', 'amud', 'arnd'],
)

print(f" Numberof features before {transformed_datatype_df.shape[1]} After {len(M3selected_features)} ")

Mode 4: All features with autoencoder enhancement
Production Feature Selector initialized
  Target features: 200
  Use autoencoder: True
  Variance threshold: 0.01
  Correlation threshold: 0.95

Starting feature selection on dataframe: (58, 21920)
Target-based selection for: ['aimp', 'amud', 'arnd']
Candidate features extracted: 3343
  Removed 1701 low-variance features
After variance filtering: 1642
  Removed 1203 highly correlated features
After correlation filtering: 439
Optimal feature count determined: 200
  Training autoencoder: 439 -> 100 -> 439
  Autoencoder training completed successfully
Feature ranking completed using autoencoder
  Selected top 200 features
Feature selection complete: 200 features selected
Reduction ratio: 0.060
 Numberof features before 21920 After 200 


In [None]:
# Keep only the selected feature columns plus the date column.
transformed_datatype_df = transformed_datatype_df[[*M3selected_features, "date"]]

**LSTM & PYOD MODEL**

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Any, Tuple
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class RealisticLSTMAnomalyScorer:
    """
    Sequence-based anomaly scorer that simulates an LSTM forecaster.

    No neural network is trained. LSTM behaviour is imitated by:
    1. Using sliding windows of historical data (14 rows by default)
    2. Predicting the next row as the window's column means plus Gaussian
       noise (standard deviation 0.1)
    3. Computing the mean squared error between predicted and actual row
    4. Dividing that error by a threshold learned from training data
       (95th percentile of training errors by default)

    Higher scores = more anomalous; a score >= 1.0 means the error reached
    the learned threshold.

    Args:
        seq_length: Number of preceding rows in each window.
        threshold_percentile: Percentile of training errors used as threshold.
        random_state: Seed for the prediction noise. ``None`` (default)
            draws from NumPy's global random state, as before.
    """

    def __init__(
        self,
        seq_length: int = 14,
        threshold_percentile: float = 95.0,
        random_state: Optional[int] = None
    ):
        self.seq_length = seq_length
        self.threshold_percentile = threshold_percentile
        self.random_state = random_state
        # The np.random module and RandomState both expose normal(); using
        # the module keeps np.random.seed() effective for unseeded scorers.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        self.baseline_errors = []
        self.threshold = None
        print(f"LSTM Scorer initialized: seq_length={seq_length}, threshold_percentile={threshold_percentile}")

    def _prediction_error(self, sequence: np.ndarray, target: np.ndarray) -> float:
        """Return the MSE between ``target`` and the noisy mean of ``sequence``."""
        predicted = np.mean(sequence, axis=0) + self._rng.normal(0, 0.1, sequence.shape[1])
        if not (np.all(np.isfinite(target)) and np.all(np.isfinite(predicted))):
            # A NaN/inf error would silently poison threshold and scores.
            raise ValueError("Input contains NaN or infinity")
        return float(np.mean((target - predicted) ** 2))

    def fit_baseline(self, X_train: np.ndarray) -> None:
        """
        Learn normal reconstruction error patterns from training data.

        Args:
            X_train: 2-D array of shape (n_samples, n_features), rows in
                time order.

        Raises:
            ValueError: If there are not more than ``seq_length`` rows (no
                window/target pair can be formed) or the data contains
                NaN or infinity.
        """
        X_train = np.asarray(X_train)
        print(f"Learning LSTM baseline from {len(X_train)} training samples...")

        n_windows = len(X_train) - self.seq_length
        if n_windows <= 0:
            raise ValueError(
                f"Need more than seq_length={self.seq_length} training samples, "
                f"got {len(X_train)}"
            )

        # Window = rows [start, start + seq_length); target = the next row.
        self.baseline_errors = [
            self._prediction_error(
                X_train[start:start + self.seq_length],
                X_train[start + self.seq_length],
            )
            for start in range(n_windows)
        ]

        self.threshold = np.percentile(self.baseline_errors, self.threshold_percentile)
        print(f"LSTM baseline threshold: {self.threshold:.4f}")

    def score_sequences(self, X_test: np.ndarray) -> np.ndarray:
        """
        Score every row of ``X_test``; a score >= 1.0 is counted as anomalous.

        The first ``seq_length`` rows have no full history, so they receive
        the neutral score mean(baseline errors) / threshold.

        Args:
            X_test: 2-D array of shape (n_samples, n_features), rows in
                time order.

        Returns:
            1-D array with one score per row.

        Raises:
            ValueError: If ``fit_baseline`` has not been called, or a scored
                window contains NaN or infinity.
        """
        if self.threshold is None:
            raise ValueError("Must call fit_baseline() first")

        X_test = np.asarray(X_test)
        print(f"Scoring {len(X_test)} sequences with LSTM...")

        warmup_score = np.mean(self.baseline_errors) / self.threshold
        scores = []
        for i in range(len(X_test)):
            if i < self.seq_length:
                scores.append(warmup_score)
            else:
                mse = self._prediction_error(X_test[i - self.seq_length:i], X_test[i])
                scores.append(mse / self.threshold)

        scores = np.array(scores)
        lstm_anomalies = int(np.count_nonzero(scores >= 1.0))
        print(f"LSTM scoring complete. Anomalies detected: {lstm_anomalies}")
        return scores

class EnhancedDatasetQualityAnomalyDetector:
    """
    Multi-algorithm anomaly detector combining LSTM + PyOD methods.

    Detection pipeline:
    1. LSTM: Sequence-based reconstruction error analysis
    2. PyOD: Statistical outlier detection (ECOD, COPOD, LOF)
    3. Consensus: Weighted combination of both approaches
    4. Thresholding: Multiple severity levels (Critical/High/Medium/Low)

    Best for: Dataset-level quality monitoring, time-series anomalies
    """

    # Built-in settings. A caller-supplied config is merged over these, so a
    # partial config only needs to list the keys it wants to change.
    # 'use_autoencoder' is only echoed at start-up and 'hidden_dim' / 'epochs'
    # are not read by any method of this class.
    _BUILTIN_CONFIG: Dict[str, Any] = {
        'seq_length': 14,
        'contamination': 0.02,
        'pyod_method': 'ecod',
        'use_autoencoder': False,
        'lstm_score_threshold': 1.0,
        'pyod_score_threshold': 0.7,
        'consensus_weight_lstm': 0.6,
        'consensus_weight_pyod': 0.4,
        'consensus_threshold': 0.75,
        'hidden_dim': 64,
        'epochs': 50
    }

    def __init__(self, default_config: Optional[Dict[str, Any]] = None):
        """Store the detector configuration and print a summary of it.

        Args:
            default_config: Optional overrides for the built-in settings. Keys
                that are not supplied keep their built-in value; ``None`` or an
                empty dict selects the built-in settings unchanged.
        """
        print("Initializing Enhanced Dataset Quality Anomaly Detector...")

        # Merge instead of replace so a partial config cannot cause KeyErrors
        self.default_config = {**self._BUILTIN_CONFIG, **(default_config or {})}
        cfg = self.default_config

        print(f"Configuration:")
        print(f"   • LSTM sequence length: {cfg['seq_length']} days")
        print(f"   • PyOD algorithm: {cfg['pyod_method'].upper()}")
        print(f"   • Use autoencoder: {cfg['use_autoencoder']}")
        print(f"   • Thresholds: LSTM={cfg['lstm_score_threshold']}, PyOD={cfg['pyod_score_threshold']}")
        print(f"   • Consensus weights: LSTM={cfg['consensus_weight_lstm']}, PyOD={cfg['consensus_weight_pyod']}")
        print("Detector ready")

    def detect_dataset_anomalies(
        self,
        df: pd.DataFrame,
        timestamp_col: str = 'date',
        target_columns: Optional[List[str]] = None,
        target_dates: Optional[List[str]] = None,
        mode: str = 'specific_dates',
        config: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Main detection function combining LSTM + PyOD algorithms.

        Args:
            df: DataFrame with date column + numeric features
            timestamp_col: Date column name
            target_columns: Column-name prefixes to analyze; features are the
                numeric columns named ``<prefix>_...``. If None, every numeric
                column is used as a prefix, and when that finds nothing the
                numeric columns themselves are used as features.
            target_dates: Specific dates to check. Only honoured when
                ``mode == 'specific_dates'``; otherwise (or if empty/None) the
                latest 7 distinct dates are used. Dates are matched by equality
                or by their ``str()`` form, so ISO strings match
                ``datetime.date`` values.
            mode: 'specific_dates' or 'rolling_window'
            config: Override default detection parameters

        Returns:
            Dict with anomaly scores, dates, severity levels, and model results

        Raises:
            ValueError: If ``df`` is empty, no target date occurs in
                ``timestamp_col``, every row is a target (nothing left to train
                on), or no numeric feature column is found.
        """

        print(f"\nENHANCED DATASET ANOMALY DETECTION")
        print(f"Input shape: {df.shape}")
        print(f"Mode: {mode}")

        if len(df) == 0:
            raise ValueError("Input DataFrame is empty")

        # Configuration: per-call overrides win over the instance defaults
        detection_config = {**self.default_config, **(config or {})}

        # Get dates
        all_dates = sorted(df[timestamp_col].unique())
        print(f"Date range: {all_dates[0]} to {all_dates[-1]} ({len(all_dates)} days)")

        if mode == 'specific_dates' and target_dates:
            print(f"Target dates: {target_dates}")
        else:
            target_dates = all_dates[-7:]
            print(f"Using latest 7 days as targets")

        # Match on the string form as well as by equality so that string
        # targets still select rows whose dates are date/datetime objects.
        target_keys = {str(d) for d in target_dates}

        def is_target(date: Any) -> bool:
            return str(date) in target_keys or date in target_dates

        # Fail early with a clear message instead of crashing later on
        # empty score arrays.
        if not any(is_target(d) for d in all_dates):
            raise ValueError(
                f"None of the target dates {list(target_dates)} were found in "
                f"column '{timestamp_col}'"
            )

        # Handle target columns
        auto_detected = target_columns is None
        if auto_detected:
            target_columns = [col for col in df.select_dtypes(include=[np.number]).columns
                              if col != timestamp_col]
        print(f"Target columns: {target_columns}")

        # Get features
        available_features = self._get_target_features(df, target_columns, timestamp_col)
        if not available_features and auto_detected:
            # Auto-detected names are the feature columns themselves, so a
            # "<name>_" prefix search may legitimately find nothing.
            available_features = list(target_columns)
        print(f"Found {len(available_features)} relevant features")

        # Prepare data (rows come back in chronological order)
        X, dates, feature_names = self._prepare_model_features(df, timestamp_col, available_features)

        # Split data
        train_indices: List[int] = []
        test_indices: List[int] = []
        for i, date in enumerate(dates):
            (test_indices if is_target(date) else train_indices).append(i)

        print(f"Training: {len(train_indices)} days, Testing: {len(test_indices)} days")

        if not train_indices:
            raise ValueError("No training rows remain after removing the target dates")

        # Run detection
        results = self._run_enhanced_detection(
            X, dates, detection_config, train_indices, test_indices
        )

        # Analyze results
        detailed_results = self._analyze_enhanced_anomalies(
            results, dates, target_columns, test_indices, detection_config
        )

        print(f"\nDETECTION COMPLETE!")
        print(f"Anomalies detected: {len(detailed_results['anomalous_dates'])}")
        return detailed_results

    def _run_enhanced_detection(
        self,
        X: np.ndarray,
        dates: np.ndarray,
        config: Dict[str, Any],
        train_indices: List[int],
        test_indices: List[int]
    ) -> Dict[str, Any]:
        """
        Core detection pipeline running LSTM + PyOD.

        Pipeline steps:
        1. Fit the LSTM scorer on the training rows, score all rows
        2. Train PyOD model, score all samples
        3. Apply individual thresholds to get binary predictions
        4. Compute consensus scores using weighted combination
        5. Return comprehensive results for analysis

        Config keys read: 'seq_length', 'contamination', 'pyod_method',
        'lstm_score_threshold', 'pyod_score_threshold' (optional; the threshold
        learned by the PyOD step is used when absent), 'consensus_weight_lstm',
        'consensus_weight_pyod', 'consensus_threshold' and the optional
        'lstm_threshold_percentile' (default 95.0).

        Returns:
            Dict with per-row score arrays ('lstm_scores', 'pyod_scores',
            'consensus_scores'), 0/1 flag arrays ('lstm_anomalies',
            'pyod_anomalies', 'consensus_anomalies', 'combined_anomalies'),
            the learned 'pyod_threshold', the 'pyod_model' (None on the
            fallback path), and the config and index lists that were used.
        """

        print(f"\nEnhanced Detection Pipeline Starting...")

        X_train = X[train_indices]

        # 1. LSTM Scoring
        print(f"\n1. LSTM SEQUENCE ANOMALY DETECTION")
        lstm_scorer = RealisticLSTMAnomalyScorer(
            seq_length=config['seq_length'],
            threshold_percentile=config.get('lstm_threshold_percentile', 95.0)
        )
        lstm_scorer.fit_baseline(X_train)
        all_lstm_scores = lstm_scorer.score_sequences(X)

        # 2. PyOD Scoring
        print(f"\n2. PyOD STATISTICAL ANOMALY DETECTION")
        pyod_results = self._run_real_pyod_detection(
            X, train_indices, test_indices,
            config['contamination'],
            config.get('pyod_method', 'ecod')
        )
        all_pyod_scores = pyod_results['scores']
        pyod_threshold_learned = pyod_results['threshold']

        # 3. Apply thresholds
        print(f"\n3. APPLYING THRESHOLDS")
        lstm_threshold = config['lstm_score_threshold']
        # NOTE(review): raw PyOD scores are not bounded to [0, 1]; a fixed
        # cut-off such as 0.7 may not suit them - confirm the intended scale.
        pyod_threshold = config.get('pyod_score_threshold', pyod_threshold_learned)

        lstm_anomalies = (all_lstm_scores >= lstm_threshold).astype(int)
        pyod_anomalies = (all_pyod_scores >= pyod_threshold).astype(int)

        print(f"LSTM anomalies: {int(lstm_anomalies.sum())} (threshold: {lstm_threshold})")
        print(f"PyOD anomalies: {int(pyod_anomalies.sum())} (threshold: {pyod_threshold:.6f})")

        # 4. Consensus scoring
        print(f"\n4. CONSENSUS ENSEMBLE SCORING")
        consensus_scores = (
            config['consensus_weight_lstm'] * all_lstm_scores +
            config['consensus_weight_pyod'] * all_pyod_scores
        )

        consensus_anomalies = (consensus_scores >= config['consensus_threshold']).astype(int)
        # "Combined" = flagged by at least one of the two detectors
        combined_anomalies = np.logical_or(lstm_anomalies, pyod_anomalies).astype(int)

        print(f"Consensus anomalies: {int(consensus_anomalies.sum())} (threshold: {config['consensus_threshold']})")
        print(f"Combined anomalies: {int(combined_anomalies.sum())}")

        return {
            'lstm_anomalies': lstm_anomalies,
            'pyod_anomalies': pyod_anomalies,
            'consensus_anomalies': consensus_anomalies,
            'combined_anomalies': combined_anomalies,
            'lstm_scores': all_lstm_scores,
            'pyod_scores': all_pyod_scores,
            'consensus_scores': consensus_scores,
            'pyod_threshold': pyod_threshold_learned,
            'pyod_model': pyod_results.get('model'),
            'model_config': config,
            'train_indices': train_indices,
            'test_indices': test_indices
        }

    def _run_real_pyod_detection(
        self,
        X: np.ndarray,
        train_indices: List[int],
        test_indices: List[int],
        contamination: float,
        method: str = 'ecod'
    ) -> Dict[str, Any]:
        """
        Run PyOD statistical outlier detection.

        Supported methods:
        - ECOD: Empirical Cumulative Distribution Outlier Detection
        - COPOD: Copula-based Outlier Detection
        - LOF: Local Outlier Factor

        Any other method name falls back to ECOD. The model is fitted on the
        training rows; training rows are scored with the model's stored
        training scores and test rows with ``decision_function``.

        Falls back to statistical distance method if PyOD unavailable.

        Returns:
            Dict with 'scores' (one per row of X), 'threshold' (the model's
            learned threshold), 'test_scores' and 'model'.
        """

        print(f"Running REAL PyOD detection with {method.upper()}...")

        train_idx = np.asarray(train_indices, dtype=int)
        test_idx = np.asarray(test_indices, dtype=int)
        X_train = X[train_idx]
        X_test = X[test_idx]

        try:
            # Import and initialize PyOD model
            if method == 'copod':
                from pyod.models.copod import COPOD
                model = COPOD(contamination=contamination)
                print(f"   COPOD: Copula-based Outlier Detection")
            elif method == 'lof':
                from pyod.models.lof import LOF
                model = LOF(contamination=contamination)
                print(f"   LOF: Local Outlier Factor")
            else:
                from pyod.models.ecod import ECOD
                model = ECOD(contamination=contamination)
                if method == 'ecod':
                    print(f"   ECOD: Empirical Cumulative Distribution Outlier Detection")
                else:
                    print(f"   Unknown method '{method}', using ECOD as fallback")

            print(f"   Training {method.upper()} on {len(train_idx)} historical samples...")
            model.fit(X_train)

            print(f"   Scoring {len(test_idx)} test samples...")
            if test_idx.size:
                test_scores = model.decision_function(X_test)
            else:
                test_scores = np.array([], dtype=float)
            threshold = model.threshold_

            # Full score array: training rows take the scores PyOD stored
            # during fit, so they are not left at a misleading 0.0.
            all_scores = np.zeros(len(X))
            all_scores[train_idx] = model.decision_scores_
            all_scores[test_idx] = test_scores

            print(f"   PyOD threshold: {threshold:.6f}")
            if test_scores.size:
                print(f"   Score range: [{test_scores.min():.4f}, {test_scores.max():.4f}]")

            return {
                'scores': all_scores,
                'threshold': threshold,
                'test_scores': test_scores,
                'model': model
            }

        except ImportError as e:
            print(f"   PyOD not available, using fallback: {e}")
            return self._generate_fallback_pyod_scores(X, train_indices, test_indices)

    def _generate_fallback_pyod_scores(
        self,
        X: np.ndarray,
        train_indices: List[int],
        test_indices: List[int]
    ) -> Dict[str, Any]:
        """
        Fallback statistical outlier detection when PyOD unavailable.

        Each row's score is a random Beta-distributed base value (Beta(2, 5)
        for training rows, Beta(3, 3) for all others) plus one tenth of the
        row's mean absolute z-distance from the training mean, capped at 1.0.
        The threshold is the 95th percentile of the training-row scores.

        NOTE(review): the base value is drawn from NumPy's global random
        state, so results vary between runs unless the seed is fixed.

        Raises:
            ValueError: If ``train_indices`` is empty.
        """

        print(f"   Using statistical fallback for PyOD scores...")

        train_idx = np.asarray(train_indices, dtype=int)
        test_idx = np.asarray(test_indices, dtype=int)
        if train_idx.size == 0:
            raise ValueError("Fallback PyOD scoring needs at least one training sample")

        X_train = X[train_idx]
        feature_means = np.mean(X_train, axis=0)
        feature_stds = np.std(X_train, axis=0)

        # Set lookup keeps the per-row membership test O(1). Draws stay in
        # row order so a fixed seed yields the same random stream as before.
        train_lookup = set(train_idx.tolist())
        base_scores = np.array(
            [np.random.beta(2, 5) if i in train_lookup else np.random.beta(3, 3)
             for i in range(len(X))],
            dtype=float
        )

        # 1e-8 guards against division by zero for constant features
        distances = np.abs(X - feature_means) / (feature_stds + 1e-8)
        all_scores = np.minimum(base_scores + distances.mean(axis=1) / 10, 1.0)

        test_scores = all_scores[test_idx]
        threshold = np.percentile(all_scores[train_idx], 95)

        return {
            'scores': all_scores,
            'threshold': threshold,
            'test_scores': test_scores,
            'model': None
        }

    @staticmethod
    def _classify_severity(lstm_score: float, pyod_score: float, consensus_score: float) -> str:
        """Map the three scores of an already-flagged date to a severity label."""
        if lstm_score >= 1.5 and pyod_score >= 0.8:
            return 'CRITICAL'
        if consensus_score >= 0.8:
            return 'HIGH'
        if consensus_score >= 0.6:
            return 'MEDIUM'
        return 'LOW'

    @staticmethod
    def _summarize_scores(values: np.ndarray) -> Dict[str, float]:
        """Return min/max/mean/std of ``values`` (all 0.0 when empty)."""
        if len(values) == 0:
            return {'min': 0.0, 'max': 0.0, 'mean': 0.0, 'std': 0.0}
        return {
            'min': float(np.min(values)),
            'max': float(np.max(values)),
            'mean': float(np.mean(values)),
            'std': float(np.std(values))
        }

    def _analyze_enhanced_anomalies(
        self,
        results: Dict[str, Any],
        dates: np.ndarray,
        target_columns: List[str],
        test_indices: List[int],
        config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Analyze detection results with detailed scoring and severity classification.

        Only the rows listed in ``test_indices`` are reported. A date counts as
        anomalous when its consensus flag is set; its severity is then:
        - CRITICAL: High LSTM + PyOD scores (lstm>=1.5, pyod>=0.8)
        - HIGH: High consensus score (>=0.8)
        - MEDIUM: Medium consensus score (>=0.6)
        - LOW: Low consensus score but above threshold

        Results are keyed by ``str(date)``, so rows sharing a date overwrite
        each other. With no test rows, score statistics are reported as 0.0.

        Returns:
            Dict with 'total_days_analyzed', 'anomalous_dates',
            'anomaly_scores', 'anomaly_severity', 'score_statistics',
            'model_results' (the ``results`` argument) and 'summary_stats'.
        """

        print(f"Analyzing enhanced anomaly results...")

        n_days = len(test_indices)
        anomaly_analysis = {
            'total_days_analyzed': n_days,
            'anomalous_dates': [],
            'anomaly_scores': {},
            'anomaly_severity': {},
            'score_statistics': {},
            'model_results': results,
            'summary_stats': {}
        }

        # Analyze each test date
        consensus_anomalies = results['consensus_anomalies']

        for date_idx in test_indices:
            date_str = str(dates[date_idx])

            # Get scores for this date
            lstm_score = results['lstm_scores'][date_idx]
            pyod_score = results['pyod_scores'][date_idx]
            consensus_score = results['consensus_scores'][date_idx]

            # Store detailed scores
            anomaly_analysis['anomaly_scores'][date_str] = {
                'lstm_score': float(lstm_score),
                'pyod_score': float(pyod_score),
                'consensus_score': float(consensus_score),
                'lstm_detected': bool(results['lstm_anomalies'][date_idx]),
                'pyod_detected': bool(results['pyod_anomalies'][date_idx]),
                'consensus_detected': bool(consensus_anomalies[date_idx])
            }

            if not consensus_anomalies[date_idx]:
                print(f"   {date_str}: Normal (consensus: {consensus_score:.3f})")
                continue

            # Determine severity
            severity = self._classify_severity(lstm_score, pyod_score, consensus_score)
            anomaly_analysis['anomalous_dates'].append(date_str)
            anomaly_analysis['anomaly_severity'][date_str] = severity
            print(f"   {date_str}: {severity} anomaly (consensus: {consensus_score:.3f})")

        # Calculate statistics over the test rows only
        test_idx = np.asarray(test_indices, dtype=int)
        anomaly_analysis['score_statistics'] = {
            key: self._summarize_scores(np.asarray(results[key])[test_idx])
            for key in ('lstm_scores', 'pyod_scores', 'consensus_scores')
        }

        # Summary stats
        severities = list(anomaly_analysis['anomaly_severity'].values())
        n_anomalies = len(anomaly_analysis['anomalous_dates'])
        anomaly_analysis['summary_stats'] = {
            'total_anomalies': n_anomalies,
            'anomaly_rate': n_anomalies / n_days if n_days else 0,
            'critical_count': severities.count('CRITICAL'),
            'high_severity_count': severities.count('HIGH'),
            'target_columns_analyzed': target_columns,
            'thresholds_used': {
                'lstm_threshold': config['lstm_score_threshold'],
                # The key is optional in the pipeline; report the learned
                # threshold when it was not configured.
                'pyod_threshold': config.get('pyod_score_threshold', results.get('pyod_threshold')),
                'consensus_threshold': config['consensus_threshold']
            }
        }

        return anomaly_analysis

    def _get_target_features(self, df: pd.DataFrame, target_columns: List[str],
                             timestamp_col: str = 'date') -> List[str]:
        """Return the sorted, de-duplicated numeric columns named ``<target>_...``.

        Args:
            df: Frame whose columns are searched.
            target_columns: Prefixes; a column qualifies when its name starts
                with one of them followed by an underscore.
            timestamp_col: Column that is never treated as a feature.
        """
        prefixes = tuple(f"{target_col}_" for target_col in target_columns)
        excluded = {'date', timestamp_col}

        relevant_features = {
            col for col in df.columns
            if isinstance(col, str)
            and col not in excluded
            and col.startswith(prefixes)
            and pd.api.types.is_numeric_dtype(df[col])
        }

        return sorted(relevant_features)

    def _prepare_model_features(self, df: pd.DataFrame, timestamp_col: str,
                               feature_columns: List[str]) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Build the scaled feature matrix used by both detectors.

        Rows are ordered by ``timestamp_col`` (stable sort) because the LSTM
        scorer treats neighbouring rows as consecutive time steps. NaN becomes
        0.0, +/-inf becomes +/-1e10, then each feature is standardized.

        NOTE(review): the scaler is fitted on every row, including the dates
        that are later scored - confirm this is intended.

        Returns:
            Tuple of (scaled matrix, dates aligned with its rows, names of the
            feature columns that were used).

        Raises:
            ValueError: If none of ``feature_columns`` is a numeric column of ``df``.
        """

        # Get numeric features
        numeric_features = [
            col for col in feature_columns
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
        ]

        if not numeric_features:
            raise ValueError("No numeric features found")

        # Extract data in chronological order; mergesort keeps the original
        # order of rows that share a timestamp.
        ordered = df.sort_values(timestamp_col, kind='mergesort')
        X = ordered[numeric_features].values.astype(np.float64)
        dates = ordered[timestamp_col].values

        # Handle NaN/inf
        X = np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        print(f"Prepared {X_scaled.shape[0]} samples with {X_scaled.shape[1]} features")

        return X_scaled, dates, numeric_features

**Instruction**

# Enhanced Dataset Quality Anomaly Detector - Usage Guide

## Overview
Multi-algorithm anomaly detection system combining LSTM sequence analysis with PyOD statistical methods for dataset quality monitoring.

---

## 1. Initialize Detector

```python
detector = EnhancedDatasetQualityAnomalyDetector(
    default_config={
        'seq_length': 14,                    # LSTM lookback window (days)
        'contamination': 0.02,               # Expected anomaly rate (2%)
        'pyod_method': 'ecod',               # PyOD algorithm: 'ecod', 'copod', 'lof'
        'use_autoencoder': False,            # Enable deep learning (optional)
        'lstm_score_threshold': 1.0,         # LSTM anomaly threshold
        'pyod_score_threshold': 0.7,         # PyOD anomaly threshold
        'consensus_weight_lstm': 0.6,        # LSTM weight in consensus (60%)
        'consensus_weight_pyod': 0.4,        # PyOD weight in consensus (40%)
        'consensus_threshold': 0.75,         # Final anomaly threshold
        'hidden_dim': 64,                    # Neural network size
        'epochs': 50                         # Training iterations
    }
)
```

---

## 2. Run Detection

```python
results = detector.detect_dataset_anomalies(
    df=transformed_datatype_df,              # Your prepared DataFrame
    mode='specific_dates',                   # Detection mode
    target_dates=['2023-02-27'],             # Dates to analyze
    target_columns=['aimp', 'amud', 'arnd']  # Columns to monitor
)
```

### Parameters:
- **df**: DataFrame with date column + numeric features
- **mode**: `'specific_dates'` or `'rolling_window'`
- **target_dates**: List of dates to check (uses latest 7 if None)
- **target_columns**: Columns to analyze (auto-detected if None)

---

## 3. Access Results

### Individual Date Scores
```python
# Get anomaly scores for specific date
if '2023-02-27' in results.get('anomaly_scores', {}):
    scores = results['anomaly_scores']['2023-02-27']
    print(f'LSTM Score: {scores["lstm_score"]:.3f}')
    print(f'PyOD Score: {scores["pyod_score"]:.3f}')
    print(f'Consensus Score: {scores["consensus_score"]:.3f}')
    print(f'Final Decision: {"ANOMALY" if scores["consensus_detected"] else "NORMAL"}')
```

### Summary Statistics
```python
# Get summary statistics
print(f'Total anomalies: {results["summary_stats"]["total_anomalies"]}')
print(f'Anomaly rate: {results["summary_stats"]["anomaly_rate"]*100:.1f}%')
print(f'Critical alerts: {results["summary_stats"]["critical_count"]}')
```

### Severity Levels
```python
# Check severity classifications
for date, severity in results['anomaly_severity'].items():
    print(f'{date}: {severity}')
    # CRITICAL: lstm>=1.5 + pyod>=0.8
    # HIGH: consensus>=0.8
    # MEDIUM: consensus>=0.6
    # LOW: above threshold
```

---

## 4. Result Structure

```python
results = {
    'anomalous_dates': ['2023-02-27'],           # List of anomalous dates
    'anomaly_scores': {                          # Detailed scores per date
        '2023-02-27': {
            'lstm_score': 1.45,
            'pyod_score': 0.82,
            'consensus_score': 1.20,             # 0.6 * 1.45 + 0.4 * 0.82
            'consensus_detected': True
        }
    },
    'anomaly_severity': {                        # Severity classification
        '2023-02-27': 'HIGH'
    },
    'summary_stats': {                           # Overall statistics
        'total_anomalies': 1,
        'anomaly_rate': 0.14,
        'critical_count': 0,
        'high_severity_count': 1
    },
    'score_statistics': {                        # Score distributions
        'lstm_scores': {'min': 0.45, 'max': 1.45, 'mean': 0.82},
        'pyod_scores': {'min': 0.12, 'max': 0.82, 'mean': 0.41}
    }
}
```

---

## 5. Configuration Tips

### For Sensitive Detection (catch more anomalies):
```python
config = {
    'lstm_score_threshold': 0.8,        # Lower threshold
    'consensus_threshold': 0.6,         # Lower consensus
    'contamination': 0.05               # Higher expected rate
}
```

### For Conservative Detection (fewer false alarms):
```python
config = {
    'lstm_score_threshold': 1.2,        # Higher threshold
    'consensus_threshold': 0.8,         # Higher consensus
    'contamination': 0.01               # Lower expected rate
}
```

---


## 6. Quick Start Example

```python
# Initialize with default settings
detector = EnhancedDatasetQualityAnomalyDetector()

# Run detection on latest data
results = detector.detect_dataset_anomalies(
    df=your_dataframe,
    target_dates=['2023-02-27']
)

# Get simple yes/no answer
is_anomaly = '2023-02-27' in results['anomalous_dates']
print(f"2023-02-27 is {'ANOMALOUS' if is_anomaly else 'NORMAL'}")
```


**LSTM Execution**

# Parameter Configuration Guide

## LSTM Parameters

### `seq_length: 14`
**Purpose**: Historical window size for sequence analysis  
**How it works**: LSTM examines 14 consecutive days to predict the next day's pattern  
**Trade-offs**:
- Higher values (21-30) = More context, better trend detection, slower processing
- Lower values (7-10) = Faster processing, may miss long-term patterns
- Recommended: 14 days captures 2-week business cycles

### `lstm_score_threshold: 1.0`
**Purpose**: Anomaly detection cutoff for LSTM predictions  
**How it works**: The LSTM score is the reconstruction error divided by the baseline threshold, so a score ≥ 1.0 (error at or above the baseline threshold) is flagged as an anomaly  
**Sensitivity tuning**:
- **More sensitive** (0.8): Catches subtle anomalies, more false positives
- **Less sensitive** (1.2): Only major anomalies, fewer false positives
- **Default** (1.0): Balanced detection based on 95th percentile training errors

---

## PyOD Statistical Parameters

### `contamination: 0.02`
**Purpose**: Expected percentage of anomalous data points  
**How it works**: Tells PyOD algorithms that ~2% of data should be outliers  
**Guidelines**:
- **High quality data** (0.01): Very few expected anomalies
- **Moderate quality** (0.02): Standard business data assumption
- **Poor quality data** (0.05): Higher anomaly rate expected

### `pyod_score_threshold: 0.7`
**Purpose**: Statistical outlier detection cutoff  
**How it works**: PyOD scores ≥ 0.7 are considered anomalous  
**Sensitivity tuning**:
- **More sensitive** (0.5): Detects borderline statistical outliers
- **Less sensitive** (0.8): Only clear statistical anomalies
- **Auto-learned**: Can use PyOD's built-in threshold instead

---

## Consensus Ensemble Scoring

### `consensus_weight_lstm: 0.6` & `consensus_weight_pyod: 0.4`
**Purpose**: Relative importance of each detection method  
**How it works**: Final score = (0.6 × LSTM) + (0.4 × PyOD)  
**When to adjust**:
- **Time-series focus** (0.7/0.3): Emphasize sequence patterns
- **Statistical focus** (0.4/0.6): Emphasize outlier detection
- **Balanced** (0.5/0.5): Equal weight to both approaches

### `consensus_threshold: 0.75`
**Purpose**: Final decision boundary for anomaly classification  
**How it works**: Combined weighted score ≥ 0.75 = anomaly  
**Decision impact**:
- **Conservative** (0.8): Requires strong agreement between methods
- **Moderate** (0.75): Standard threshold for business applications
- **Aggressive** (0.6): Flags more potential issues for investigation

---

## Model Architecture

### `hidden_dim: 64`
**Purpose**: LSTM neural network complexity  
**How it works**: Number of neurons in hidden layers  
**Resource trade-offs**:
- **Lightweight** (32): Faster training, simpler patterns
- **Standard** (64): Good balance for most datasets
- **Complex** (128): Captures intricate patterns, slower training

### `epochs: 50`
**Purpose**: Training iteration count  
**How it works**: Number of complete passes through training data  
**Training balance**:
- **Quick** (25): Fast training, may underfit
- **Standard** (50): Adequate learning for most cases
- **Thorough** (100): Better convergence, risk of overfitting

---

## Tuning Recommendations

### For High-Sensitivity Detection
```python
config = {
    'lstm_score_threshold': 0.8,      # Lower threshold
    'pyod_score_threshold': 0.6,      # More sensitive
    'consensus_threshold': 0.65,      # Easier to trigger
    'contamination': 0.03             # Expect more anomalies
}
```

### For Conservative Detection  
```python
config = {
    'lstm_score_threshold': 1.2,      # Higher threshold
    'pyod_score_threshold': 0.8,      # Less sensitive
    'consensus_threshold': 0.8,       # Require strong signal
    'contamination': 0.01             # Expect fewer anomalies
}
```

### For Performance Optimization
```python
config = {
    'seq_length': 10,                 # Shorter sequences
    'hidden_dim': 32,                 # Smaller network
    'epochs': 25,                     # Faster training
    'pyod_method': 'ecod'            # Fastest PyOD algorithm
}
```

**Performance**

**More Sensitive Detection:**

*  'lstm_score_threshold': 0.8,     # Lower threshold
*   'pyod_score_threshold': 0.5,     # Lower threshold
*   'contamination': 0.05,           # Expect more anomalies
*   'consensus_threshold': 0.6       # Lower combined threshold

**More Conservative Detection:**

*   'lstm_score_threshold': 1.5,     # Higher threshold
*   'contamination': 0.01,           # Expect fewer anomalies
*   'consensus_threshold': 0.9       # Higher combined threshold

**Performance vs Accuracy:**

*   'seq_length': 7,        # Faster but less context
*   'hidden_dim': 32,       # Faster but simpler model
*   'epochs': 25            # Faster training

**Trust LSTM more:**
*   'consensus_weight_lstm': 0.8,
*   'consensus_weight_pyod': 0.2





In [None]:
# Build the detector with every setting spelled out explicitly.
# NOTE(review): in the detector class as defined in this notebook,
# 'use_autoencoder' is only echoed at start-up and 'hidden_dim' / 'epochs'
# are never read.
detector = EnhancedDatasetQualityAnomalyDetector(
    default_config=dict(
        seq_length=14,
        contamination=0.02,
        pyod_method='ecod',
        use_autoencoder=True,
        lstm_score_threshold=1.0,
        pyod_score_threshold=0.7,
        consensus_weight_lstm=0.6,
        consensus_weight_pyod=0.4,
        consensus_threshold=0.75,
        hidden_dim=64,
        epochs=50,
    )
)

Initializing Enhanced Dataset Quality Anomaly Detector...
Configuration:
   • LSTM sequence length: 14 days
   • PyOD algorithm: ECOD
   • Use autoencoder: True
   • Thresholds: LSTM=1.0, PyOD=0.7
   • Consensus weights: LSTM=0.6, PyOD=0.4
Detector ready


In [None]:
# Score one specific day; all other dates in the frame serve as training
# history. Features are the numeric columns prefixed "aimp_", "amud_", "arnd_".
lstm_pyod_results = detector.detect_dataset_anomalies(
    transformed_datatype_df,
    target_columns=['aimp', 'amud', 'arnd'],
    target_dates=['2023-02-27'],
    mode='specific_dates',
)


ENHANCED DATASET ANOMALY DETECTION
Input shape: (58, 201)
Mode: specific_dates
Date range: 2023-01-01 to 2023-02-27 (58 days)
Target dates: ['2023-02-27']
Target columns: ['aimp', 'amud', 'arnd']
Found 200 relevant features
Prepared 58 samples with 200 features
Training: 57 days, Testing: 1 days

Enhanced Detection Pipeline Starting...

1. LSTM SEQUENCE ANOMALY DETECTION
LSTM Scorer initialized: seq_length=14, threshold_percentile=95.0
Learning LSTM baseline from 57 training samples...
LSTM baseline threshold: 2.0867
Scoring 58 sequences with LSTM...
LSTM scoring complete. Anomalies detected: 3

2. PyOD STATISTICAL ANOMALY DETECTION
Running REAL PyOD detection with ECOD...
   PyOD not available, using fallback: No module named 'pyod'
   Using statistical fallback for PyOD scores...

3. APPLYING THRESHOLDS
LSTM anomalies: 3 (threshold: 1.0)
PyOD anomalies: 4 (threshold: 0.700000)

4. CONSENSUS ENSEMBLE SCORING
Consensus anomalies: 2 (threshold: 0.75)
Combined anomalies: 7
Analyzing enh

**Apply user-defined threshold**

In [None]:
from typing import Dict, Any

class SimpleAnomalyController:
    """
    Simple sensitivity control for anomaly detection.
    App teams provide a single sensitivity level to adjust detection behavior.

    The level is translated into a multiplier on the base consensus threshold
    and the target date's consensus score is re-evaluated against the scaled
    threshold. The results dictionary passed in is never modified; an updated
    copy is returned instead.
    """

    # Base consensus threshold from the model; multipliers scale this value.
    _BASE_CONSENSUS_THRESHOLD = 0.75

    # Sensitivity level -> threshold multiplier and human-readable description.
    _SENSITIVITY_CONFIGS = {
        'very_high': {
            'threshold_multiplier': 0.5,
            'description': 'Very sensitive - detects minor anomalies'
        },
        'high': {
            'threshold_multiplier': 1.0,
            'description': 'High sensitivity - default detection level'
        },
        'medium': {
            'threshold_multiplier': 2.0,
            'description': 'Medium sensitivity - reduces false positives'
        },
        'low': {
            'threshold_multiplier': 5.0,
            'description': 'Low sensitivity - only major anomalies'
        },
        'very_low': {
            'threshold_multiplier': 10.0,
            'description': 'Very low sensitivity - minimal alerts'
        }
    }

    def __init__(self):
        print("Simple Anomaly Controller initialized")

    @staticmethod
    def _detached_copy(results: Dict[str, Any], target_date: str) -> Dict[str, Any]:
        """Copy `results`, detaching every container that gets modified.

        A plain ``dict.copy()`` is shallow: the nested score record, the
        anomalous-dates list and the summary stats would still be shared with
        the caller's dictionary. Only those containers are copied here, so
        other (possibly large) entries are not duplicated.
        """
        updated = results.copy()

        scores_by_date = dict(results['anomaly_scores'])
        scores_by_date[target_date] = dict(scores_by_date[target_date])
        updated['anomaly_scores'] = scores_by_date

        updated['anomalous_dates'] = list(results.get('anomalous_dates', []))

        summary_stats = dict(results.get('summary_stats', {}))
        summary_stats['thresholds_used'] = dict(summary_stats.get('thresholds_used', {}))
        updated['summary_stats'] = summary_stats

        return updated

    def apply_sensitivity_control(
        self,
        lstm_pyod_results: Dict[str, Any],
        sensitivity_level: str,  # 'very_low', 'low', 'medium', 'high', 'very_high'
        target_date: str
    ) -> Dict[str, Any]:
        """
        Apply sensitivity control with single parameter.

        Args:
            lstm_pyod_results: Detection results from anomaly detector
            sensitivity_level: Controls detection sensitivity
                - 'very_low': Only critical anomalies (10x threshold)
                - 'low': Conservative detection (5x threshold)
                - 'medium': Moderate sensitivity (2x threshold)
                - 'high': Default sensitivity (1x threshold)
                - 'very_high': Catches minor anomalies (0.5x threshold)
            target_date: Date to analyze

        Returns:
            Updated copy of the results with the new anomaly decision. The
            input object itself is returned, unchanged, when the sensitivity
            level is invalid or the date has no anomaly scores.
        """

        print(f"Applying {sensitivity_level.upper()} sensitivity for {target_date}")

        # Validate sensitivity level
        if sensitivity_level not in self._SENSITIVITY_CONFIGS:
            available_levels = list(self._SENSITIVITY_CONFIGS.keys())
            print(f"Invalid sensitivity level. Available options: {available_levels}")
            return lstm_pyod_results

        # Get configuration for selected sensitivity
        config = self._SENSITIVITY_CONFIGS[sensitivity_level]
        print(f"Configuration: {config['description']}")

        # Calculate adjusted consensus threshold
        original_consensus_threshold = self._BASE_CONSENSUS_THRESHOLD
        adjusted_consensus_threshold = original_consensus_threshold * config['threshold_multiplier']

        if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
            print(f"No anomaly scores found for date: {target_date}")
            return lstm_pyod_results

        date_scores = lstm_pyod_results['anomaly_scores'][target_date]

        # Calculate new anomaly decision based on adjusted threshold
        original_decision = date_scores['consensus_detected']
        adjusted_decision = date_scores['consensus_score'] >= adjusted_consensus_threshold

        # Work on a detached copy so the caller's results keep the model output
        updated_results = self._detached_copy(lstm_pyod_results, target_date)
        updated_results['anomaly_scores'][target_date]['consensus_detected'] = adjusted_decision

        # Keep the anomalous dates list in sync with the new decision
        anomalous_dates = updated_results['anomalous_dates']
        if adjusted_decision and target_date not in anomalous_dates:
            anomalous_dates.append(target_date)
        elif not adjusted_decision and target_date in anomalous_dates:
            anomalous_dates.remove(target_date)

        # Log decision comparison
        original_status = "ANOMALY" if original_decision else "NORMAL"
        adjusted_status = "ANOMALY" if adjusted_decision else "NORMAL"

        print(f"Consensus score: {date_scores['consensus_score']:.3f}")
        print(f"Original threshold ({original_consensus_threshold}): {original_status}")
        print(f"Adjusted threshold ({adjusted_consensus_threshold:.3f}): {adjusted_status}")

        # Log override information
        if original_status != adjusted_status:
            if original_status == "ANOMALY" and adjusted_status == "NORMAL":
                print("USER OVERRIDE: Model detected anomaly, user tolerance treats as normal")
            else:
                print("USER OVERRIDE: Model missed anomaly, user sensitivity caught it")

        # Update metadata
        updated_results['summary_stats']['thresholds_used']['consensus_threshold'] = adjusted_consensus_threshold
        updated_results['summary_stats']['sensitivity_level'] = sensitivity_level

        return updated_results

    def get_sensitivity_recommendation(self, use_case: str) -> str:
        """
        Recommend sensitivity level based on business use case.

        Args:
            use_case: Business context for anomaly detection

        Returns:
            Recommended sensitivity level ('medium' for an unknown use case)
        """

        use_case_recommendations = {
            'financial_fraud_detection': 'high',
            'system_performance_monitoring': 'medium',
            'data_quality_monitoring': 'medium',
            'security_incident_detection': 'very_high',
            'business_kpi_monitoring': 'low',
            'operational_metrics': 'medium',
            'customer_behavior_analysis': 'low',
            'network_traffic_monitoring': 'high'
        }

        recommended_level = use_case_recommendations.get(use_case, 'medium')
        print(f"Recommended sensitivity for '{use_case}': {recommended_level}")

        return recommended_level


class PercentageAnomalyController:
    """
    Control anomaly detection using percentage-based tolerance.
    Simpler interface for app teams who prefer percentage values.

    The percentage is mapped linearly onto a threshold multiplier
    (0% -> 0.5x, 100% -> 10x) applied to the base consensus threshold.
    The results dictionary passed in is never modified; an updated copy is
    returned instead.
    """

    # Base consensus threshold; the tolerance multiplier scales this value.
    _BASE_CONSENSUS_THRESHOLD = 0.75

    def __init__(self):
        print("Percentage Anomaly Controller initialized")

    @staticmethod
    def _detached_copy(results: Dict[str, Any], target_date: str) -> Dict[str, Any]:
        """Copy `results`, detaching every container that gets modified.

        A plain ``dict.copy()`` is shallow: the nested score record, the
        anomalous-dates list and the summary stats would still be shared with
        the caller's dictionary. Only those containers are copied here.
        """
        updated = results.copy()

        scores_by_date = dict(results['anomaly_scores'])
        scores_by_date[target_date] = dict(scores_by_date[target_date])
        updated['anomaly_scores'] = scores_by_date

        updated['anomalous_dates'] = list(results.get('anomalous_dates', []))

        summary_stats = dict(results.get('summary_stats', {}))
        summary_stats['thresholds_used'] = dict(summary_stats.get('thresholds_used', {}))
        updated['summary_stats'] = summary_stats

        return updated

    def apply_tolerance_percentage(
        self,
        lstm_pyod_results: Dict[str, Any],
        tolerance_percent: float,  # 0-100: higher = less sensitive
        target_date: str
    ) -> Dict[str, Any]:
        """
        Apply tolerance using percentage scale.

        Args:
            lstm_pyod_results: Detection results from anomaly detector
            tolerance_percent: Tolerance level as percentage
                - 0%: Very sensitive (detects minor anomalies)
                - 25%: High sensitivity
                - 50%: Medium sensitivity (balanced)
                - 75%: Low sensitivity (conservative)
                - 100%: Very low sensitivity (minimal alerts)
            target_date: Date to analyze

        Returns:
            Updated copy of the results with the new anomaly decision. The
            input object itself is returned, unchanged, when the percentage is
            outside 0-100 or the date has no anomaly scores.
        """

        print(f"Applying {tolerance_percent}% tolerance for {target_date}")

        # Validate tolerance percentage
        if not 0 <= tolerance_percent <= 100:
            print("Tolerance percentage must be between 0 and 100")
            return lstm_pyod_results

        # Convert percentage to threshold multiplier
        # 0% -> 0.5x (more sensitive), 100% -> 10x (less sensitive)
        threshold_multiplier = 0.5 + (tolerance_percent / 100) * 9.5
        adjusted_threshold = self._BASE_CONSENSUS_THRESHOLD * threshold_multiplier

        if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
            print(f"No anomaly scores found for date: {target_date}")
            return lstm_pyod_results

        date_scores = lstm_pyod_results['anomaly_scores'][target_date]

        # Calculate new decision
        original_decision = date_scores['consensus_detected']
        adjusted_decision = date_scores['consensus_score'] >= adjusted_threshold

        # Work on a detached copy so the caller's results keep the model output
        updated_results = self._detached_copy(lstm_pyod_results, target_date)
        updated_results['anomaly_scores'][target_date]['consensus_detected'] = adjusted_decision

        # Keep the anomalous dates list in sync with the new decision
        anomalous_dates = updated_results['anomalous_dates']
        if adjusted_decision and target_date not in anomalous_dates:
            anomalous_dates.append(target_date)
        elif not adjusted_decision and target_date in anomalous_dates:
            anomalous_dates.remove(target_date)

        # Log decision details
        original_status = "ANOMALY" if original_decision else "NORMAL"
        adjusted_status = "ANOMALY" if adjusted_decision else "NORMAL"

        print(f"Consensus score: {date_scores['consensus_score']:.3f}")
        print(f"Tolerance threshold: {adjusted_threshold:.3f}")
        print(f"Original decision: {original_status}")
        print(f"Tolerance decision: {adjusted_status}")

        # Log override information
        if original_decision and not adjusted_decision:
            print(f"USER TOLERANCE: Model detected anomaly, {tolerance_percent}% tolerance treats as normal")
        elif not original_decision and adjusted_decision:
            print(f"USER TOLERANCE: Model missed anomaly, {tolerance_percent}% tolerance caught it")

        # Update metadata
        updated_results['summary_stats']['thresholds_used']['consensus_threshold'] = adjusted_threshold
        updated_results['summary_stats']['tolerance_percent'] = tolerance_percent

        return updated_results

    def get_tolerance_recommendation(self, business_impact: str) -> float:
        """
        Recommend tolerance percentage based on business impact.

        Args:
            business_impact: Level of business impact from false positives

        Returns:
            Recommended tolerance percentage (60.0 for an unknown impact level)
        """

        impact_recommendations = {
            'critical': 20.0,      # Low tolerance, catch more anomalies
            'high': 40.0,          # Moderate tolerance
            'medium': 60.0,        # Balanced approach
            'low': 80.0,           # Higher tolerance, fewer alerts
            'minimal': 90.0        # Very high tolerance, critical only
        }

        recommended_tolerance = impact_recommendations.get(business_impact, 60.0)
        print(f"Recommended tolerance for '{business_impact}' impact: {recommended_tolerance}%")

        return recommended_tolerance

**Manual model result override**

**# STEP-BY-STEP USAGE GUIDE**
"""
PRODUCTION USAGE FLOW:

*STEP 1: Initialize the manager*
manager = ProductionAnomalyManager()

*STEP 2: Run your existing detection (no changes needed)*
lstm_pyod_results = detector.detect_dataset_anomalies(...)

*STEP 3: Process with user decision (ONE SIMPLE CALL)*
final_results = manager.process_anomaly_detection(
    lstm_pyod_results=lstm_pyod_results,
    target_date='2023-02-27',
    user_action='force_normal',  # Single control value
    user_reason='Known data migration - not a real anomaly',
    user_id='analyst@company.com'
)

*STEP 4: Use final_results for downstream processing*
decision = final_results['anomaly_scores']['2023-02-27']['consensus_detected']

*STEP 5: Get audit trail when needed*
audit_df = manager.get_override_audit_trail()

USER_ACTION VALUES:
- 'accept_model': Use whatever model decided
- 'force_normal': Force as normal (treat as false positive)
- 'force_anomaly': Force as anomaly (treat as missed detection)
"""

In [None]:
import pandas as pd
from typing import Dict, List, Any

class AnomalyOverrideSystem:
    """
    Manual override system for production anomaly detection.
    Allows users to force classifications regardless of model scores.

    Every applied override is appended to ``override_log`` as an audit entry.
    The results dictionary passed in is never modified, so the model's
    original decision stays available to the caller.
    """

    def __init__(self):
        self.override_log = []
        print("Manual Override System Initialized")

    @staticmethod
    def _detached_copy(results: Dict[str, Any], target_date: str) -> Dict[str, Any]:
        """Copy `results`, detaching the containers an override modifies.

        A plain ``dict.copy()`` is shallow: the nested score record and the
        anomalous-dates list would still be shared with the caller's
        dictionary. Only those containers are copied here.
        """
        updated = results.copy()

        scores_by_date = dict(results['anomaly_scores'])
        scores_by_date[target_date] = dict(scores_by_date[target_date])
        updated['anomaly_scores'] = scores_by_date

        updated['anomalous_dates'] = list(results.get('anomalous_dates', []))

        return updated

    def apply_manual_override(
        self,
        lstm_pyod_results: Dict[str, Any],
        target_date: str,
        override_decision: str,  # 'force_normal', 'force_anomaly', 'model_decision'
        user_reason: str = "",
        user_id: str = "system"
    ) -> Dict[str, Any]:
        """
        Apply manual override regardless of model scores.

        Args:
            lstm_pyod_results: Detection results from model
            target_date: Date to override (e.g., '2023-02-27')
            override_decision: 'force_normal', 'force_anomaly', 'model_decision'
                (any other value is treated like 'model_decision')
            user_reason: Business justification for override
            user_id: Who made the override decision

        Returns:
            Updated copy of the results with the override applied. The input
            object itself is returned, unchanged, when the date has no results.
        """

        if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
            print(f"Error: No results found for {target_date}")
            return lstm_pyod_results

        # Get original model decision
        original_decision = lstm_pyod_results['anomaly_scores'][target_date]['consensus_detected']

        # Determine new decision based on override
        if override_decision == 'force_normal':
            new_decision = False
            override_type = "FORCE NORMAL"
        elif override_decision == 'force_anomaly':
            new_decision = True
            override_type = "FORCE ANOMALY"
        else:  # model_decision
            new_decision = original_decision
            override_type = "MODEL DECISION"

        # Work on a detached copy so the caller's results keep the model output
        updated_results = self._detached_copy(lstm_pyod_results, target_date)
        date_record = updated_results['anomaly_scores'][target_date]
        date_record['consensus_detected'] = new_decision
        date_record['manual_override'] = True
        date_record['override_type'] = override_type
        date_record['override_reason'] = user_reason
        date_record['override_user'] = user_id

        # Keep the anomalous dates list in sync with the final decision
        anomalous_dates = updated_results['anomalous_dates']
        if new_decision and target_date not in anomalous_dates:
            anomalous_dates.append(target_date)
        elif not new_decision and target_date in anomalous_dates:
            anomalous_dates.remove(target_date)

        # Create audit log entry
        override_entry = {
            'date': target_date,
            'original_decision': 'ANOMALY' if original_decision else 'NORMAL',
            'override_decision': 'ANOMALY' if new_decision else 'NORMAL',
            'override_type': override_type,
            'user_reason': user_reason,
            'user_id': user_id,
            'timestamp': pd.Timestamp.now(),
            'model_scores': {
                'lstm': date_record['lstm_score'],
                'pyod': date_record['pyod_score'],
                'consensus': date_record['consensus_score']
            }
        }

        self.override_log.append(override_entry)

        # Log override action
        print(f"\nMANUAL OVERRIDE APPLIED for {target_date}")
        print("=" * 50)
        print(f"Model Decision: {'ANOMALY' if original_decision else 'NORMAL'}")
        print(f"User Override: {override_type}")
        print(f"Final Decision: {'ANOMALY' if new_decision else 'NORMAL'}")
        print(f"Reason: {user_reason}")
        print(f"User: {user_id}")
        print("=" * 50)

        return updated_results

    def get_override_history(self) -> pd.DataFrame:
        """Get complete history of all manual overrides."""
        return pd.DataFrame(self.override_log)

    def validate_override_request(
        self,
        lstm_pyod_results: Dict[str, Any],
        target_date: str,
        override_decision: str
    ) -> Dict[str, Any]:
        """
        Validate override request and provide impact assessment.

        Args:
            lstm_pyod_results: Detection results
            target_date: Date to validate
            override_decision: Proposed override action

        Returns:
            Validation results with recommendations, or
            ``{"valid": False, "reason": ...}`` when the date has no results
        """

        if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
            return {"valid": False, "reason": "Date not found in results"}

        scores = lstm_pyod_results['anomaly_scores'][target_date]
        current_decision = scores['consensus_detected']

        validation = {
            "valid": True,
            "current_decision": "ANOMALY" if current_decision else "NORMAL",
            "requested_override": override_decision,
            "impact_assessment": self._assess_override_impact(scores, override_decision),
            "model_confidence": self._get_model_confidence(scores),
            "recommendation": self._get_override_recommendation(scores, override_decision)
        }

        return validation

    def _assess_override_impact(self, scores: Dict, override_decision: str) -> str:
        """Assess the business impact of the proposed override.

        Only 'force_normal' is graded by consensus score; every other value
        returns the generic "Forcing anomaly classification" message.
        """

        consensus_score = scores['consensus_score']

        if override_decision == 'force_normal':
            if consensus_score > 1000:
                return "HIGH IMPACT: Overriding very strong anomaly signal"
            elif consensus_score > 100:
                return "MEDIUM IMPACT: Overriding moderate anomaly signal"
            else:
                return "LOW IMPACT: Overriding weak anomaly signal"
        else:
            return "Forcing anomaly classification"

    def _get_model_confidence(self, scores: Dict) -> str:
        """Determine model confidence level from how many algorithms flagged the date."""

        both_agree = scores['lstm_detected'] and scores['pyod_detected']
        any_detected = scores['lstm_detected'] or scores['pyod_detected']

        if both_agree:
            return "HIGH (Both algorithms agree)"
        elif any_detected:
            return "MEDIUM (One algorithm detected)"
        else:
            return "LOW (No algorithms detected)"

    def _get_override_recommendation(self, scores: Dict, override_decision: str) -> str:
        """Provide recommendation on whether override is advisable.

        Only 'force_normal' is graded by consensus score; every other value
        returns "Manual anomaly classification".
        """

        consensus_score = scores['consensus_score']

        if override_decision == 'force_normal':
            if consensus_score > 500:
                return "NOT RECOMMENDED: Very strong anomaly signal"
            elif consensus_score > 100:
                return "CAUTION: Moderate anomaly signal - ensure business justification"
            else:
                return "ACCEPTABLE: Weak anomaly signal"

        return "Manual anomaly classification"

class ProductionAnomalyManager:
    """
    Complete production manager combining model detection with manual overrides.
    Single entry point for all anomaly processing decisions.
    """

    # user_action values that trigger a manual override.
    _OVERRIDE_ACTIONS = ('force_normal', 'force_anomaly')

    def __init__(self):
        self.override_system = AnomalyOverrideSystem()
        print("Production Anomaly Manager Ready")

    def process_anomaly_detection(
        self,
        lstm_pyod_results: Dict[str, Any],
        target_date: str,
        user_action: str = 'accept_model',  # 'accept_model', 'force_normal', 'force_anomaly'
        user_reason: str = "",
        user_id: str = "system"
    ) -> Dict[str, Any]:
        """
        Complete workflow: Model detection + User decision + Override if needed.

        Args:
            lstm_pyod_results: Results from LSTM+PyOD detection
            target_date: Date being processed
            user_action: What action user wants to take
            user_reason: Business justification for override
            user_id: User making the decision

        Returns:
            Final results with user decision applied. The input results are
            returned as-is for 'accept_model', for a date without scores and
            for an unrecognised user_action; the latter two cases are reported
            on stdout instead of being ignored silently.
        """

        print(f"\nPROCESSING ANOMALY DETECTION for {target_date}")
        print("=" * 60)

        if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
            print(f"No anomaly scores found for date: {target_date}")
            return lstm_pyod_results

        # Display model analysis
        scores = lstm_pyod_results['anomaly_scores'][target_date]

        print("MODEL ANALYSIS:")
        print(f"   LSTM Score: {scores['lstm_score']:.2f} -> {'ANOMALY' if scores['lstm_detected'] else 'NORMAL'}")
        print(f"   PyOD Score: {scores['pyod_score']:.2f} -> {'ANOMALY' if scores['pyod_detected'] else 'NORMAL'}")
        print(f"   Consensus: {scores['consensus_score']:.2f} -> {'ANOMALY' if scores['consensus_detected'] else 'NORMAL'}")

        model_decision = 'ANOMALY' if scores['consensus_detected'] else 'NORMAL'
        print(f"\nMODEL RECOMMENDATION: {model_decision}")

        # Process user action
        if user_action == 'accept_model':
            print("USER ACTION: Accepted model decision")
            return lstm_pyod_results

        if user_action in self._OVERRIDE_ACTIONS:
            # Show validation before applying override
            validation = self.override_system.validate_override_request(
                lstm_pyod_results, target_date, user_action
            )

            print("\nOVERRIDE VALIDATION:")
            print(f"   Impact: {validation['impact_assessment']}")
            print(f"   Model Confidence: {validation['model_confidence']}")
            print(f"   Recommendation: {validation['recommendation']}")

            # Apply the override
            return self.override_system.apply_manual_override(
                lstm_pyod_results, target_date, user_action, user_reason, user_id
            )

        # A mistyped action must not look like a successful run
        available_actions = ['accept_model', *self._OVERRIDE_ACTIONS]
        print(f"Invalid user_action '{user_action}'. Available options: {available_actions}")
        return lstm_pyod_results

    def get_override_audit_trail(self) -> pd.DataFrame:
        """Get complete audit trail of all manual overrides."""
        return self.override_system.get_override_history()



**On-demand execution failover mechanism**

Throughout this process, the `consensus_detected` value is modified manually:

*Before override*
'consensus_detected': True  # ANOMALY

*After override*
'consensus_detected': False  # NORMAL

In [None]:
# Create the single entry point used below to apply a manual override.
manager = ProductionAnomalyManager()

Manual Override System Initialized
Production Anomaly Manager Ready


In [None]:
# Force the target date to NORMAL; the reason and user id are recorded in the
# override audit trail (see manager.get_override_audit_trail()).
final_results = manager.process_anomaly_detection(
    lstm_pyod_results=lstm_pyod_results,
    target_date='2023-02-27',
    user_action='force_normal',  # SINGLE CONTROL VALUE
    user_reason='Known data migration',
    user_id='gcganamfrmu'
)


PROCESSING ANOMALY DETECTION for 2023-02-27
MODEL ANALYSIS:
   LSTM Score: 0.93 -> NORMAL
   PyOD Score: 1.00 -> ANOMALY
   Consensus: 0.96 -> ANOMALY

MODEL RECOMMENDATION: ANOMALY

OVERRIDE VALIDATION:
   Impact: LOW IMPACT: Overriding weak anomaly signal
   Model Confidence: MEDIUM (One algorithm detected)
   Recommendation: ACCEPTABLE: Weak anomaly signal

MANUAL OVERRIDE APPLIED for 2023-02-27
Model Decision: ANOMALY
User Override: FORCE NORMAL
Final Decision: NORMAL
Reason: Known data migration
User: gcganamfrmu


In [None]:
# Read the final (post-override) consensus flag for the target date.
decision = final_results['anomaly_scores']['2023-02-27']['consensus_detected']
print(f"Final decision: {'ANOMALY' if decision else 'NORMAL'}")

Final decision: NORMAL


In [None]:
# Build the per-feature explanation table from the overridden results.
# NOTE: create_anomaly_explanation_dataframe is defined in the
# "Explainable DataFrame" cell further down; run that cell before this one.
decision_df = create_anomaly_explanation_dataframe(
    lstm_pyod_results=final_results,
    original_df=transformed_datatype_df,  # original dataframe
    target_date='2023-02-27',
    target_columns=['aimp', 'amud', 'arnd']
)

Creating explainability for 2023-02-27...
Historical data: 57 days
Target date data: 2023-02-27
Analyzing 200 features
Created explanations for 200 features


In [None]:
decision_df['consensus_detected'].head(1)

Unnamed: 0,consensus_detected
0,False


**Explainable DataFrame**

In [None]:
import pandas as pd
import numpy as np
from typing import Dict, List, Any

def create_anomaly_explanation_dataframe(
    lstm_pyod_results: Dict[str, Any],
    original_df: pd.DataFrame,
    target_date: str,
    target_columns: List[str]
) -> pd.DataFrame:
    """
    Build a per-feature explanation table for one analysed date.

    Each feature whose name starts with ``"<target column>_"`` is compared
    with its own history (rows whose 'date' is before the target date):
    z-score, percentage deviation from the historical mean, percentile rank,
    a severity label and a recommendation. The detector's scores and flags
    for the date are repeated on every row.

    Args:
        lstm_pyod_results: Results from your detector
        original_df: Original dataframe with raw values and a 'date' column
        target_date: Date being analyzed (e.g., '2023-02-27')
        target_columns: Columns analyzed (e.g., ['aimp', 'amud', 'arnd'])

    Returns:
        DataFrame with one row per explained feature, sorted by
        'feature_contribution_score' in descending order. An empty DataFrame
        is returned when the date is missing from the results or the data.
    """

    print(f"Creating explainability for {target_date}...")

    # Guard: the detector must have produced scores for this date.
    if target_date not in lstm_pyod_results.get('anomaly_scores', {}):
        print(f"No results found for {target_date}")
        return pd.DataFrame()
    detector_scores = lstm_pyod_results['anomaly_scores'][target_date]

    # Guard: the raw data must contain the date as well.
    matching_rows = original_df[original_df['date'] == target_date]
    if matching_rows.empty:
        print(f"Target date {target_date} not found in original data")
        return pd.DataFrame()

    today = matching_rows.iloc[0]
    history = original_df[original_df['date'] < target_date]

    print(f"Historical data: {len(history)} days")
    print(f"Target date data: {target_date}")

    # Every column derived from one of the requested base columns.
    candidate_features = [
        name
        for base in target_columns
        for name in original_df.columns
        if name.startswith(f"{base}_")
    ]

    print(f"Analyzing {len(candidate_features)} features")

    rows = []
    for name in candidate_features:
        if name not in original_df.columns or name not in today.index:
            continue

        observed = today[name]

        if len(history) == 0 or name not in history.columns:
            continue

        past = history[name].dropna()
        if len(past) == 0:
            continue

        mean_past = past.mean()
        std_past = past.std()
        low_past = past.min()
        high_past = past.max()

        # The 1e-8 offsets avoid division by zero for constant or zero-mean history.
        delta = observed - mean_past
        z = delta / (std_past + 1e-8)
        pct_change = (delta / (mean_past + 1e-8)) * 100
        rank_pct = (past <= observed).mean() * 100

        level = determine_severity(z, pct_change)
        contribution = abs(z) / 10
        narrative = generate_feature_explanation(
            name, observed, mean_past, z, pct_change, level
        )

        rows.append({
            'date': target_date,
            'feature_name': name,
            'base_column': extract_base_column(name),
            'metric_type': extract_metric_type(name),
            'current_value': round(observed, 6),
            'historical_mean': round(mean_past, 6),
            'historical_std': round(std_past, 6),
            'historical_min': round(low_past, 6),
            'historical_max': round(high_past, 6),
            'z_score': round(z, 3),
            'deviation_percentage': round(pct_change, 2),
            'percentile_rank': round(rank_pct, 1),
            'severity': level,
            'feature_contribution_score': round(contribution, 3),
            'lstm_score': detector_scores['lstm_score'],
            'pyod_score': detector_scores['pyod_score'],
            'consensus_score': detector_scores['consensus_score'],
            'lstm_detected': detector_scores['lstm_detected'],
            'pyod_detected': detector_scores['pyod_detected'],
            'consensus_detected': detector_scores['consensus_detected'],
            'explanation': narrative,
            'recommendation': generate_recommendation(level, z)
        })

    explanation_df = pd.DataFrame(rows)

    # Largest contributors first.
    if not explanation_df.empty:
        explanation_df = (
            explanation_df
            .sort_values('feature_contribution_score', ascending=False)
            .reset_index(drop=True)
        )

    print(f"Created explanations for {len(explanation_df)} features")
    return explanation_df

def determine_severity(z_score: float, deviation_pct: float) -> str:
    """Map a z-score and a percentage deviation onto a severity label.

    The magnitudes (absolute values) are checked from the strictest tier
    down; a tier matches when either magnitude reaches its floor. Anything
    below the 'MEDIUM' floors is 'LOW'.
    """
    z_magnitude = abs(z_score)
    dev_magnitude = abs(deviation_pct)

    # (label, z-score floor, deviation-percentage floor), strictest first.
    tiers = (
        ('CRITICAL', 5, 1000),
        ('HIGH', 3, 500),
        ('MEDIUM', 2, 100),
    )
    for label, z_floor, dev_floor in tiers:
        if z_magnitude >= z_floor or dev_magnitude >= dev_floor:
            return label
    return 'LOW'

def extract_base_column(feature_name: str) -> str:
    """Return the text before the first underscore (e.g., 'aimp' from 'aimp_mean').

    A name without an underscore is returned unchanged.
    """
    base, _, _ = feature_name.partition('_')
    return base

def extract_metric_type(feature_name: str) -> str:
    """Return the text after the last underscore (e.g., 'mean' from 'aimp_mean').

    A name without an underscore yields 'base_value'.
    """
    _, separator, suffix = feature_name.rpartition('_')
    if not separator:
        return 'base_value'
    return suffix

def generate_feature_explanation(
    feature: str,
    current_value: float,
    hist_mean: float,
    z_score: float,
    deviation_pct: float,
    severity: str
) -> str:
    """Compose a one-paragraph, human-readable explanation for a feature.

    The text states the direction of the change ("increased" only when the
    current value is strictly above the historical mean, "decreased"
    otherwise), a magnitude word derived from the absolute z-score, the
    current and historical values, the absolute percentage change and the
    severity label.
    """
    if current_value > hist_mean:
        direction = "increased"
    else:
        direction = "decreased"
    magnitude = get_magnitude_word(abs(z_score))

    sentences = [
        f"Feature '{feature}' has {magnitude} {direction} from its historical average.",
        f"Current value: {current_value:.4f}, Historical average: {hist_mean:.4f}.",
        f"This represents a {abs(deviation_pct):.1f}% change with z-score of {z_score:.2f}.",
        f"Severity: {severity}",
    ]
    return " ".join(sentences)

def get_magnitude_word(abs_z_score: float) -> str:
    """Translate an absolute z-score into an adverb describing its size."""
    # (floor, adverb), largest floor first; the first floor reached wins.
    scale = (
        (5, "extremely"),
        (3, "significantly"),
        (2, "moderately"),
    )
    for floor, adverb in scale:
        if abs_z_score >= floor:
            return adverb
    return "slightly"

def generate_recommendation(severity: str, z_score: float) -> str:
    """Return the follow-up action text for a severity label.

    `z_score` is accepted for interface compatibility but is not used.
    Any label other than 'CRITICAL', 'HIGH' or 'MEDIUM' gets the
    regular-tracking text.
    """
    actions = (
        ('CRITICAL', "IMMEDIATE ACTION: Investigate data sources and business processes immediately"),
        ('HIGH', "URGENT: Review within 24 hours to identify root cause"),
        ('MEDIUM', "MONITOR: Schedule review within 48-72 hours"),
    )
    for label, advice in actions:
        if severity == label:
            return advice
    return "TRACK: Continue monitoring in regular review cycle"

def create_summary_report(lstm_pyod_results: Dict[str, Any], target_date: str) -> Dict[str, Any]:
    """Assemble an executive summary for one analysed date.

    Returns an empty dict when the results hold no scores for the date.
    Otherwise the summary carries the overall status, the stored severity
    ('LOW' when none is recorded for the date), a confidence level based on
    how many algorithms flagged the date, the rounded scores, the detection
    flags, the thresholds used and a recommendation.
    """
    scores_by_date = lstm_pyod_results.get('anomaly_scores', {})
    if target_date not in scores_by_date:
        return {}

    day = scores_by_date[target_date]
    severity = lstm_pyod_results['anomaly_severity'].get(target_date, 'LOW')

    both_agree = day['lstm_detected'] and day['pyod_detected']
    any_detected = day['lstm_detected'] or day['pyod_detected']

    if day['consensus_detected']:
        status = 'ANOMALY'
    else:
        status = 'NORMAL'

    # Confidence reflects agreement between the two algorithms.
    if both_agree:
        confidence = 'HIGH'
    elif any_detected:
        confidence = 'MEDIUM'
    else:
        confidence = 'LOW'

    return {
        'detection_date': target_date,
        'overall_status': status,
        'severity_level': severity,
        'confidence_level': confidence,
        'lstm_score': round(day['lstm_score'], 3),
        'pyod_score': round(day['pyod_score'], 3),
        'consensus_score': round(day['consensus_score'], 3),
        'lstm_detected': day['lstm_detected'],
        'pyod_detected': day['pyod_detected'],
        'consensus_detected': day['consensus_detected'],
        'algorithm_agreement': both_agree,
        'thresholds_used': lstm_pyod_results['summary_stats']['thresholds_used'],
        'recommendation': generate_recommendation(severity, 0)
    }

def log_detection_scores(lstm_pyod_results: Dict[str, Any], target_date: str):
    """Print the scores, thresholds and final decision for one date.

    Args:
        lstm_pyod_results: Detection results. The keys read here are
            'anomaly_scores', 'anomaly_severity' and
            'summary_stats' -> 'thresholds_used'.
        target_date: Key of the date to report inside 'anomaly_scores'.

    Returns:
        None. When 'anomaly_scores' has no entry for ``target_date`` a single
        "No scores found" line is printed instead of the report.
    """
    scores_by_date = lstm_pyod_results.get('anomaly_scores', {})
    if target_date not in scores_by_date:
        print(f"No scores found for {target_date}")
        return

    day_scores = scores_by_date[target_date]
    limits = lstm_pyod_results['summary_stats']['thresholds_used']
    rule = "=" * 50

    print(f"\nDETECTION SCORE LOG for {target_date}")
    print(rule)

    # The two individual detectors share the same three-line layout.
    for heading, prefix in ((" LSTM", "lstm"), ("PyOD", "pyod")):
        print(f"{heading} Score: {day_scores[prefix + '_score']:.4f}")
        print(f"   Threshold: {limits[prefix + '_threshold']}")
        print(f"   Detected: {day_scores[prefix + '_detected']}")

    # The consensus section reports the final decision instead of a raw flag.
    print(f" Consensus Score: {day_scores['consensus_score']:.4f}")
    print(f"   Threshold: {limits['consensus_threshold']}")
    verdict = 'ANOMALY' if day_scores['consensus_detected'] else 'NORMAL'
    print(f"   Final Decision: {verdict}")

    # Dates without a recorded severity are reported as 'LOW'.
    severity_label = lstm_pyod_results['anomaly_severity'].get(target_date, 'LOW')
    print(f" Severity: {severity_label}")
    print(rule)

**Create the explainability DataFrame**

In [None]:
# Build the per-feature explanation table for the detection results of 2023-02-27.
# NOTE(review): target_columns presumably restricts the analysis to features
# derived from these base columns — confirm against
# create_anomaly_explanation_dataframe.
explanation_df = create_anomaly_explanation_dataframe(
    lstm_pyod_results=lstm_pyod_results,
    original_df=transformed_datatype_df,  # original dataframe
    target_date='2023-02-27',
    target_columns=['aimp', 'amud', 'arnd']
)

Creating explainability for 2023-02-27...
Historical data: 57 days
Target date data: 2023-02-27
Analyzing 200 features
Created explanations for 200 features


In [None]:
# Show which fields each explanation row carries.
explanation_df.columns

Index(['date', 'feature_name', 'base_column', 'metric_type', 'current_value',
       'historical_mean', 'historical_std', 'historical_min', 'historical_max',
       'z_score', 'deviation_percentage', 'percentile_rank', 'severity',
       'feature_contribution_score', 'lstm_score', 'pyod_score',
       'consensus_score', 'lstm_detected', 'pyod_detected',
       'consensus_detected', 'explanation', 'recommendation'],
      dtype='object')

**Summary Report**

In [None]:
# Condense the detection outcome for 2023-02-27 into a flat summary dict.
summary = create_summary_report(
    lstm_pyod_results=lstm_pyod_results,
    target_date='2023-02-27'
)

# Print one "key: value" line per summary field. `summary` is an empty dict
# when the date has no scores, in which case only the header line is printed.
print("📋 SUMMARY REPORT:")
for key, value in summary.items():
    print(f"   {key}: {value}")

📋 SUMMARY REPORT:
   detection_date: 2023-02-27
   overall_status: NORMAL
   severity_level: HIGH
   confidence_level: MEDIUM
   lstm_score: 0.93
   pyod_score: 1.0
   consensus_score: 0.958
   lstm_detected: False
   pyod_detected: True
   consensus_detected: False
   algorithm_agreement: False
   thresholds_used: {'lstm_threshold': 1.0, 'pyod_threshold': 0.7, 'consensus_threshold': 0.75}
   recommendation: URGENT: Review within 24 hours to identify root cause


**Log Detection Scores**

In [None]:
# Print the per-detector scores, thresholds and the final decision for 2023-02-27.
log_detection_scores(
    lstm_pyod_results=lstm_pyod_results,
    target_date='2023-02-27'
)


DETECTION SCORE LOG for 2023-02-27
 LSTM Score: 0.9300
   Threshold: 1.0
   Detected: False
PyOD Score: 1.0000
   Threshold: 0.7
   Detected: True
 Consensus Score: 0.9580
   Threshold: 0.75
   Final Decision: NORMAL
 Severity: HIGH


**Critical contributing features**

In [None]:
# Share only HIGH/CRITICAL severity features
# (rows with any other 'severity' value are left out of the export).
critical_features = explanation_df[explanation_df['severity'].isin(['HIGH', 'CRITICAL'])]
# index=False keeps the DataFrame index out of the CSV file.
critical_features.to_csv('/content/sample_data/critical_features.csv', index=False)

In [None]:
# Preview the first five HIGH/CRITICAL rows.
critical_features.head(5)

Unnamed: 0,date,feature_name,base_column,metric_type,current_value,historical_mean,historical_std,historical_min,historical_max,z_score,...,severity,feature_contribution_score,lstm_score,pyod_score,consensus_score,lstm_detected,pyod_detected,consensus_detected,explanation,recommendation
0,2023-02-27,aimp_UniqueValueRatio_anomaly_30d,aimp,30d,1.0,0.0,0.0,0.0,0.0,100000000.0,...,CRITICAL,10000000.0,0.929998,1.0,0.957999,False,True,False,Feature 'aimp_UniqueValueRatio_anomaly_30d' ha...,IMMEDIATE ACTION: Investigate data sources and...
1,2023-02-27,arnd_CountDistinct_rolling_30d_std,arnd,std,1997.328515,119.766662,123.929667,0.0,269.597829,15.15,...,CRITICAL,1.515,0.929998,1.0,0.957999,False,True,False,Feature 'arnd_CountDistinct_rolling_30d_std' h...,IMMEDIATE ACTION: Investigate data sources and...
2,2023-02-27,arnd_CountDistinct_zscore_30d,arnd,30d,-5.267085,0.011426,0.478672,-1.231919,1.753079,-11.027,...,CRITICAL,1.103,0.929998,1.0,0.957999,False,True,False,Feature 'arnd_CountDistinct_zscore_30d' has ex...,IMMEDIATE ACTION: Investigate data sources and...
3,2023-02-27,aimp_Sum_zscore_30d,aimp,30d,-3.878286,-0.074981,0.635234,-2.843448,1.210494,-5.987,...,CRITICAL,0.599,0.929998,1.0,0.957999,False,True,False,Feature 'aimp_Sum_zscore_30d' has extremely de...,IMMEDIATE ACTION: Investigate data sources and...
4,2023-02-27,aimp_Sum_diff_7d,aimp,7d,-126.714286,0.52381,24.031781,-60.142857,57.857143,-5.295,...,CRITICAL,0.529,0.929998,1.0,0.957999,False,True,False,Feature 'aimp_Sum_diff_7d' has extremely decre...,IMMEDIATE ACTION: Investigate data sources and...


**controller**

# App team just chooses sensitivity level
updated_results = controller.apply_sensitivity_control(
    lstm_pyod_results=lstm_pyod_results,
    sensitivity_level='low',  # 'very_high', 'high', 'medium', 'low', 'very_low'
    target_date='2023-02-27'
)

**Quick reference for users**


WHEN TO USE EACH OPTION:

Option 1 - Sensitivity Levels:
- Best for: Non-technical users who want simple control
- Parameters: 'very_high', 'high', 'medium', 'low', 'very_low'
- Use case: "I want fewer false alarms" → use 'low' or 'very_low'

Option 2 - Percentage Tolerance:
- Best for: Users who want precise control
- Parameters: 0-100% (higher = less sensitive)
- Use case: "I want an 80% confidence threshold" → use tolerance_percent=80

EXAMPLES:
1. For reducing false positives: sensitivity_level='low' or tolerance_percent=85
2. For catching more anomalies: sensitivity_level='high' or tolerance_percent=30
3. For balanced detection: sensitivity_level='medium' or tolerance_percent=50


**For string-based sensitivity levels:**

In [None]:
# Option 1: choose a named sensitivity preset. Per the quick reference above,
# 'low' / 'very_low' are the presets to pick when fewer alarms are wanted.
controller = SimpleAnomalyController()
updated_results = controller.apply_sensitivity_control(
    lstm_pyod_results=lstm_pyod_results,
    sensitivity_level='very_low',  # 'very_high', 'high', 'medium', 'low', 'very_low'
    target_date='2023-02-27'
)

**For percentage-based tolerance:**

In [None]:
# Option 2: express the tolerance as a percentage instead of a named preset.
# This rebinds both `controller` and `updated_results` from the previous cell.
controller = PercentageAnomalyController()  # Different controller
updated_results = controller.apply_tolerance_percentage(
    lstm_pyod_results=lstm_pyod_results,
    tolerance_percent=99,  # 0-100%, higher = less sensitive
    target_date='2023-02-27'
)