## Name: Anomaly Detection using PCA on the features
### Date: 18/7/2024
### Status: In Progress.
### Idea: 
Fit PCA on X, map each sample back to the original feature space, and flag as anomalies the top-K samples with the largest distance from their reconstruction?


### Results:


In [93]:
import numpy as np
import pandas as pd
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which is not safe
# on maliciously constructed files -- only load archives from a trusted source.
# Alternative dataset: './5_campaign(1).npz'
with np.load('./2_annthyroid.npz', allow_pickle=True) as data:
    # Pull both arrays out while the archive is open; leaving the `with` block
    # closes the file handle that the .npz archive would otherwise keep open.
    X, y = data['X'], data['y']
# Last expression of the cell: feature-matrix shape and class counts.
X.shape, pd.Series(y).value_counts()

((7200, 6),
 0    6666
 1     534
 Name: count, dtype: int64)

In [95]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [96]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report,f1_score,average_precision_score

# Baseline: Isolation Forest fitted on the training features only (no labels).
clf = IsolationForest(random_state=42)
clf.fit(X_train)

# Remap the forest's labels to the dataset convention: -1 -> 1, 1 -> 0.
y_pred = (clf.predict(X_test) == -1).astype(int)
# Sign-flipped decision_function, used as the ranking score for average precision.
y_pred_proba = -clf.decision_function(X_test)

print(classification_report(y_test, y_pred))
print(f"F1 pos: {f1_score(y_test, y_pred, average='binary'):.4f}")
print(f"AP: {average_precision_score(y_test, y_pred_proba):.4f}")


              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1333
           1       0.36      0.41      0.38       107

    accuracy                           0.90      1440
   macro avg       0.65      0.68      0.66      1440
weighted avg       0.91      0.90      0.90      1440

F1 pos: 0.3826
AP: 0.3831


In [94]:
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted
from sklearn.base import ClassifierMixin
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

class ReconstructionAnomalyDetector(ClassifierMixin):
    """
    Score samples by how poorly a matrix decomposition reconstructs them.

    The input is scaled, passed through the decomposition (``transform``
    followed by ``inverse_transform``) and the Euclidean distance between the
    scaled sample and its reconstruction is used as the anomaly score. A
    sample is labelled anomalous when its score exceeds a threshold derived
    from the training scores.

    Args:
        model (str): Decomposition to use: 'pca', 'svd' or 'nmf'. 'nmf' is
            paired with a MaxAbsScaler, the other two with a StandardScaler.
        n_components (int): Number of components kept by the decomposition.
        contamination (float or 'auto'): Expected proportion of anomalies.
            A float sets the threshold at the ``1 - contamination`` quantile
            of the training errors; 'auto' uses mean + 3 standard deviations
            of the training errors.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    def __init__(self, model='pca', n_components=2, contamination=0.05):
        if model not in ('pca', 'svd', 'nmf'):
            raise ValueError("Model must be one of 'pca', 'svd', or 'nmf'")

        self.model_name = model
        self.n_components = n_components
        self.contamination = contamination
        self.scaler = MaxAbsScaler() if model == 'nmf' else StandardScaler()
        # Stays None until fit() has computed it.
        self.threshold_ = None

        # One zero-argument factory per supported decomposition.
        factories = {
            'pca': lambda: PCA(n_components=self.n_components),
            'svd': lambda: TruncatedSVD(n_components=self.n_components),
            'nmf': lambda: NMF(n_components=self.n_components, init='random', random_state=0, max_iter=1000),
        }
        self.model = factories[self.model_name]()

    def _calculate_reconstruction_error(self, X):
        """Return the per-row L2 distance between X and its reconstruction."""
        check_is_fitted(self.model)
        latent = self.model.transform(X)
        residual = X - self.model.inverse_transform(latent)
        return np.linalg.norm(residual, axis=1)

    def fit(self, X_train, y_train  = None):
        """
        Fit the scaler and the decomposition, then derive the threshold.

        Args:
            X_train (np.ndarray): Training data (samples x features).
            y_train: Unused by this method.

        Returns:
            ReconstructionAnomalyDetector: The fitted detector (``self``).
        """
        scaled = self.scaler.fit_transform(X_train)
        self.model.fit(scaled)
        errors = self._calculate_reconstruction_error(scaled)

        if self.contamination != 'auto':
            # Quantile of the training errors given by the contamination rate.
            self.threshold_ = np.quantile(errors, 1 - self.contamination)
        else:
            # 3-sigma rule on the training errors.
            self.threshold_ = np.mean(errors) + 3 * np.std(errors)

        return self

    def predict(self, X):
        """
        Label each sample in X as normal (0) or anomalous (1).

        Args:
            X (np.ndarray): The data to predict on.

        Returns:
            np.ndarray: Integer array, 1 where the reconstruction error is
            strictly greater than the fitted threshold, 0 elsewhere.

        Raises:
            RuntimeError: If the detector has not been fitted yet.
        """
        if self.threshold_ is None:
            raise RuntimeError("The model must be fitted before prediction.")

        is_anomaly = self.decision_function(X) > self.threshold_
        return is_anomaly.astype(int)

    def decision_function(self, X):
        """Return the reconstruction error of each sample (larger = more anomalous)."""
        return self._calculate_reconstruction_error(self.scaler.transform(X))
    
# Sweep every decomposition type and component count, recording the binary F1
# on the test split for each configuration.
res = []
for t in ('pca', 'nmf', 'svd'):
    for n_comp in range(1, 7):
        clf = ReconstructionAnomalyDetector(model=t, n_components=n_comp, contamination='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='binary')
        res.append((t, n_comp, f1))

# Best configurations first.
res_df = pd.DataFrame(res, columns=['model', 'num_d', 'f1']).sort_values('f1', ascending=False)
res_df



Unnamed: 0,model,num_d,f1
9,nmf,4,0.229508
8,nmf,3,0.208
7,nmf,2,0.198473
13,svd,2,0.133333
1,pca,2,0.133333
2,pca,3,0.131148
14,svd,3,0.131148
12,svd,1,0.118644
0,pca,1,0.118644
5,pca,6,0.114754


In [90]:
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.utils.random import sample_without_replacement

class PatchBasedAnomalyDetector:
    """
    Ensemble anomaly detector built on random feature subsets ("patches").

    One ReconstructionAnomalyDetector is fitted per patch; the anomaly score
    of a sample is the mean of its reconstruction errors over all patches.

    Args:
        n_patches (int): Number of feature subsets / sub-detectors.
        patch_size (float or int): A float is treated as a fraction of the
            number of features (truncated to an int); any other value is used
            directly as the number of features per patch.
        n_components (int): Components used by each sub-detector.
        contamination (float or 'auto'): A float sets the threshold at the
            ``1 - contamination`` quantile of the training scores; 'auto'
            uses mean + 2 standard deviations of the training scores.
        model (str): Decomposition name forwarded to each sub-detector.
    """
    def __init__(self, n_patches=10, patch_size=0.5, n_components=2, contamination=0.05, model='pca'):
        self.n_patches = n_patches
        self.patch_size = patch_size
        self.n_components = n_components
        self.contamination = contamination
        self.model = model
        self.models = []
        self.feature_indices = []
        self.threshold_ = None

    def fit(self, X_train, y_train=None):
        """
        Fit one sub-detector per random feature patch and set the threshold.

        Args:
            X_train (np.ndarray): Training data (samples x features).
            y_train: Unused; accepted so the signature matches
                ReconstructionAnomalyDetector.fit.

        Returns:
            PatchBasedAnomalyDetector: The fitted detector (``self``).

        Raises:
            ValueError: If ``n_patches`` is less than 1 or the resolved patch
                size is smaller than ``n_components``.
        """
        _, n_features = X_train.shape

        if self.n_patches < 1:
            raise ValueError(f"n_patches ({self.n_patches}) must be at least 1.")

        if isinstance(self.patch_size, float):
            k = int(n_features * self.patch_size)
        else:
            k = self.patch_size

        if k < self.n_components:
            raise ValueError(f"patch_size ({k}) is smaller than n_components ({self.n_components}).")

        # Drop any state left by a previous fit; otherwise the lists would keep
        # growing and decision_function would see more models than n_patches.
        self.models = []
        self.feature_indices = []
        self.threshold_ = None

        for patch_seed in range(self.n_patches):
            # The patch index doubles as the seed so the subsets are reproducible.
            indices = sample_without_replacement(n_features, k, random_state=patch_seed)

            detector = ReconstructionAnomalyDetector(model=self.model, n_components=self.n_components, contamination=self.contamination)
            detector.fit(X_train[:, indices])

            self.feature_indices.append(indices)
            self.models.append(detector)

        train_scores = self.decision_function(X_train)

        if self.contamination == 'auto':
            # Mean plus two standard deviations of the training scores.
            self.threshold_ = np.mean(train_scores) + 2 * np.std(train_scores)
        else:
            # Set the threshold based on the contamination parameter
            self.threshold_ = np.quantile(train_scores, 1 - self.contamination)
        return self

    def decision_function(self, X):
        """
        Return the mean reconstruction error of each sample over all patches.

        Raises:
            RuntimeError: If no sub-detector has been fitted yet.
        """
        if not self.models:
            raise RuntimeError("The model must be fitted before prediction.")

        all_errors = np.zeros((X.shape[0], len(self.models)))
        for i, (model, indices) in enumerate(zip(self.models, self.feature_indices)):
            all_errors[:, i] = model.decision_function(X[:, indices])
        return np.mean(all_errors, axis=1)

    def predict(self, X):
        """
        Return 1 for samples whose score exceeds the threshold, else 0.

        Raises:
            RuntimeError: If the detector has not been fitted yet.
        """
        if self.threshold_ is None:
            raise RuntimeError("The model must be fitted before prediction.")

        scores = self.decision_function(X)
        return (scores > self.threshold_).astype(int)


class LocalPCADetector:
    """
    Cluster the data with KMeans and fit one ReconstructionAnomalyDetector per
    cluster. A sample is scored by the detector of the cluster it is assigned
    to; the decision threshold is shared by all clusters.

    Clusters with ``n_components`` samples or fewer get no detector, and
    samples assigned to such a cluster receive a score of 0.

    Args:
        n_clusters (int): Number of KMeans clusters.
        n_components (int): Components used by each per-cluster detector.
        contamination (float or 'auto'): A float sets the threshold at the
            ``1 - contamination`` quantile of the training errors; 'auto'
            uses mean + 2 standard deviations of the training errors.
        model (str): Decomposition name forwarded to each per-cluster detector.
    """
    def __init__(self, n_clusters=5, n_components=2, contamination=0.05, model='pca'):
        self.n_clusters = n_clusters
        self.n_components = n_components
        self.contamination = contamination
        self.model = model
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        self.pca_models = {}
        self.threshold_ = None

    def fit(self, X_train, y_train=None):
        """
        Cluster the training data, fit the per-cluster detectors and set the
        global threshold.

        Args:
            X_train (np.ndarray): Training data (samples x features).
            y_train: Unused; accepted so the signature matches
                ReconstructionAnomalyDetector.fit.

        Returns:
            LocalPCADetector: The fitted detector (``self``).

        Raises:
            RuntimeError: If every cluster was too small to fit a detector.
        """
        # Drop detectors from a previous fit so a cluster that is skipped this
        # time cannot keep scoring samples with a stale model.
        self.pca_models = {}
        self.threshold_ = None

        labels = self.kmeans.fit_predict(X_train)

        all_train_errors = []
        for i in range(self.n_clusters):
            cluster_data = X_train[labels == i]
            if len(cluster_data) <= self.n_components:
                # Not enough data in cluster, skip
                continue

            detector = ReconstructionAnomalyDetector(model=self.model, n_components=self.n_components, contamination=self.contamination)
            detector.fit(cluster_data)
            self.pca_models[i] = detector

            # To set a global threshold, we calculate errors for all training points
            # on their assigned cluster's model
            cluster_errors = self.decision_function(cluster_data, precomputed_labels=np.full(len(cluster_data), i))
            all_train_errors.append(cluster_errors)

        if not all_train_errors:
            raise RuntimeError("Could not fit any local PCA models. Try reducing n_clusters or providing more data.")

        all_train_errors = np.concatenate(all_train_errors)
        if self.contamination == 'auto':
            # Mean plus two standard deviations of the training errors.
            self.threshold_ = np.mean(all_train_errors) + 2 * np.std(all_train_errors)
        else:
            # Set the threshold based on the contamination parameter
            self.threshold_ = np.quantile(all_train_errors, 1 - self.contamination)
        return self

    def decision_function(self, X, precomputed_labels=None):
        """
        Score each sample with the detector of its cluster.

        Args:
            X (np.ndarray): Data to score (samples x features).
            precomputed_labels (np.ndarray, optional): Cluster label per
                sample; when omitted the labels come from ``self.kmeans``.

        Returns:
            np.ndarray: One score per sample; 0 for samples whose cluster has
            no fitted detector.
        """
        if precomputed_labels is None:
            labels = self.kmeans.predict(X)
        else:
            labels = precomputed_labels

        errors = np.zeros(X.shape[0])
        for i, detector in self.pca_models.items():
            mask = (labels == i)
            if np.any(mask):
                errors[mask] = detector.decision_function(X[mask])
        return errors

    def predict(self, X):
        """
        Return 1 for samples whose score exceeds the threshold, else 0.

        Raises:
            RuntimeError: If the detector has not been fitted yet.
        """
        if self.threshold_ is None:
            raise RuntimeError("The model must be fitted before prediction.")

        scores = self.decision_function(X)
        return (scores > self.threshold_).astype(int)

In [93]:
def _fit_and_report(title, detector):
    """Print *title*, fit *detector* on X_train and print its report on X_test."""
    print(title)
    detector.fit(X_train)
    predictions = detector.predict(X_test)
    print(classification_report(y_test, predictions, target_names=['normal', 'anomaly']))
    return predictions


# --- 3. Using Patch-Based PCA ---
detector_patch = PatchBasedAnomalyDetector(n_patches=10, patch_size=4, n_components=4, contamination='auto', model='svd')
predictions_patch = _fit_and_report("--- 3. Patch-Based PCA ---", detector_patch)

# --- 4. Using Local PCA (via Clustering) ---
detector_local = LocalPCADetector(n_clusters=10, n_components=6, contamination='auto', model='svd')
predictions_local = _fit_and_report("--- 4. Local PCA (via Clustering) ---", detector_local)

--- 3. Patch-Based PCA ---
              precision    recall  f1-score   support

      normal       0.94      0.99      0.96      1333
     anomaly       0.55      0.17      0.26       107

    accuracy                           0.93      1440
   macro avg       0.74      0.58      0.61      1440
weighted avg       0.91      0.93      0.91      1440

--- 4. Local PCA (via Clustering) ---
              precision    recall  f1-score   support

      normal       0.94      0.98      0.96      1333
     anomaly       0.44      0.22      0.30       107

    accuracy                           0.92      1440
   macro avg       0.69      0.60      0.63      1440
weighted avg       0.90      0.92      0.91      1440

