In [1]:
import numpy as np

def detect_outliers_mad(data, threshold=3.5):
    """
    Identify outliers using the Median Absolute Deviation (MAD) method.

    Every point receives a modified Z-score,
    ``0.6745 * |x - median| / MAD``, and is reported as an outlier when that
    score is strictly greater than ``threshold``.

    NaN entries are treated as missing values: they are left out of the
    median and MAD computations and are never reported as outliers.

    Args:
        data (array-like): A list or numpy array of numerical data
                           (e.g., transaction amounts).
        threshold (float): The modified Z-score cut-off. Points with a
                           modified Z-score > threshold are considered
                           outliers. Common values are 3.0 or 3.5.
                           Defaults to 3.5.

    Returns:
        tuple: A tuple containing:
            - outliers (np.ndarray): The data points identified as outliers.
            - median (float): The median of the non-NaN input values
              (NaN when there are none).
            - mad (float): The Median Absolute Deviation of the non-NaN
              input values (NaN when there are none).
            - modified_z_scores (np.ndarray or None): The modified Z-score of
              each input point (NaN where the input is NaN), an empty array
              for empty input, or None if MAD is zero.
    """
    # Ensure input is a numpy array for vectorised calculations.
    data = np.asarray(data)

    # Empty input: nothing to measure, nothing to flag.
    if data.size == 0:
        return np.array([]), np.nan, np.nan, np.array([])

    # np.median propagates NaN, so one missing value would turn the median,
    # the MAD and every score into NaN and silently disable detection.
    # Compute the statistics from the non-NaN entries only.
    present = ~np.isnan(data)
    valid = data[present]

    if valid.size == 0:
        # Only NaNs were supplied: no statistics and no outliers, but keep a
        # score slot per input point so the result lines up with the input.
        return valid, np.nan, np.nan, np.full(data.shape, np.nan)

    # 1. Median of the usable values.
    median = np.median(valid)

    # 2. Absolute deviation of every point from the median (NaN stays NaN).
    abs_deviations = np.abs(data - median)

    # 3. Median Absolute Deviation over the usable values.
    mad = np.median(abs_deviations[present])

    # 4. MAD is zero when at least half of the values equal the median; the
    #    modified Z-score would then divide by zero.
    if mad == 0:
        print("Warning: MAD is zero. Cannot calculate modified Z-scores.")
        # Fall back to flagging every usable value that differs from the
        # median; NaN entries are excluded because they are missing, not
        # extreme.
        return valid[valid != median], median, mad, None

    # 5. Modified Z-scores. The constant 0.6745 scales MAD to be comparable
    #    to the standard deviation for normally distributed data.
    modified_z_scores = 0.6745 * abs_deviations / mad

    # 6. Scores are non-negative by construction, so no abs() is needed.
    #    Compare only the usable entries so NaN never takes part in the test.
    outliers = valid[modified_z_scores[present] > threshold]

    return outliers, median, mad, modified_z_scores

# --- Example Usage ---

# Demo data: dollar amounts of financial transactions. Most values sit in a
# narrow band; three entries are deliberately extreme.
transaction_amounts = np.array([
    55.20, 68.15, 75.00, 88.90, 105.50, 62.30, 79.80, 95.10,
    110.00, 125.60,
    4500.00,   # suspiciously large
    72.50, 85.40, 99.99, 108.20,
    5.50,      # suspiciously small
    91.75, 115.30, 66.80, 81.00, 103.40,
    5200.50,   # suspiciously large
    78.60, 93.25,
])

separator = "-" * 40

print("Original Transaction Amounts:", transaction_amounts, separator, sep="\n")

# The threshold applies to the modified Z-score, 0.6745 * |x - median| / MAD,
# not to a raw count of MADs: a cut-off of 3.5 corresponds to roughly 5.2
# median absolute deviations from the median.
mad_threshold = 3.5
print(f"Using MAD threshold: {mad_threshold}", separator, sep="\n")

# Run the detector on the demo data.
identified_outliers, data_median, data_mad, mod_z_scores = detect_outliers_mad(
    transaction_amounts, threshold=mad_threshold
)

# Summary statistics.
print(f"Data Median: ${data_median:.2f}")
print(f"Data MAD: ${data_mad:.2f}")

# Scores are only available when MAD was non-zero. List just the points whose
# score exceeds 1.0 to keep the report short.
if mod_z_scores is not None:
    print("\nModified Z-scores (absolute values for context):")
    abs_scores = np.abs(mod_z_scores)
    (relevant_indices,) = np.where(abs_scores > 1.0)
    if len(relevant_indices) == 0:
        print("  (No scores significantly deviated from the median)")
    for i in relevant_indices:
        print(f"  Transaction ${transaction_amounts[i]:.2f}: Abs Mod Z-score = {abs_scores[i]:.2f}")

print(separator)
if identified_outliers.size == 0:
    print(f"No outliers identified with the threshold of {mad_threshold}.")
else:
    print(f"Identified Outliers (abs(Mod Z-score) > {mad_threshold}):")
    print(f"$ {identified_outliers}")

print("\n--- Example with Zero MAD ---")
# Five identical values: every deviation from the median is 0, so MAD is 0.
identical_data = np.array([200.0] * 5)
print("Data:", identical_data)
outliers_zero, median_zero, mad_zero, _ = detect_outliers_mad(
    identical_data, threshold=mad_threshold
)
print(f"Data Median: ${median_zero:.2f}")
print(f"Data MAD: ${mad_zero:.2f}")
if outliers_zero.size == 0:
    print("No outliers identified (as expected when MAD is zero).")
else:
    print("Identified Outliers (non-median values when MAD is zero):")
    print(f"$ {outliers_zero}")

Original Transaction Amounts:
[  55.2    68.15   75.     88.9   105.5    62.3    79.8    95.1   110.
  125.6  4500.     72.5    85.4    99.99  108.2     5.5    91.75  115.3
   66.8    81.    103.4  5200.5    78.6    93.25]
----------------------------------------
Using MAD threshold: 3.5
----------------------------------------
Data Median: $90.33
Data MAD: $16.58

Modified Z-scores (absolute values for context):
  Transaction $55.20: Abs Mod Z-score = 1.43
  Transaction $62.30: Abs Mod Z-score = 1.14
  Transaction $125.60: Abs Mod Z-score = 1.44
  Transaction $4500.00: Abs Mod Z-score = 179.45
  Transaction $5.50: Abs Mod Z-score = 3.45
  Transaction $115.30: Abs Mod Z-score = 1.02
  Transaction $5200.50: Abs Mod Z-score = 207.95
----------------------------------------
Identified Outliers (abs(Mod Z-score) > 3.5):
$ [4500.  5200.5]

--- Example with Zero MAD ---
Data: [200. 200. 200. 200. 200.]
Warning: MAD is zero. Cannot calculate modified Z-scores.
Data Median: $200.00
Data MAD: $0.00
No outliers identified (as expected when MAD is zero).

In [10]:
def detect_outliers_mad(data, threshold=3.5):
    """
    Identify outliers using the Median Absolute Deviation (MAD) method.

    Every point receives a modified Z-score,
    ``0.6745 * |x - median| / MAD``, and is reported as an outlier when that
    score is strictly greater than ``threshold``.

    NaN entries are treated as missing values: they are left out of the
    median and MAD computations and are never reported as outliers.

    Args:
        data (array-like): A list or numpy array of numerical data
                           (e.g., transaction amounts).
        threshold (float): The modified Z-score cut-off. Points with a
                           modified Z-score > threshold are considered
                           outliers. Common values are 3.0 or 3.5.
                           Defaults to 3.5.

    Returns:
        tuple: A tuple containing:
            - outliers (np.ndarray): The data points identified as outliers.
            - median (float): The median of the non-NaN input values
              (NaN when there are none).
            - mad (float): The Median Absolute Deviation of the non-NaN
              input values (NaN when there are none).
            - modified_z_scores (np.ndarray or None): The modified Z-score of
              each input point (NaN where the input is NaN), an empty array
              for empty input, or None if MAD is zero.
    """
    # Ensure input is a numpy array for vectorised calculations.
    data = np.asarray(data)

    # Empty input: nothing to measure, nothing to flag.
    if data.size == 0:
        return np.array([]), np.nan, np.nan, np.array([])

    # np.median propagates NaN, so one missing value would turn the median,
    # the MAD and every score into NaN and silently disable detection.
    # Compute the statistics from the non-NaN entries only.
    present = ~np.isnan(data)
    valid = data[present]

    if valid.size == 0:
        # Only NaNs were supplied: no statistics and no outliers, but keep a
        # score slot per input point so the result lines up with the input.
        return valid, np.nan, np.nan, np.full(data.shape, np.nan)

    # 1. Median of the usable values.
    median = np.median(valid)

    # 2. Absolute deviation of every point from the median (NaN stays NaN).
    abs_deviations = np.abs(data - median)

    # 3. Median Absolute Deviation over the usable values.
    mad = np.median(abs_deviations[present])

    # 4. MAD is zero when at least half of the values equal the median; the
    #    modified Z-score would then divide by zero.
    if mad == 0:
        print("Warning: MAD is zero. Cannot calculate modified Z-scores.")
        # Fall back to flagging every usable value that differs from the
        # median; NaN entries are excluded because they are missing, not
        # extreme.
        return valid[valid != median], median, mad, None

    # 5. Modified Z-scores. The constant 0.6745 scales MAD to be comparable
    #    to the standard deviation for normally distributed data.
    modified_z_scores = 0.6745 * abs_deviations / mad

    # 6. Scores are non-negative by construction, so no abs() is needed.
    #    Compare only the usable entries so NaN never takes part in the test.
    outliers = valid[modified_z_scores[present] > threshold]

    return outliers, median, mad, modified_z_scores

In [12]:

# Demo data: dollar amounts of financial transactions. Most values sit in a
# narrow band; three entries are deliberately extreme.
transaction_amounts = np.array([
    55.20, 68.15, 75.00, 88.90, 105.50, 62.30, 79.80, 95.10,
    110.00, 125.60,
    4500.00,   # suspiciously large
    72.50, 85.40, 99.99, 108.20,
    5.50,      # suspiciously small
    91.75, 115.30, 66.80, 81.00, 103.40,
    5200.50,   # suspiciously large
    78.60, 93.25,
])

separator = "-" * 40

print("Original Transaction Amounts:", transaction_amounts, separator, sep="\n")

# The threshold applies to the modified Z-score, 0.6745 * |x - median| / MAD,
# not to a raw count of MADs: a cut-off of 3.5 corresponds to roughly 5.2
# median absolute deviations from the median.
mad_threshold = 3.5
print(f"Using MAD threshold: {mad_threshold}", separator, sep="\n")

# Run the detector; the modified Z-scores are returned but not displayed here.
identified_outliers, data_median, data_mad, mod_z_scores = detect_outliers_mad(
    transaction_amounts, threshold=mad_threshold
)

# Report the robust centre and spread, then the flagged values.
print(
    f"Data Median: ${data_median:.2f}",
    f"Data MAD: ${data_mad:.2f}",
    identified_outliers,
    sep="\n",
)

Original Transaction Amounts:
[  55.2    68.15   75.     88.9   105.5    62.3    79.8    95.1   110.
  125.6  4500.     72.5    85.4    99.99  108.2     5.5    91.75  115.3
   66.8    81.    103.4  5200.5    78.6    93.25]
----------------------------------------
Using MAD threshold: 3.5
----------------------------------------
Data Median: $90.33
Data MAD: $16.58
[4500.  5200.5]
