In [None]:
import numpy as np
import pandas as pd
from scipy import stats


# Sliding-window geometry: a full window spans `window_size` rows; half of
# that is kept separately for use as the sliding step.
window_size = 1000
half_window = window_size // 2

# Per-row outputs, one slot per row of df: the anomaly label (0/1) and the
# tail-probability. Every slot starts as NaN.
anomaly_labels = np.full(len(df), np.nan)
tail_probs = np.full_like(anomaly_labels, np.nan)

def process_window(data_window: pd.Series, idx_range: range):
    """
    Fit a Generalized Pareto distribution to the upper 5% of `data_window` and
    store a tail-probability and a 0/1 anomaly label for its points in the
    module-level arrays `tail_probs` and `anomaly_labels`.

    The values strictly above the window's 95th percentile are used for the
    fit; the fitted survival function is then evaluated at every point of the
    window. A point gets label=1 when its tail-probability is < 0.5, else 0.

    Parameters:
        data_window: values of the window (a pandas Series, or any 1-D
            array-like of numbers).
        idx_range: positions in the global arrays that receive the results.
            With the same length as `data_window`, the two are paired
            one-to-one. When it is shorter, it is paired with the *trailing*
            points of the window, so passing the indices of a window's second
            half stores the results computed for that second half. Positions
            outside the global arrays are skipped.

    Returns:
        None. Nothing is written when the window is empty or when no value
        lies strictly above the 95th percentile.
    """
    values = np.asarray(data_window, dtype=float)
    if values.size == 0:
        return

    # Compute the 95th percentile threshold
    threshold = np.percentile(values, 95)

    # Extract peaks above threshold for GPD fitting
    peaks = values[values > threshold]
    if peaks.size == 0:
        # If no peaks above threshold, skip assignment
        return

    # Fit GPD to the peaks (shape, location and scale are all estimated).
    # NOTE(review): peaks-over-threshold fits often pin the location at the
    # threshold (floc=threshold) -- confirm that a free location is intended.
    shape, loc, scale = stats.genpareto.fit(peaks)

    # Tail-probability of every point in the window. sf(x) is 1 - cdf(x),
    # computed directly so small tail values do not suffer from cancellation.
    probs = stats.genpareto.sf(values, c=shape, loc=loc, scale=scale)

    # Label as anomaly if tail-prob < 0.5
    labels = (probs < 0.5).astype(int)

    # Pair the target indices with the matching points of the window. If fewer
    # indices than points were given, skip the leading points so that the
    # indices line up with the end of the window instead of its start.
    targets = np.fromiter(idx_range, dtype=np.intp)
    n_assign = min(targets.size, probs.size)
    offset = probs.size - n_assign
    targets = targets[:n_assign]

    # Ignore indices that fall outside the global arrays.
    in_bounds = (targets >= 0) & (targets < len(anomaly_labels))
    kept = targets[in_bounds]
    anomaly_labels[kept] = labels[offset:][in_bounds]
    tail_probs[kept] = probs[offset:][in_bounds]


# Beginning segment: indices [0, half_window).
# This chunk is fitted on its own and its results are written to the same rows.
start_idx = 0
end_idx   = half_window
chunk = df['loss'].iloc[start_idx:end_idx]
process_window(chunk, range(start_idx, end_idx))

# Ending segment: the last half_window rows, [len(df) - half_window, len(df)).
# The start is clamped at 0: with fewer than half_window rows the unclamped
# start is negative, which `iloc` interprets relative to the end of the frame
# while `range` does not, so the chunk and its index range would not line up.
start_idx = max(0, len(df) - half_window)
end_idx   = len(df)
chunk = df['loss'].iloc[start_idx:end_idx]
process_window(chunk, range(start_idx, end_idx))


# We slide in steps of half_window so that windows overlap by half.
# For each full window, the index range of its second half,
# [start + half_window, start + window_size), is passed as the write target.
# Nothing runs here when df has fewer than window_size rows.
for start in range(0, len(df) - window_size + 1, half_window):
    window_slice = df['loss'].iloc[start : start + window_size]
    assign_range = range(start + half_window, start + window_size)
    process_window(window_slice, assign_range)

# Collect the per-row outputs; rows no window wrote to remain NaN.
results_df = pd.DataFrame({
    'label': anomaly_labels,
    'prob':  tail_probs
})

# NOTE: the output location is hard-coded.
results_df.to_csv('/home/anomaly.csv', index=False)
