In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

In [None]:
# Put the parent of the current working directory on sys.path (only once) so
# that the project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.utils.data_utils import prepare_dataset, extract_training_split_from_filename, extract_anomaly_range_from_filename, create_file_mapping

def visualize_dataset_eda(file_path, min_zoom_padding=10):
    """
    Visualizes the time series dataset with highlighted training and anomaly regions for EDA.

    Two figures are shown:
      1. the full series, with the training span (0 to the training split) and
         the ground-truth anomaly span shaded;
      2. a zoomed-in view around the anomaly span, with markers on every point.

    Args:
        file_path (str): The path to the dataset file. Its base name is passed
            to the filename helpers to obtain the training split and the
            anomaly range.
        min_zoom_padding (int): Minimum number of context points kept on each
            side of the anomaly in the zoomed-in figure. The padding is
            normally twice the anomaly width; this floor keeps the zoomed
            view non-empty and readable when the anomaly is zero-width or
            only a few points wide.
    """
    dataset_name = os.path.basename(file_path)
    data = prepare_dataset(file_path)
    training_split = extract_training_split_from_filename(dataset_name)
    anomaly_start, anomaly_end = extract_anomaly_range_from_filename(dataset_name)

    # --- Figure 1: full series with training and anomaly regions shaded ---
    fig, ax = plt.subplots(figsize=(18, 7))
    ax.plot(data["Value"], label="Time Series Data", color="navy")

    ax.axvspan(0, training_split, color="lightgreen", alpha=0.4, label=f"Training Data (0 to {training_split})")
    ax.axvspan(anomaly_start, anomaly_end, color="salmon", alpha=0.6, label=f"Anomaly ({anomaly_start} to {anomaly_end})")

    ax.set_xlabel("Time Index")
    ax.set_ylabel("Value")
    ax.set_title(f"Time Series Visualization: {dataset_name}")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.show()

    # --- Figure 2: zoom on the anomaly ---
    # Pad by twice the anomaly width on each side, but never by less than
    # min_zoom_padding: without the floor a zero-width anomaly yields an empty
    # slice (and therefore an empty plot).
    zoom_padding = max((anomaly_end - anomaly_start) * 2, min_zoom_padding)
    # Clamp the window to the positional bounds of the data.
    zoom_start_padded = max(0, int(anomaly_start - zoom_padding))
    zoom_end_padded = min(len(data), int(anomaly_end + zoom_padding))

    fig, ax = plt.subplots(figsize=(14, 6))
    # The x values are positions (matching the iloc slice), not index labels.
    ax.plot(range(zoom_start_padded, zoom_end_padded), data["Value"].iloc[zoom_start_padded:zoom_end_padded], label="Time Series Data", color="navy", marker='o', markersize=3, linestyle='-')
    ax.axvspan(anomaly_start, anomaly_end, color="salmon", alpha=0.9, label="Ground Truth Anomaly")

    ax.set_xlabel("Time Index")
    ax.set_ylabel("Value")
    ax.set_title(f"Zoomed-in Anomaly Region: {dataset_name}")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.show()


# Build the key -> file path mapping for the data folder and, if anything was
# found, preview the dataset whose key sorts first.
data_folder = 'data'
file_mapping = create_file_mapping(data_folder)

if not file_mapping:
    print(f"No .txt files found in {data_folder}. Please check the path and ensure data files are present.")
else:
    first_dataset_key, *_ = sorted(file_mapping.keys())
    print(f"Visualizing dataset for key: {first_dataset_key} - {file_mapping[first_dataset_key]}")
    visualize_dataset_eda(file_mapping[first_dataset_key])


In [None]:
# Run the EDA plots for the first ten datasets, in sorted-key order.
keys_to_plot = sorted(file_mapping.keys())[:10]
for key in keys_to_plot:
    print(f"Visualizing dataset for key: {key}")
    visualize_dataset_eda(file_mapping[key])


In [None]:
def visualize_detector_output(data, anomaly_scores, detected_anomaly_indices, anomaly_range, detector_name="Detector"):
    """
    Visualizes the time series with anomaly scores and detected anomaly points.

    The series is drawn on the left y-axis with the ground-truth anomaly span
    shaded and the detected points marked in red; the anomaly scores are drawn
    on a secondary (right) y-axis that shares the same x-axis.

    Args:
        data: Table with a "Value" column; its index is used as the x-axis.
        anomaly_scores: Scores plotted against ``data.index``, so one entry
            per row of ``data`` is required.
        detected_anomaly_indices: Index labels flagged by the detector. May be
            None or empty; labels that are not in ``data.index`` are ignored.
        anomaly_range: ``(start, end)`` pair shaded as the ground-truth anomaly.
        detector_name (str): Name used in the legend entries and the title.
    """
    anomaly_start, anomaly_end = anomaly_range

    fig, ax1 = plt.subplots(figsize=(18, 7))

    # Left axis: the raw series, the ground-truth span and the detected points.
    value_color = 'navy'
    ax1.set_xlabel('Time Index')
    ax1.set_ylabel('Value', color=value_color)
    ax1.plot(data.index, data["Value"], color=value_color, label='Time Series Data')
    ax1.tick_params(axis='y', labelcolor=value_color)
    ax1.axvspan(anomaly_start, anomaly_end, color="salmon", alpha=0.5, label="Ground Truth Anomaly")

    if detected_anomaly_indices is not None and len(detected_anomaly_indices) > 0:
        # Drop labels that do not exist in the data so .loc cannot raise.
        valid_indices = [idx for idx in detected_anomaly_indices if idx in data.index]
        if valid_indices:
            # Single lookup; x and y come from the same selection so they
            # always have matching lengths.
            detected_values = data["Value"].loc[valid_indices]
            ax1.plot(detected_values.index, detected_values, 'ro', markersize=6, label=f'{detector_name} Detected Anomalies')

    # Right axis: anomaly scores on their own scale.
    ax2 = ax1.twinx()
    score_color = 'darkorange'
    ax2.set_ylabel('Anomaly Score', color=score_color)
    ax2.plot(data.index, anomaly_scores, color=score_color, alpha=0.7, label=f'{detector_name} Anomaly Scores')
    ax2.tick_params(axis='y', labelcolor=score_color)

    ax1.set_title(f'{detector_name} Output Visualization')

    # One combined legend for both axes, attached to the twin axis (drawn on top).
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left')
    # Grid stays on the twin axis, so horizontal lines follow the score ticks.
    ax2.grid(True, linestyle='--', alpha=0.7)

    # Lay out last so the title, labels and legend are all taken into account.
    fig.tight_layout()
    plt.show()


In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

# Put the parent of the current working directory on sys.path (only once) so
# that the project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.models.traditional.local_outlier_factor import LOFDetector
from src.utils.data_utils import prepare_dataset, extract_training_split_from_filename, extract_anomaly_range_from_filename

# Run the LOF detector on the first few datasets (sorted-key order) and plot,
# for each one, the series together with the LOF scores and detected points.
# Relies on `file_mapping` and `visualize_detector_output` from earlier cells.
max_datasets = 5  # single source of truth for how many datasets are processed
print(f"Visualizing LOF Detector Output for the first {max_datasets} datasets...")

if 'file_mapping' not in globals() or not file_mapping:
    print("Error: file_mapping not found or is empty. Please run the cell that creates file_mapping first.")
elif 'visualize_detector_output' not in globals():
    print("Error: visualize_detector_output function not defined. Please run the cell where it's defined.")
else:
    lof_detector = LOFDetector()
    # Multiplier k in: threshold = mean(test scores) + k * std(test scores).
    threshold_value = 4

    selected_keys = sorted(file_mapping.keys())[:max_datasets]
    for i, key in enumerate(selected_keys):
        file_path = file_mapping[key]
        dataset_basename = os.path.basename(file_path)
        # Report the real total: there may be fewer than max_datasets files.
        print(f"\nProcessing dataset {i+1}/{len(selected_keys)}: {dataset_basename} (key: {key})")

        try:
            data_df = prepare_dataset(file_path)
            if data_df.empty:
                print(f"  Skipping {dataset_basename}, data is empty.")
                continue

            training_split_idx = extract_training_split_from_filename(dataset_basename)
            anomaly_range = extract_anomaly_range_from_filename(dataset_basename)

            # Positional split: rows before the split index train the detector,
            # the remaining rows are scored. Both are reshaped to one column.
            train_values = data_df['Value'].iloc[:training_split_idx].values.reshape(-1, 1)
            test_values = data_df['Value'].iloc[training_split_idx:].values.reshape(-1, 1)

            # Scores aligned with the full series; NaN wherever nothing was scored.
            padded_scores = np.full(len(data_df), np.nan)
            all_detected_anomaly_abs_indices = []

            if test_values.shape[0] == 0:
                print(f"  Skipping {dataset_basename}, no test data after split.")
            else:
                # Use the detector's n_neighbors as the minimum training size,
                # falling back to 20 if the attribute is not exposed.
                min_train_samples = getattr(lof_detector, 'n_neighbors', 20)
                if train_values.shape[0] < min_train_samples:
                    print(f"  Warning: Not enough training samples ({train_values.shape[0]}) for LOF (needs at least {min_train_samples}). Scores will not be generated.")
                else:
                    print(f"  Fitting LOF on {train_values.shape[0]} training samples...")
                    lof_detector.fit(train_values)

                    print(f"  Scoring with LOF on {test_values.shape[0]} test samples...")
                    anomaly_scores_test = lof_detector.score(test_values)

                    if not isinstance(anomaly_scores_test, np.ndarray) or anomaly_scores_test.shape[0] != test_values.shape[0]:
                        print(f"  Warning: Anomaly scores shape mismatch for {dataset_basename}. Expected {test_values.shape[0]}, got {anomaly_scores_test.shape if isinstance(anomaly_scores_test, np.ndarray) else 'N/A'}.")
                    elif anomaly_scores_test.size == 0:
                         print(f"  Warning: LOF returned empty scores for {dataset_basename}.")
                    else:
                        anomaly_scores_test_flat = anomaly_scores_test.flatten()
                        # Place the test scores at their absolute positions in the series.
                        padded_scores[training_split_idx : training_split_idx + len(anomaly_scores_test_flat)] = anomaly_scores_test_flat

                        # The threshold is derived from the test scores themselves.
                        mean_score = np.mean(anomaly_scores_test_flat)
                        std_score = np.std(anomaly_scores_test_flat)
                        threshold = mean_score + threshold_value * std_score
                        binary_predictions_test = (anomaly_scores_test_flat > threshold).astype(int)

                        if binary_predictions_test.size > 0:
                            detected_indices_in_test = np.where(binary_predictions_test == 1)[0]
                            if detected_indices_in_test.size > 0:
                                # Shift test-relative positions to positions in the full series.
                                all_detected_anomaly_abs_indices = [training_split_idx + idx for idx in detected_indices_in_test]
                                print(f"  {len(all_detected_anomaly_abs_indices)} anomalies detected by LOF at indices (relative to full data): {all_detected_anomaly_abs_indices}")
                            else:
                                print("  No anomaly detected by LOF with current threshold.")
                        else:
                            print("  No binary predictions generated from scores.")

            # Always plot what we have, even when no scores were produced
            # (the score curve is then all-NaN and nothing is marked).
            visualize_detector_output(
                data=data_df,
                anomaly_scores=padded_scores,
                detected_anomaly_indices=all_detected_anomaly_abs_indices,
                anomaly_range=anomaly_range,
                detector_name="LOF"
            )
        except Exception as e:
            # Best effort: report the failure with a traceback and move on to
            # the next dataset instead of aborting the whole run.
            print(f"  Error processing dataset {dataset_basename}: {e}")
            import traceback
            traceback.print_exc()
