In [3]:
import pandas as pd
import os
from IPython.display import Audio, display, clear_output
import ipywidgets as widgets

# Load data
# The project root may be overridden with the SAMPLE_IDENTIFIER_ROOT environment
# variable so the notebook can run on another machine; the original location is
# kept as the default, so behaviour is unchanged when the variable is not set.
root_path = os.environ.get(
    "SAMPLE_IDENTIFIER_ROOT", "/Users/jackcdawson/Desktop/dev/sample_identifier"
)
df_anomalies = pd.read_csv(
    f"{root_path}/pipeline/2_quality_check/anomaly_scores/anomaly_scores_v1.csv"
)
df_labels = pd.read_csv(f"{root_path}/data/labels.csv")

# Merge to get full info.
# merge() defaults to an inner join, so anomaly rows whose "id" has no entry in
# labels.csv are not part of the review table.
df_review = df_anomalies.merge(df_labels, on="id")
df_review["file_path"] = df_review["id"].apply(
    lambda x: f"{root_path}/data/canonical/{x}.wav"
)

# Create review tracking (re-running this cell resets every decision)
df_review["reviewed"] = False
df_review["correct"] = None  # becomes True / False once a sample is reviewed
df_review["new_category"] = None
df_review["new_sub_category"] = None

# Category options for reclassification.
# Missing categories are dropped: groupby() below skips NaN keys, so a NaN
# entry in the dropdown could never have matching sub-categories.
categories = df_labels["category"].dropna().unique().tolist() + ["DELETE"]
# Maps each category to the array of distinct sub-categories seen under it
sub_categories_map = df_labels.groupby("category")["sub_category"].unique().to_dict()


class SampleReviewer:
    """Widget-based tool for listening to samples and recording review decisions.

    The DataFrame passed in is stored with a fresh 0..n-1 index so that
    ``current_idx`` can be used both positionally (``iloc``) and as a label
    (``loc``). Decisions are written into ``self.df`` in memory and only reach
    disk when ``save_results`` is called.

    Columns read or written: ``id``, ``category``, ``sub_category``,
    ``anomaly_score``, ``anomaly_percentile``, ``source``, ``file_path``,
    ``reviewed``, ``correct``, ``new_category``, ``new_sub_category``.

    Relies on the module-level names ``categories``, ``sub_categories_map`` and
    ``root_path``.
    """

    def __init__(self, df):
        """Store a re-indexed view of ``df`` and build the widgets."""
        self.df = df.reset_index(drop=True)
        self.current_idx = 0
        self.setup_ui()

    def setup_ui(self):
        """Create all widgets, wire their callbacks and assemble ``self.ui``."""
        # Navigation
        self.progress = widgets.Label(value=f"Sample 0 / {len(self.df)}")
        self.next_btn = widgets.Button(description="Next →")
        self.prev_btn = widgets.Button(description="← Previous")
        self.skip_btn = widgets.Button(description="Skip")

        # Sample info
        self.info_output = widgets.Output()
        self.audio_output = widgets.Output()

        # Classification
        self.correct_btn = widgets.Button(
            description="✓ Correct", button_style="success"
        )
        self.incorrect_btn = widgets.Button(
            description="✗ Reclassify", button_style="warning"
        )
        self.delete_btn = widgets.Button(description="🗑 Delete", button_style="danger")

        self.category_dropdown = widgets.Dropdown(
            options=categories, description="Category:"
        )
        self.subcat_dropdown = widgets.Dropdown(options=[], description="Sub-category:")
        self.reclassify_output = widgets.Output()

        # Bind events
        self.next_btn.on_click(lambda b: self.navigate(1))
        self.prev_btn.on_click(lambda b: self.navigate(-1))
        self.skip_btn.on_click(lambda b: self.navigate(1))
        self.correct_btn.on_click(self.mark_correct)
        self.incorrect_btn.on_click(self.show_reclassify)
        self.delete_btn.on_click(self.mark_delete)
        self.category_dropdown.observe(self.update_subcategories, names="value")
        # observe() only fires when the value changes, so seed the sub-category
        # list for the category that is selected from the start.
        self.update_subcategories({"new": self.category_dropdown.value})

        # Layout (the dropdowns are shown on demand by show_reclassify)
        nav_box = widgets.HBox(
            [self.prev_btn, self.skip_btn, self.next_btn, self.progress]
        )
        action_box = widgets.HBox(
            [self.correct_btn, self.incorrect_btn, self.delete_btn]
        )

        self.ui = widgets.VBox(
            [
                nav_box,
                self.info_output,
                self.audio_output,
                action_box,
                self.reclassify_output,
            ]
        )

    def display_current(self):
        """Render metadata, audio player and progress for the current sample."""
        if len(self.df) == 0:
            # Nothing to index into; show a placeholder instead of raising.
            with self.info_output:
                clear_output(wait=True)
                print("No samples to review")
            with self.audio_output:
                clear_output(wait=True)
            self.progress.value = "Sample 0 / 0"
            return

        row = self.df.iloc[self.current_idx]

        with self.info_output:
            clear_output(wait=True)
            print(f"ID: {row['id']}")
            print(f"Current: {row['category']} / {row['sub_category']}")
            print(
                f"Anomaly Score: {row['anomaly_score']:.3f} (percentile: {row['anomaly_percentile']:.1%})"
            )
            print(f"Source: {row['source']}")
            if row["reviewed"]:
                status = (
                    "✓ Correct"
                    if row["correct"]
                    else f"→ {row['new_category']}/{row['new_sub_category']}"
                )
                print(f"Status: {status}")

        with self.audio_output:
            clear_output(wait=True)
            if os.path.exists(row["file_path"]):
                display(Audio(row["file_path"]))
            else:
                print("⚠ Audio file not found")

        self.progress.value = f"Sample {self.current_idx + 1} / {len(self.df)}"

    def navigate(self, delta):
        """Move ``delta`` samples forward/backward, clamped to the valid range."""
        self.current_idx = max(0, min(len(self.df) - 1, self.current_idx + delta))
        self.display_current()
        # Hide any reclassification controls left over from the previous sample.
        with self.reclassify_output:
            clear_output()

    def _record_decision(self, correct, new_category=None, new_sub_category=None):
        """Write one complete decision for the current sample, then advance.

        All four tracking columns are written every time so that a changed
        decision never leaves values from the earlier one behind.
        """
        if len(self.df) == 0:
            # .loc on a missing label would append a row; there is nothing to mark.
            return
        idx = self.current_idx
        self.df.loc[idx, "reviewed"] = True
        self.df.loc[idx, "correct"] = correct
        self.df.loc[idx, "new_category"] = new_category
        self.df.loc[idx, "new_sub_category"] = new_sub_category
        self.navigate(1)

    def mark_correct(self, b):
        """Button callback: the existing label is right."""
        self._record_decision(True)

    def mark_delete(self, b):
        """Button callback: flag the sample for deletion."""
        self._record_decision(False, new_category="DELETE")

    def show_reclassify(self, b):
        """Button callback: show the category pickers and a confirm button."""
        with self.reclassify_output:
            clear_output(wait=True)
            confirm_btn = widgets.Button(
                description="Confirm Reclassification", button_style="info"
            )
            confirm_btn.on_click(self.confirm_reclassify)
            display(
                widgets.VBox(
                    [self.category_dropdown, self.subcat_dropdown, confirm_btn]
                )
            )

    def update_subcategories(self, change):
        """Refresh the sub-category options for the category in ``change["new"]``.

        Categories without an entry in ``sub_categories_map`` (such as
        "DELETE") get an empty list rather than keeping the previous options.
        """
        cat = change["new"]
        if cat in sub_categories_map:
            self.subcat_dropdown.options = sub_categories_map[cat].tolist()
        else:
            self.subcat_dropdown.options = []

    def confirm_reclassify(self, b):
        """Button callback: store the category/sub-category chosen in the dropdowns."""
        self._record_decision(
            False,
            new_category=self.category_dropdown.value,
            new_sub_category=self.subcat_dropdown.value,
        )

    def save_results(self, filename="anomaly_reviews_v1.csv"):
        """Write ``self.df`` to ``<root_path>/pipeline/2_quality_check/anomaly_reviews/<filename>``.

        The target directory is created if it does not exist.
        """
        save_path = f"{root_path}/pipeline/2_quality_check/anomaly_reviews/{filename}"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        self.df.to_csv(save_path, index=False)
        print(f"✓ Saved to {save_path}")

    def show(self):
        """Render the current sample and display the widget tree."""
        self.display_current()
        display(self.ui)


# Build the reviewer over the merged review table and render its widget UI.
# `reviewer` stays at module level because the save/summary cell further down
# reads reviewer.df after the interactive session.
reviewer = SampleReviewer(df_review)
reviewer.show()

Anomaly columns: ['id', 'anomaly_score', 'anomaly_percentile']
Labels columns: ['id', 'category', 'sub_category', 'temporal', 'source']
Anomalies shape: (501, 3)
Labels shape: (5113, 5)


VBox(children=(HBox(children=(Button(description='← Previous', style=ButtonStyle()), Button(description='Skip'…

In [5]:
# Persist the current review state to CSV
reviewer.save_results("anomaly_reviews_v1.csv")

# Tally the decisions recorded so far
review_state = reviewer.df
is_reviewed = review_state["reviewed"]
# `correct` holds True / False / None, so compare element-wise instead of
# relying on truthiness.
n_marked_correct = (is_reviewed & (review_state["correct"] == True)).sum()
n_marked_incorrect = (is_reviewed & (review_state["correct"] == False)).sum()

print("\nReview Summary:")
print(f"Total reviewed: {is_reviewed.sum()}")
print(f"Marked correct: {n_marked_correct}")
print(f"Marked for reclassification: {n_marked_incorrect}")

✓ Saved to /Users/jackcdawson/Desktop/dev/sample_identifier/pipeline/2_quality_check/anomaly_reviews/anomaly_reviews_v1.csv

Review Summary:
Total reviewed: 1
Marked correct: 0
Marked for reclassification: 1
