In [2]:
from file_manager import MusicFileManager
from annotation import main, xls_to_df
import pandas as pd
import os
import csv

In [9]:
# --- Example Setup ---
# Everything a reader is likely to tune lives here so the loop below has no magic numbers.
data_dir = "data/"  # <--- IMPORTANT: SET THIS
output_annotations_csv = "pseudo_annotations.csv"
excel_suffixes = ('.xls', '.xlsx')
# Row window passed to main() as start_row_index / end_row_index for every workbook.
start_row_index = 155
end_row_index = 200

# Recreated on every run of this cell, so re-running does not stack a second copy
# of the same annotations. It is handed to main() via annotations_list
# (presumably main() appends to it in place -- confirm in annotation.py).
all_annotations_for_csv = []  # This will hold all annotations from all files

# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(data_dir, exist_ok=True)

# os.listdir() returns names in arbitrary order, so sort to make the processing
# order (and the log output) reproducible between runs and machines.
# Names starting with "~$" are the owner/lock files Excel leaves next to an open
# workbook; they carry an Excel extension but are not readable workbooks.
files_to_process = sorted(
    f for f in os.listdir(data_dir)
    if f.lower().endswith(excel_suffixes) and not f.startswith("~$")
)
if not files_to_process:
    print(f"No Excel files found in {data_dir}. Please add some files or check the path.")
    # exit() # Optional: exit if no files

for filename in files_to_process:
    print(f"\nProcessing file: {filename}")
    df_excel, sheet_name = xls_to_df(filename, base_dir=data_dir)

    # Either value being None is treated as "this workbook could not be read".
    if df_excel is not None and sheet_name is not None:
        # All workbooks share the single all_annotations_for_csv list, which yields
        # one combined CSV. For one annotation file per workbook, pass a fresh list here.
        print(f"Successfully read sheet '{sheet_name}' from {filename}")
        structured_data = main(
            df_excel,
            start_row_index=start_row_index,
            end_row_index=end_row_index,
            filename_for_ann=filename,
            sheetname_for_ann=sheet_name,
            annotations_list=all_annotations_for_csv,
            debug=True,  # Enable debug prints
            debug_location=True,
            debug_header=False
        )
        print(f"\n--- Structured Data for {filename} ---")
        print(structured_data.head())
        print("...")
    else:
        print(f"Could not process DataFrame from {filename}")


Processing file: RDO No. 1 - Laoag City, Ilocos Norte.xls
Successfully read sheet 'Sheet 9 (DO 047-2023)' from RDO No. 1 - Laoag City, Ilocos Norte.xls

Running find_location_components starting at df index 155

find_location_components: Processing df_row 155 (offset 0/2)

find_location_components: Processing df_row 156 (offset 1/2)

find_location_components: Processing df_row 157 (offset 2/2)

Running find_location_components starting at df index 156

find_location_components: Processing df_row 156 (offset 0/2)

find_location_components: Processing df_row 157 (offset 1/2)

find_location_components: Processing df_row 158 (offset 2/2)

Running find_location_components starting at df index 157

find_location_components: Processing df_row 157 (offset 0/2)

find_location_components: Processing df_row 158 (offset 1/2)

find_location_components: Processing df_row 159 (offset 2/2)

Running find_location_components starting at df index 158

find_location_components: Processing df_row 158 (off

In [4]:
# Row labels used when annotating spreadsheet rows.

# Location rows. NOTE(review): the P / C / B suffixes are not explained in this
# notebook -- presumably three location levels; confirm against annotation.py.
LABEL_LOC_P, LABEL_LOC_C, LABEL_LOC_B = "LOC_P", "LOC_C", "LOC_B"

# Table structure rows.
LABEL_HDR, LABEL_DATA = "HDR", "DATA"

# Fallback labels for rows that are empty or otherwise unclassified.
LABEL_BLANK, LABEL_OTHER = "BLANK", "OTHER"

# Reserved labels: per the original notes, the current main has no logic that
# detects titles or notes yet.
LABEL_TITLE, LABEL_NOTE = "TITLE", "NOTE"

In [5]:
# --- Write all collected annotations to a single CSV file ---
# Collapses all_annotations_for_csv to one entry per (filename, sheetname, row_index),
# keeping the most specific label seen for each row, then writes the result to
# output_annotations_csv. Prints a message instead when there is nothing to write.
if all_annotations_for_csv:
    # Higher number = more specific label. A label missing from this table ranks
    # as -1, below everything listed, so it never displaces a known label.
    label_priority = {LABEL_LOC_P: 5, LABEL_LOC_C: 5, LABEL_LOC_B: 5, LABEL_HDR: 4, LABEL_DATA: 3,
                      LABEL_TITLE: 2, LABEL_NOTE: 2, LABEL_BLANK: 1, LABEL_OTHER: 0}

    # One slot per row key. A dict lookup replaces the previous linear rescan of the
    # result list on every duplicate, which was quadratic on heavily duplicated input.
    best_annotation_by_row = {}
    for ann in all_annotations_for_csv:
        # The same row may be annotated more than once by different logic paths;
        # the key is global, so rows from different files/sheets never collide.
        ann_key = (ann["filename"], ann["sheetname"], ann["row_index"])
        existing_ann = best_annotation_by_row.get(ann_key)
        # Strict ">" keeps the first annotation seen when priorities tie.
        if existing_ann is None or (
            label_priority.get(ann["label"], -1) > label_priority.get(existing_ann["label"], -1)
        ):
            best_annotation_by_row[ann_key] = ann

    # Sort by filename, sheetname, then row_index for consistent output
    final_unique_annotations = sorted(
        best_annotation_by_row.values(),
        key=lambda x: (x["filename"], x["sheetname"], x["row_index"]),
    )

    print(f"\nWriting {len(final_unique_annotations)} pseudo-annotations to {output_annotations_csv}")
    annotation_df = pd.DataFrame(final_unique_annotations)
    # QUOTE_ALL quotes every field in the written CSV.
    annotation_df.to_csv(output_annotations_csv, index=False, quoting=csv.QUOTE_ALL)
    print("Annotation CSV created successfully.")
else:
    print("No annotations were generated.")


Writing 25346 pseudo-annotations to pseudo_annotations.csv
Annotation CSV created successfully.
