In [2]:
import json
import os
from collections import defaultdict
from typing import Dict, List, Optional

import psycopg2
from psycopg2.extras import DictCursor

# The connection string is read from the environment, so it is not stored in the notebook.
# NOTE(review): os.getenv returns None when DATABASE_URI is unset -- confirm it is
# exported in the kernel's environment before running the connection cell.
DATABASE_URI = os.getenv('DATABASE_URI')
# Table names share a prefix; both derived names are interpolated into the SQL
# built by fetch_annotations.
TABLE_PREFIX = "prod_"
ANNOTATION_TABLE_NAME = TABLE_PREFIX + "annotations"
SUBTASK_TABLE_NAME = TABLE_PREFIX + "subtasks"
# NOTE(review): ANNOTATIONS_FOLDER is not referenced by any cell visible here --
# confirm whether it is still needed.
ANNOTATIONS_FOLDER = 'output'
# Despite the name this is a bare file name; it is passed as the `filename`
# argument of write_annotations_to_file.
ANNOTATIONS_FILE_PATH = 'annotations_output.json'

In [19]:
# Open the database connection (sslmode="require") and a DictCursor so rows can be
# read by column name, which is how fetch_annotations accesses them.
# NOTE: fetch_annotations closes both the cursor and the connection when it finishes,
# so this cell has to be re-run before fetch_annotations can be called again.
conn = psycopg2.connect(DATABASE_URI, sslmode="require")
cursor = conn.cursor(cursor_factory=DictCursor)

In [4]:
def defaultdict_to_regular(d):
    """Recursively convert defaultdicts, at any nesting depth, into plain dicts.

    The walk descends through defaultdicts, plain dicts and lists, so a
    defaultdict stored inside a regular dict or a list is converted as well.
    Any other value (tuples, strings, numbers, other dict subclasses, ...) is
    returned unchanged.

    Args:
        d: The value to convert; typically a (nested) defaultdict.

    Returns:
        A structure equal to ``d`` in which every defaultdict reached by the
        walk has been replaced by a plain ``dict``. Dicts and lists that are
        traversed are rebuilt, so the input is never mutated.
    """
    # Plain dicts are traversed too: they may hold defaultdicts further down.
    if isinstance(d, defaultdict) or type(d) is dict:
        return {key: defaultdict_to_regular(value) for key, value in d.items()}
    if type(d) is list:
        return [defaultdict_to_regular(item) for item in d]
    return d

In [5]:
def fetch_annotations(cursor: psycopg2.extensions.cursor) -> Dict[str, Dict]:
    """Fetch every annotation with its subtasks, grouped by video filename.

    Runs a single query joining the annotation and subtask tables and builds::

        {video_filename: {annotation_id: {
            "username": ...,
            "created_at": ...,
            "subtask_decomposition": [(start_step, end_step, subtask), ...],
            "time_spent": <sum of time_spent over the annotation's subtask rows>,
        }}}

    Args:
        cursor: An open cursor whose rows can be indexed by column name
            (for example one created with ``cursor_factory=DictCursor``).

    Returns:
        The nested structure above as plain dicts. If anything fails, the error
        is printed, the transaction is rolled back, and whatever was grouped
        before the failure (possibly an empty dict) is returned.

    Side effects:
        Always closes ``cursor`` and the connection it belongs to, so a new
        connection must be opened before calling this function again.
    """
    # Use the connection that owns this cursor rather than a notebook-global
    # `conn`, so rollback/close always act on the right connection.
    connection = cursor.connection

    # Created before the try block so the final return still works when
    # execute()/fetchall() raise before any row has been processed.
    annotations = defaultdict(lambda: defaultdict(lambda: {
        "username": None,
        "created_at": None,
        "subtask_decomposition": [],
        "time_spent": 0,
    }))

    try:
        # Fetch all annotations together with their subtasks.
        # NOTE(review): there is no ORDER BY, so the order of the entries in
        # "subtask_decomposition" is whatever order the database returns.
        query = f"""
            SELECT a.video_filename, a.id, a.username, a.created_at, s.start_step, s.end_step, s.subtask, s.time_spent
            FROM {ANNOTATION_TABLE_NAME} a
            JOIN {SUBTASK_TABLE_NAME} s ON a.id = s.annotation_id"""

        cursor.execute(query)
        rows = cursor.fetchall()

        # Group the data by video filename, then by annotation id.
        for row in rows:
            entry = annotations[row['video_filename']][row['id']]
            entry["username"] = row['username']
            entry["created_at"] = row['created_at']
            entry["subtask_decomposition"].append(
                (row['start_step'], row['end_step'], row['subtask'])
            )
            entry["time_spent"] += row['time_spent']

    except Exception as e:
        print(f"An error occurred: {e}")
        connection.rollback()  # Roll back the transaction in case of error
    finally:
        cursor.close()
        connection.close()

    return defaultdict_to_regular(annotations)


In [20]:
# Load every production annotation, then display how many distinct video
# filenames (the top-level keys) were returned.
prod_annotations = fetch_annotations(cursor)
len(prod_annotations)


200

In [8]:
def write_annotations_to_file(annotations: dict, filename: str, output_dir: Optional[str] = None) -> str:
    """Write the annotations to an indented JSON file and return the file's path.

    Args:
        annotations: The data to serialise. Values the json module cannot
            encode natively are written as ``str(value)``.
        filename: Name of the file to create inside the output directory.
        output_dir: Directory to write into; it is created if missing. When
            omitted, the file goes to ``<parent of the current working
            directory>/output``.

    Returns:
        The full path of the written file.
    """
    if output_dir is None:
        # Default location: go up exactly ONE level from the current working
        # directory and use its 'output' folder. This depends on where the
        # kernel was started; pass output_dir explicitly to avoid that.
        parent_dir = os.path.dirname(os.path.abspath('.'))
        output_dir = os.path.join(parent_dir, 'output')

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Full path to the output file
    file_path = os.path.join(output_dir, filename)

    print(f"Writing to file: {file_path}")

    with open(file_path, 'w') as file:
        # default=str stringifies values json cannot encode itself
        # (presumably the datetime in "created_at").
        json.dump(annotations, file, indent=4, default=str)

    print(f"Annotations successfully written to {file_path}")
    return file_path

In [21]:
# Persist the fetched annotations as JSON; ANNOTATIONS_FILE_PATH is used as the file name.
write_annotations_to_file(prod_annotations, ANNOTATIONS_FILE_PATH)

Writing to file: /Users/jonathansalfity/Documents/dev/task_decomposition/task_decomposition/output/annotations_output.json
Annotations successfully written to /Users/jonathansalfity/Documents/dev/task_decomposition/task_decomposition/output/annotations_output.json


'/Users/jonathansalfity/Documents/dev/task_decomposition/task_decomposition/output/annotations_output.json'