### Video Processing

In [1]:
FRAMEWORK_PATH = "./static/results/framework/"
MIN_VIDEOS = 20

import json
import os

from preprocess import pre_process_videos

def get_muffin_video_transcripts():
    library_metadata = {}
    with open("./metadata.json") as f:
        library_metadata = json.load(f)

    task_metadata = library_metadata["muffins"]

    muffin_videos = pre_process_videos(task_metadata["videos"])
    
    transcripts = []
    for video in muffin_videos:
        url = f"https://www.youtube.com/watch?v={video.video_id}"
        title = video.metadata["title"]
        content = ""
        transcript = []
        for sentence in video.sentences:
            if sentence['text'].strip() == "":
                continue
            content += f"{sentence['text']}\n"
            transcript.append({
                "text": sentence['text'],
                "start": sentence['start'],
                "end": sentence['finish'],
            })

        transcripts.append({
            "url": url,
            "title": "Making Muffins",
            "original_title": title,
            "content": content,
            "transcript": transcript,
        })
    return transcripts

def get_muffin_articles():
    database_path = "./static/datasets/muffin_articles/"
    articles = []
    
    for filename in os.listdir(database_path):
        with open(database_path + filename) as f:
            ### read line-by-line
            url = f.readline()
            title = f.readline()
            content = ""
            transcript = []
            for idx, line in enumerate(f):
                if line.strip() == "":
                    continue
                content += line
                transcript.append({
                    "text": line.strip(),
                    "start": idx,
                    "end": idx + 1,
                })

            articles.append({
                "url": url,
                "original_title": title,
                "title": "Making Muffins",
                "content": content,
                "transcript": transcript,
            })
    return articles

def get_dataset_muffins(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset

    dataset = get_muffin_articles()
    dataset = dataset + get_muffin_video_transcripts()
    print(f"Number of articles: {len(dataset)}")

    # dataset = add_info_labels_to_dataset(dataset, task)

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset

### Handle CrossTask data;
import csv
from helpers.video_scripts import extract_transcript

def library_cross_task():
    library = []
    PATH = "./static/datasets/crosstask/"
    library_path = os.path.join(PATH, "library.json")
    
    if os.path.exists(library_path):
        with open(library_path, "r") as f:
            library = json.load(f)
            return library

    tasks_path = os.path.join(PATH, "crosstask_release/tasks_primary.txt")
    videos_path = os.path.join(PATH, "crosstask_release/videos.csv")
    videos_val_path = os.path.join(PATH, "crosstask_release/videos_val.csv")

    """
    Task ID
    Task name
    URL of corresponding WikiHow page
    Number of steps
    Ordered list of comma-separated steps of the task
    """
    task_obj_ids = ["task_id", "task_name", "url", "num_steps", "steps"]

    with open(tasks_path) as f:
        lines = f.readlines()
        for start_idx in range(0, len(lines), 6):
            cur_task = {}
            finished = False
            for idx, task_obj_id in enumerate(task_obj_ids):
                if start_idx + idx >= len(lines):
                    finished = True
                    break
                cur_task[task_obj_id] = lines[start_idx + idx].strip()
            if finished is False:
                library.append(cur_task)

    for task in library:
        task["steps"] = task["steps"].split(",")
        task["videos"] = []

    for videos_path in [videos_path, videos_val_path]:
        with open(videos_path) as f:
            reader = csv.reader(f)
            for row in reader:
                task_id = row[0]
                video_id = row[1]
                video_url = row[2]
                for task in library:
                    if task["task_id"] == task_id:
                        task["videos"].append({
                            "video_id": video_id,
                            "video_url": video_url,
                        })

    def get_language(video_subtitles_path):
        with open(video_subtitles_path) as f:
            lines = f.readlines()
            for line in lines:
                if "Language:" in line:
                    return line.split(":")[1].strip()
        return None


    SUBTITLES_PATH = os.path.join(PATH, "subtitles")
    for task in library:
        for video in task["videos"]:
            video_id = video["video_id"]
            video_subtitles_path = os.path.join(SUBTITLES_PATH, f"{video_id}.vtt")
            video["subtitles"] = []

            language = get_language(video_subtitles_path)
            if language == "en":
                video["subtitles"] = extract_transcript(video_subtitles_path, None)

    ANNOTATIONS_PATH = os.path.join(PATH, "crosstask_release/annotations/")

    for task in library:
        for video in task["videos"]:
            video["annotations"] = []
            annotation_path = os.path.join(ANNOTATIONS_PATH, f"{task['task_id']}_{video['video_id']}.csv")
            if os.path.exists(annotation_path):
                with open(annotation_path) as f:
                    reader = csv.reader(f)
                    for row in reader:
                        video["annotations"].append({
                            "step": float(row[0]),
                            "start": float(row[1]),
                            "end": float(row[2]),
                        })
            else:
                print(f"No annotation found for {task['task_id']}_{video['video_id']}")

    ### label subtitles with step
    for task in library:
        for video in task["videos"]:
            annotated_subtitles = []
            for subtitle in video["subtitles"]:
                cur_step = None
                for annotation in video["annotations"]:
                    if subtitle["start"] >= annotation["start"] and subtitle["finish"] <= annotation["end"]:
                        cur_step = task["steps"][int(annotation["step"]) - 1]
                        break
                annotated_subtitles.append({
                    **subtitle,
                    "step": cur_step,
                })
            video["subtitles"] = annotated_subtitles

    ### restructure to be similar to the `dataset`

    ### save library as json
    with open(library_path, "w") as f:
        json.dump(library, f, indent=4)


def get_dataset_cross_task(task):
    """
    return dataset with the given task with a structure similar to the `dataset`
    """
    library = library_cross_task()
    dataset = []
    for _task in library:
        if _task["task_name"] == task:
            for video in _task["videos"]:
                content = ""
                transcript = []
                for subtitle in video["subtitles"]:
                    content += f"{subtitle['text']} "
                    transcript.append({
                        "text": subtitle['text'],
                        "start": subtitle['start'],
                        "end": subtitle['finish'],
                    })
                dataset.append({
                    "id": video["video_id"],
                    "url": video["video_url"],
                    "title": task,
                    "original_title": video["task"],
                    "content": content,
                    "transcript": transcript,
                    "steps": [],
                    "ipo": [],
                    "processed_ipos": [],
                })

    ### check if content is enough
    filtered_dataset = []
    for article in dataset:
        if len(article["content"]) < 100:
            continue
        filtered_dataset.append(article)
    dataset = filtered_dataset

    return dataset

def preprocess_cross_task(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset


    dataset = get_dataset_cross_task(task)
    print(f"Dataset for {task}: {len(dataset)}")

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset

def preprocess_custom_dataset(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset
    
    custom_tasks_path = "./static/datasets/custom-dataset/videos_tasks_per_category.json"
    
    with open(custom_tasks_path) as f:
        custom_tasks = json.load(f)

    videos = []
    category = None
    
    for _category in custom_tasks:
    
        for task_info in custom_tasks[_category]:
            if len(task_info["videos"]) < MIN_VIDEOS:
                continue
            _task = task_info["task_details"]["title"]
    
            if _task == task:
                videos.extend(task_info["videos"])
                category = _category
    

    if category is None:
        raise ValueError(f"Task {task} not found in any category")

    dataset = []
    for video in videos:
        content = ""
        for subtitle in video["transcript"]:
            content += f"{subtitle['text']} "
        dataset.append({
            "id": video["id"],
            "url": "https://www.youtube.com/watch?v=" + video["id"],
            "title": task,
            "original_title": video["title"],
            "category": category,
            "content": content,
            "transcript": video["transcript"],
            "steps": [],
            "ipo": [],
            "processed_ipos": [],
        })

    ### check if content is enough
    filtered_dataset = []
    for article in dataset:
        if len(article["content"]) < 100:
            continue
        filtered_dataset.append(article)
    dataset = filtered_dataset

    print(f"Dataset for {task}: {len(dataset)}")

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset    

### Describing Scope (attempt 1)

In [2]:
from helpers.bert import bert_embedding, clustering_custom
from prompts.stupid_experiment_3 import form_information_units

def build_information_units_v0(dataset, taskname, information_unit_similarity_threshold=0.8):
    parent_path = os.path.join(FRAMEWORK_PATH, f'{taskname}')
    if not os.path.exists(parent_path):
        os.makedirs(parent_path)

    path = os.path.join(parent_path, "information_units_v0.json")
    if os.path.exists(path):
        with open(path) as f:
            dataset = json.load(f)
    
    for video_idx, video in enumerate(dataset):
        if "pieces" in video:
            pieces = video["pieces"]
        else:
            ### forming the information units (conceptually should be easily redefinable)
            pieces = form_information_units(video['title'], video['transcript'])
            video['pieces'] = []
            for i, piece in enumerate(pieces):
                video['pieces'].append({
                    "piece_id": f"piece_{video_idx}_{i}",
                    **piece,
                    "labels": {},
                })

    ### TODO: maybe cluster for each type of information separately?
    all_pieces = []
    for video_idx, video in enumerate(dataset):
        for i, piece in enumerate(video['pieces']):
            piece["piece_id"] = f"piece_{video_idx}_{i}"
            all_pieces.append(piece)

    #### cluster similar pieces in `all_pieces`
    information_units = {}

    unit_labels = clustering_custom([piece["content"] for piece in all_pieces], information_unit_similarity_threshold)
    for i, piece in enumerate(all_pieces):
        cur_unit_id = f"unit_{unit_labels[i]}"
        piece["unit_id"] = cur_unit_id
        if cur_unit_id not in information_units:
            ### first piece is the representative of the cluster (IU)
            information_units[cur_unit_id] = {
                "content": piece["content"],
                "content_type": piece["content_type"],
                "instances": [piece["piece_id"]],
            }
        else:
            information_units[cur_unit_id]["instances"].append(piece["piece_id"])

    with open(path, "w") as f:
        json.dump(dataset, f, indent=4)

    return dataset, information_units

from prompts.stupid_experiment_3 import form_context_codebook, label_transcript_pieces

def build_codebook_v0(dataset, taskname, schema):
    """
    similar to VideoMix --> iteratively build the codebook
    TODO: may need to restrict the number of videos considered
    """
    schemaname = "context_" + schema["schema"].replace(" ", "_").lower()
    
    parent_path = os.path.join(FRAMEWORK_PATH, f'{taskname}')
    if not os.path.exists(parent_path):
        os.makedirs(parent_path)

    path = os.path.join(parent_path, f"{schemaname}_v0.json")
    if os.path.exists(path):
        with open(path) as f:
            schema = json.load(f)
            return schema

    ### iteratively build the context schema
    for video in dataset:
        new_labels = form_context_codebook(video["title"], video["transcript"], schema)
        schema["labels"] = new_labels

    with open(path, "w") as f:
        json.dump(schema, f, indent=4)

    return schema

def label_based_on_codebook_v0(dataset, schema):
    """
    label the dataset based on the codebook
    TODO: label multiple times per video
    """
    
    schema_name = schema["schema"]

    for video in dataset:
        labeled_pieces = label_transcript_pieces(video["title"], video["pieces"], schema)
        if len(labeled_pieces) != len(video["pieces"]):
            print(f"STRONG WARNING: {len(labeled_pieces)} != {len(video['pieces'])}")
        for piece_idx, piece in enumerate(video["pieces"]):
            if schema_name not in piece["labels"]:
                piece["labels"][schema_name] = []
            piece["labels"][schema_name].append(labeled_pieces[piece_idx])
    return dataset

def build_facet_candidates(dataset, schema):
    """
    VideoMix-like approach --> go over the biggest cells and identify facet candidates beyond what we have and try to update the list? combine similar candidates as much as possible?
    """
    pass

In [3]:
def calc_discriminativeness(context_schema, labeled_dataset, important_piece_types):
    ### TODO: should be adjusted by noise --> noise is proportional to the inter-labeling variability of the labeled dataset
    units_per_cell = {}
    for video in labeled_dataset:
        for piece in video["pieces"]:
            if piece["content_type"] not in important_piece_types:
                continue
            cell_id = ""
            for schema in context_schema:
                if schema["schema"] in piece["labels"]:
                    cur_labels = piece["labels"][schema["schema"]]
                    cell_id += "<" + cur_labels[0] + ">"
                else:
                    cell_id += "<None>"
                    print("WARNING: No label found for the piece ID.", piece["piece_id"])
            if cell_id not in units_per_cell:
                units_per_cell[cell_id] = {}
            if piece["unit_id"] not in units_per_cell[cell_id]:
                units_per_cell[cell_id][piece["unit_id"]] = 0
            units_per_cell[cell_id][piece["unit_id"]] += 1

    result = 0
    for cell_id in units_per_cell:
        if len(units_per_cell[cell_id]) == 0:
            continue
        result += len(units_per_cell[cell_id]) - 1 ### -1 because we don't count the cell itself
    return -result

def calc_compactness(context_schema):
    ### TODO: check if we prefer fewer facets or not
    result = 0
    for facet in context_schema:
        result += len(facet["labels"])
    result -= len(context_schema)
    return result

def calc_objective(context_schema, labeled_dataset, important_piece_types):
    ### combines discriminative and compactness
    d = calc_discriminativeness(context_schema, labeled_dataset, important_piece_types)
    c = calc_compactness(context_schema)
    return d / c, d, c

def macro_pruning(context_schema, labeled_dataset, important_piece_types, threshold=0.01):
    if len(context_schema) <= 1:
        return context_schema
    
    ### macroprune the context schema by removing the least discriminative facet
    o_best, d_best, c_best = calc_objective(context_schema, labeled_dataset, important_piece_types)
    facet_to_remove = None
    for i, facet in enumerate(context_schema):
        cur_context_schema = context_schema[:i] + context_schema[i+1:]
        o, d, c = calc_objective(cur_context_schema, labeled_dataset, important_piece_types)
        if o > o_best-threshold:
            o_best = max(o_best, o)
            facet_to_remove = i
    if facet_to_remove is None:
        return context_schema
    return context_schema[:facet_to_remove] + context_schema[facet_to_remove+1:]

def update_facet_candidates_and_labeled_dataset(
    context_schema, labeled_dataset, facet_candidates
):
    ### TODO: update the list of candidates & label the dataset...
    return facet_candidates, labeled_dataset

In [4]:

"""
    {label_structure} = {
        "title": {label title},
        "definition": {label_definition},
        "examples": [
            {
                "context": {text surrounding the content + content},
                "content": {content_text},
            }
        ]
    }
    {schema_structure} = {
        "schema": {facet title},
        "schema_plural": {facet plural},
        "definition": {definition of the schema/facet},
        
        "codebook_guidelines": [
            {guidelines for the LLM to generate the codebook}
            ...
        ],
        
        ### for labeling
        "labels": [
            {label_structure}
            ...
        ]
    }

    {labeled_dataset_structure} = {
        "url": {video url}
        "title": {task title},
        "original_title": {original video title},
        "content": {concatinated transcript},
        "transcript": [
            {
                "text": {text},
                "start": {start},
                "end": {end},
            },
            ...
        ],
        "pieces": {
            {piece_id}: {
                "piece_id": {piece_id},
                "unit_id": {unit_id},
                "content": {content of the piece},
                "content_type": {type of the content: Overview, Method, Explanation, Supplementary},
                "start": {start of the content},
                "end": {end of the content},
                "labels": {
                    {schema title}: [{label title}, ...], ### number of runs
                    ...
                }
            },
            ...
        }
    }

    approach_1_results = {
        "information_units": {
            {unit_id}: {
                "unit_id": {unit_id},
                "content": {content of the information unit},
                "content_type": {type of the content: Overview, Method, Explanation, Supplementary},
                "instances": [{id of the piece - piece_id}, ...],
            }
        },
        "context_schema": {
            {schema_name}: {schema_structure},
            ...
        },
        "facet_candidates": {
            {schema_name}: {schema_structure},
            ...
        },
        "labeled_dataset": [
            {labeled_dataset_structure},
            ...
        ]
    }
"""

def process_videos_approach_1(task, dataset, important_piece_types, stopping_delta_threshold=0.1, dummy=""):
    ### constants
    max_iterations = 100
    pruning_interval = 5
    pruning_threshold = 0.01
    max_macro_pruning_len = 2


    taskname = f'{task.replace(" ", "_").lower()}_{dummy}'

    ### build the `information units`
    labeled_dataset, information_units = build_information_units_v0(dataset, taskname, information_unit_similarity_threshold=0.8)

    ### Greedy Algorithm for constructing the context schema:

    facet_candidates = { ### start with subgoal & step
        "subgoal": {
            "schema": "subgoal",
            "schema_plural": "subgoals",
            "definition": "an intermediate objective or subtask that helps in achieving the final goal",
            "codebook_guidelines": [
                "{schema_plural} should be high-level and concise.",
                "Base each {schema} on meaningful intermediate outcome/result.",
                "Exclude any {schema_plural} unrelated to the core task, such as introductions, conclusions, or general commentary."
            ],
            "labels": [],
        },
        "step": {
            "schema": "step",
            "schema_plural": "steps",
            "definition": "key high-level steps involved in the task",
            "codebook_guidelines": [
                "{schema_plural} should be high-level and concise.",
                "Base each {schema} on an intermediate outcome with tangible results (e.g., \"Make Dough\", \"Grill Steak\"), instead of individual actions (e.g., \"Add Flour\", \"Turn on Grill\").",
                "Avoid using specific ingredients in the {schema} name (e.g., \"Add Tomato Paste\"). Instead, focus on the purpose of the {schema} (e.g., \"Make Sauce\" instead of \"Add Tomato Paste\").",
                "Group together related low-level actions into a single high-level {schema}. (e.g., combine \"Add Salt\" and \"Add Lime\" into \"Make Sauce\").",
                "A {schema} must span multiple transcript sentences, not just a single sentence. It should be high-level enough.",
                "Use a concise \"verb + object\" format to describe each {schema}, containing only one verb (e.g., \"Boil Potatoes\").",
                "Exclude any {schema_plural} unrelated to the core task, such as introductions, conclusions, or general commentary."
            ],
            "labels": [],
        }
    }
    context_schema = {}

    iterations = 0

    while True:
        iterations += 1
        if iterations > max_iterations:
            print("WARNING: Maximum number of iterations reached")
            break
        
        ### update the facet candidates
        facet_candidates, labeled_dataset = update_facet_candidates_and_labeled_dataset(context_schema, labeled_dataset, facet_candidates, important_piece_types)
        
        if len(facet_candidates) == 0:
            print("WARNING: No facet candidates left")
            break

        ### run pruning every 5 iterations
        if iterations % pruning_interval == 0:
            original_length = len(context_schema)
            while len(context_schema) > 1 and len(context_schema) > original_length - max_macro_pruning_len:
                new_context_schema = macro_pruning(
                    context_schema, labeled_dataset, important_piece_types, pruning_threshold
                )
                if len(new_context_schema) < len(context_schema):
                    context_schema = new_context_schema
                    continue
                else:
                    break            
        
        best_facet_toadd = None
        prev_o, prev_d, prev_c = calc_objective(context_schema, labeled_dataset, important_piece_types)
        best_o, best_d, best_c = prev_o, prev_d, prev_c
        for i, facet_candidate in enumerate(facet_candidates):
            ### get the labeling by the context schema
            candidate_context_schema = context_schema + [facet_candidate]
            ### calculate the discriminative and compactness
            o, d, c = calc_objective(candidate_context_schema, labeled_dataset, important_piece_types)
            if o > best_o:
                best_o = o
                best_facet_toadd = i

        if best_o < prev_o + stopping_delta_threshold:
            print("WARNING: Adding a facet didn't improve the objective `significantly`", best_o, prev_o, pruning_threshold)
            break

        if best_facet_toadd is None:
            print("WARNING: No best facet found / Seem to have converged")
            break
        context_schema.append(facet_candidates[best_facet_toadd])
        facet_candidates = facet_candidates[:best_facet_toadd] + facet_candidates[best_facet_toadd+1:]

    print("Completed in {} iterations".format(iterations))

    approach_1_results = {
        "information_units": information_units,
        "context_schema": context_schema,
        "facet_candidates": facet_candidates,
        "labeled_dataset": labeled_dataset,
    }

    return approach_1_results

### Main

In [5]:
import random
import numpy as np
import json

def stats(dataset):
    non_zero_content_types = ["Method", "Description", "Explanation", "Supplementary"]

    per_content_type = {}
    for video in dataset:
        for i, piece in enumerate(video['baseline_results']):
            content_type = piece['content_type']
            if content_type not in per_content_type:
                per_content_type[content_type] = {
                    "count": 0,
                    "count_p_s": [],
                }
            per_content_type[content_type]["count"] += 1
            per_content_type[content_type]["count_p_s"].append(len(piece["procedure_segments"]))
            if content_type in non_zero_content_types and len(piece["procedure_segments"]) > 1 and content_type != "Method":
                ### print prev 2 and the next 2 pieces
                to_print = []
                for j in range(max(0, i - 2), min(len(video['baseline_results']), i + 3)):
                    if i == j:
                        to_print.append(f"[({video['baseline_results'][j]['content_type']}) {video['baseline_results'][j]['content']}]")
                        to_print.append(str(video['baseline_results'][j]['procedure_segments']))
                    else:
                        to_print.append(f"({video['baseline_results'][j]['content_type']}) {video['baseline_results'][j]['content']}")
                        to_print.append(str(video['baseline_results'][j]['procedure_segments']))
                # print("\n".join(to_print))
                # print("-" * 100)
    
    to_print = []
    for video in dataset:
        for piece in video['baseline_results']:
            if piece['content_type'] in non_zero_content_types and piece["content_type"] != "Method":
                print_str = ""
                print_str += piece['content_type'] + "\n"
                print_str += piece['content'] + "\n"
                print_str += str(piece['procedure_segments']) + "\n"
                print_str += str(piece['procedure_segments_clustered']) + "\n"
                print_str += "\n"
                to_print.append(print_str)
    
    ### shuffle to_print
    random.shuffle(to_print)
    print("\n".join(to_print))

In [6]:
MUFFIN_TASK = "Making Muffins"

"""
Make French Toast			10 steps / 272 videos
Make Irish Coffee			5 steps / 248 videos
Change a Tire				11 steps / 119 videos
Build (sim.) Floating Shelves		5 steps / 173 videos
"""
CROSS_TASK_TASKS = [
    "Change a Tire",
    "Build (sim.) Floating Shelves",
    "Make French Toast",
    "Make Irish Coffee",
]

CUSTOM_TASKS = [
    ### Food and Entertaining
    "How to Make a Sushi Roll",
    "How to Make Caramel Apples",
    "How to Make a Milkshake Without Ice Cream",
    "How to Grill Steak",
    "How to Make Scrambled Eggs in a Microwave",

    ### Home and Garden
    "How to Grow Hydrangea from Cuttings",
    "How to Grow a Pumpkin",
    "How to Clean Bathroom Tile",
    "How to Polish Stainless Steel",
    "How to Clean a Glass Top Stove",
    "How to Get Rid of a Wasp's Nest",

    # Holidays and Traditions
    "How to Plant a Living Christmas Tree",

    # Sports and Fitness
    "How to Wrap Your Hands for Boxing",
    "How to Catch Trout",

    # Arts and Entertainment
    "How to Make a Paper Hat",
]


def get_dataset(task):
    if task == MUFFIN_TASK:
        return get_dataset_muffins(task, "framework_raw")
    elif task in CROSS_TASK_TASKS:
        return preprocess_cross_task(task, "framework_raw")
    elif task in CUSTOM_TASKS:
        return preprocess_custom_dataset(task, "framework_raw")

def print_csv(dataset, ann_key="baseline_results", info_types=[]):
    filename = f"framework_raw_{'_'.join(ann_key.split(' '))}.csv"
    with open(filename, "w") as f:
        f.write("video_id,url,original_title,transcript_id,content_type,content,context\n")
        for v_id, video in enumerate(dataset):
            if ann_key not in video:
                continue
            for i, transcript in enumerate(video[ann_key]):
                cur_str = f"{v_id},{video['url'].strip()},{video['original_title'].strip()},{i},{transcript['content_type'].strip()},\"{transcript['content'].strip()}\","
                if "context_step" in transcript:
                    cur_str += f"\"{transcript['context_step'].strip()}\""
                else:
                    if transcript['content_type'] in info_types:
                        cur_str += "\"<empty>\""
                    else:
                        cur_str += "\"<not-assigned>\""
                cur_str += "\n"
                f.write(cur_str)

def main(task):
    dataset = get_dataset(task)

### Fragmentation Analysis

In [7]:
from helpers.bert import bert_embedding
import numpy as np
import matplotlib.pyplot as plt

def shannon_entropy(distribution):
    """
    Calculate the Shannon entropy of a distribution.
    """
    return -np.sum(distribution * np.log(distribution + 1e-10))

def trace_of_covariance(pieces):
    if len(pieces) == 0:
        return 0
    trace = 0
    mean = np.mean(pieces, axis=0)
    for i in range(len(pieces)):
        trace += (pieces[i] - mean) @ (pieces[i] - mean).T
    trace /= len(pieces)
    return trace

def compare_fragmentation(pieces, labels, label_vocab, distance_delta=0.01):
    """
    Check if global fragmentation is higher or lower than the labeled fragmentation.
    """
    similarity_matrix = np.zeros((len(pieces), len(pieces)))
    for i in range(len(pieces)):
        for j in range(i+1):
            similarity_matrix[i, j] = np.dot(pieces[i], pieces[j])
            similarity_matrix[j, i] = similarity_matrix[i, j]

    distance_buckets = np.arange(0, 1+distance_delta, distance_delta)
    
    global_distance_distribution = np.zeros(len(distance_buckets))
    for i in range(len(pieces)):
        for j in range(len(pieces)):
            if i == j:
                continue
            distance = similarity_matrix[i, j]
            bucket = 0
            for i in range(len(distance_buckets)):
                if distance >= distance_buckets[i]:
                    bucket = i
            global_distance_distribution[bucket] += 1
    
    ### print # of global distances
    print(f"# of global distances: {np.sum(global_distance_distribution)}")
    print(f"Trace of covariance: {trace_of_covariance(pieces)}")

    ### normalize global distance distribution
    global_distance_distribution = global_distance_distribution / np.sum(global_distance_distribution) * 100
    
    distance_distribution_per_label = {}

    for label in label_vocab:
        labeled_distance_distribution = np.zeros(len(distance_buckets))
        cur_pieces = []
        for i in range(len(pieces)):
            if labels[i] != label:
                continue
            cur_pieces.append(pieces[i])
            for j in range(len(pieces)):
                if labels[j] != label:
                    continue
                if i == j:
                    continue
                distance = similarity_matrix[i, j]
                bucket = 0
                for i in range(len(distance_buckets)):
                    if distance >= distance_buckets[i]:
                        bucket = i
                labeled_distance_distribution[bucket] += 1
        
        ### print # of labeled distances
        print(f"# of labeled distances for {label}: {np.sum(labeled_distance_distribution)}")
        print(f"Trace of covariance for {label}: {trace_of_covariance(cur_pieces)}")

        distance_distribution_per_label[label] = labeled_distance_distribution / np.sum(labeled_distance_distribution) * 100

    ### plot the distance distribution in the same plot
    plt.figure(figsize=(10, 5))
    plt.plot(distance_buckets * 100, global_distance_distribution, label="Global", linestyle="--")
    for label in label_vocab:
        plt.plot(distance_buckets * 100, distance_distribution_per_label[label], label=label)
    plt.legend()
    plt.show()

custom_labels = {
    0: "350F",
    1: "350F",
    2: "375F",
    3: "375F",
    4: "375F",
    5: "350F",
    6: "??",
    7: "??",
    8: "425F",
    9: "??",
    10: "425F",
}

def experiment_analysis(dataset):
    """
    Embed all information pieces in the dataset;
    """
    # content_types_to_include = ["Method", "Description", "Explanation", "Supplementary"]
    content_types_to_include = ["Method"]
    pieces = []
    labels = []
    label_vocab = []
    for idx, video in enumerate(dataset):
        label = custom_labels[idx]
        if label not in label_vocab:
            label_vocab.append(label)
        for piece in video['baseline_results']:
            if piece['content_type'] not in content_types_to_include:
                continue
            # label = piece['content_type']
            # if label not in label_vocab:
            #     label_vocab.append(label)
            pieces.append(piece['content'])
            labels.append(label)
    
    pieces = bert_embedding(pieces)
    compare_fragmentation(pieces, labels, label_vocab, 0.05)

#### output
```
# of global distances: 110555.0
Trace of covariance: 0.6744245930655941
# of labeled distances for 350F: 11341.0
Trace of covariance for 350F: 0.7017171226929282
# of labeled distances for 375F: 8099.0
Trace of covariance for 375F: 0.6654950787623723
# of labeled distances for ??: 2600.0
Trace of covariance for ??: 0.5681160431282193
# of labeled distances for 425F: 7224.0
Trace of covariance for 425F: 0.6282378954045913
```
![image](/home/bekzat/starlab/video-analysis/probe/server/static/results/frag-analysis-1.png)

### OUTPUT

In [None]:
import os

task = MUFFIN_TASK
# task = CUSTOM_TASKS[14]
# task = CROSS_TASK_TASKS[0]
# task = CROSS_TASK_TASKS[1]
# task = CUSTOM_TASKS[13]

### `Greeting`, `Overview`, `Method`, `Supplementary`, `Explanation`, `Description`, `Conclusion`, and `Miscellaneous`

important_information_types = ["Method", "Supplementary", "Explanation", "Description"]
taskname = task.replace(" ", "_").lower() + "_experiment_1"

dataset = get_dataset(task)

# results = process_videos_approach_1(task, dataset, important_information_types, 0.1, "experiment_1")
# with open(f"{FRAMEWORK_PATH}{taskname}.json", "w") as f:
#     json.dump(dataset, f, indent=4)

dataset, information_units = build_information_units_v0(dataset, taskname, information_unit_similarity_threshold=0.8)

dataset = dataset[7:]

facet_candidates = { ### start with subgoal & step
    "subgoal": {
        "schema": "subgoal",
        "schema_plural": "subgoals",
        "definition": "an intermediate objective or subtask that helps in achieving the final goal",
        "codebook_guidelines": [
            "Subgoals should be high-level and concise.",
            "Base each subgoal on meaningful intermediate outcome/result.",
            "Exclude any subgoals unrelated to the core task, such as introductions, conclusions, or general commentary."
        ],
        "labels": [],
    },
    "step": {
        "schema": "step",
        "schema_plural": "steps",
        "definition": "key high-level steps involved in the task",
        "codebook_guidelines": [
            "Steps should be high-level and concise.",
            "Base each step on an intermediate outcome with tangible results (e.g., \"Make Dough\", \"Grill Steak\"), instead of individual actions (e.g., \"Add Flour\", \"Turn on Grill\").",
            "Avoid using specific ingredients in the step name (e.g., \"Add Tomato Paste\"). Instead, focus on the purpose of the step (e.g., \"Make Sauce\" instead of \"Add Tomato Paste\").",
            "Group together related low-level actions into a single high-level step. (e.g., combine \"Add Salt\" and \"Add Lime\" into \"Make Sauce\").",
            "A step must span multiple transcript sentences, not just a single sentence. It should be high-level enough.",
            "Use a concise \"verb + object\" format to describe each step, containing only one verb (e.g., \"Boil Potatoes\").",
            "Exclude any steps unrelated to the core task, such as introductions, conclusions, or general commentary."
        ],
        "labels": [],
    }
}

updated_schema = build_codebook_v0(dataset, taskname, facet_candidates["subgoal"])

labeled_pieces = label_based_on_codebook_v0(dataset, updated_schema)

print(json.dumps(labeled_pieces))

MESSAGES: [
  {
    "role": "system",
    "content": "\nYou are a helpful assistant who can understand and analyze tutorial videos."
  },
  {
    "role": "user",
    "content": "\nYou are analyzing a tutorial video for Making Muffins.\nYou are given a list of pieces of information from a tutorial (recipe, SOP, repair guide, etc.) along with a list of possible subgoals involved in the task. A subgoal is an intermediate objective or subtask that helps in achieving the final goal.\n\nYour task is to read through the pieces of information sequentially and label the pieces of information with the appropriate subgoal.\nThe subgoals may not be in order in the list, and some subgoals may not be used at all. Only assign a subgoal when the content clearly matches the subgoal. If it does not match any subgoal, leave it as empty string `\"\"`.\n\nHere is the list of subgoals:\n```\n[L1] Prepare Dry Ingredients\nDefinition: Mix dry ingredients like flour, baking powder, salt in a separate bowl\nExa

In [16]:
str1 =  "\nYou are analyzing a tutorial video for Making Muffins.\n\nYou are given a list of pieces of information from a tutorial (recipe, SOP, repair guide, etc.) along with a list of possible subgoals involved in the task. A subgoal is an intermediate objective or subtask that helps in achieving the final goal.\n\nYour task is to read through the pieces of information sequentially and label the pieces of information with the appropriate subgoal.\n\nThe subgoals may not be in order in the list, and some subgoals may not be used at all. Only assign a subgoal when the content clearly matches the subgoal. If it does not match any subgoal, leave it as empty string `\"\"`.\n\nHere is the list of subgoals:\n```\n[L1] Prepare Dry Ingredients\nDefinition: Mix dry ingredients like flour, baking powder, salt in a separate bowl\nExamples: \n\t- Context 0: Add 2 teaspoons of baking powder and 1 teaspoon of salt to the mixing bowl\n\t- Content 0: Adding baking powder and salt\n\n\t- Context 1: Add 2 cups of all-purpose flour in two stages during the mixing process\n\t- Content 1: Incorporating flour into the recipe\n[L2] Prepare Wet Ingredients\nDefinition: Combine and mix all liquid/wet ingredients in a single bowl\nExamples: \n\t- Context 0: Cream together 1 cup of softened salted butter and 1 cup of granulated sugar using a hand mixer\n\t- Content 0: Creaming butter and sugar\n\n\t- Context 1: Add 2 large eggs and mix briefly until just combined, about 30 seconds\n\t- Content 1: Adding eggs to wet ingredients\n\n\t- Context 2: Add 1 teaspoon of vanilla extract and mix into the butter-sugar mixture\n\t- Content 2: Incorporating vanilla extract\n[L3] Combine Ingredients\nDefinition: Gradually mix wet ingredients with dry ingredients to create batter\nExamples: \n\t- Context 0: Add half the flour, stir with a rubber spatula, then add half cup of milk or buttermilk\n\t- Content 0: Gradually incorporating flour and milk\n\n\t- Context 1: Stir ingredients together, scraping sides and bottom of bowl to ensure everything is mixed\n\t- Content 1: Ensuring complete ingredient integration\n[L4] Prepare Blueberries\nDefinition: Coat blueberries in flour to prevent sinking during baking\nExamples: \n\t- Context 0: Add 1 cup of fresh blueberries to the finished muffin batter and fold gently\n\t- Content 0: Incorporating blueberries into batter\n\n\t- Context 1: Optional: Add lemon zest along with blueberries for extra flavor\n\t- Content 1: Adding complementary flavors with blueberries\n[L5] Prepare Streusel Topping\nDefinition: Create a crumbly topping to add texture and flavor to muffins\nExamples: \n\t- Context 0: Mix 1/4 cup cold cubed butter, 1/2 cup brown sugar, 1/2 cup all-purpose flour, and 1/4 teaspoon ground cinnamon\n\t- Content 0: Mixing streusel ingredients\n\n\t- Context 1: Use a pastry cutter or two knives to create a coarse meal or sand-like texture\n\t- Content 1: Creating streusel texture\n[L6] Prepare for Baking\nDefinition: Line muffin tin and fill with batter, add toppings\nExamples: \n\t- Context 0: Line a 12-cup muffin tin with paper baking cups and spray with non-stick cooking spray\n\t- Content 0: Preparing muffin tin\n\n\t- Context 1: Fill each muffin cup about 2/3 full with batter, dividing evenly among 12 cups\n\t- Content 1: Filling muffin tin with batter\n\n\t- Context 2: Optional: Sprinkle granulated sugar or streusel topping on muffins before baking\n\t- Content 2: Adding toppings to muffins\n[L7] Preheat Oven\nDefinition: Set oven to the correct temperature for baking muffins\nExamples: \n\t- Context 0: Preheat oven to 425 degrees Fahrenheit before starting to prepare muffin batter\n\t- Content 0: Setting initial oven temperature\n\n\t- Context 1: After 7 minutes, reduce oven temperature to 350 degrees without opening the door\n\t- Content 1: Adjusting oven temperature during baking\n[L8] Bake Muffins\nDefinition: Bake muffins in the oven until cooked through\nExamples: \n\t- Context 0: Bake muffins at 425 degrees for 7 minutes, then reduce to 350 degrees for 12-15 minutes\n\t- Content 0: Baking muffins at specified temperatures\n\n\t- Context 1: Watch muffins carefully as cooking time may vary depending on your specific oven\n\t- Content 1: Monitoring muffin baking process\n```\n\nHere is the list of pieces of information with ids:\n```\npiece_7_0. Now that I've successfully learnt how to make muffins, I had to make a triple chocolate version.\npiece_7_1. Add all the wet ingredients to a bowl.\npiece_7_2. Whisk the wet ingredients until combined.\npiece_7_3. In another bowl, add the flour, cocoa powder, and caster sugar.\npiece_7_4. Whisk the flour, cocoa powder, and caster sugar together.\npiece_7_5. Add your chocolate chips.\npiece_7_6. Coating the chocolate chips in the flour helps stop them from sinking.\npiece_7_7. Fold the wet ingredients into the dry mix.\npiece_7_8. Refrigerate the mixture for at least one hour.\npiece_7_9. Refrigerating the mixture helps it rise without overflowing.\npiece_7_10. Scoop the batter into muffin cases.\npiece_7_11. Top the batter with a mix of chocolate chips.\npiece_7_12. Follow these baking instructions.\npiece_7_13. Enjoy!\n```\n\nReturn the pieces of information with labels in the same order as they were provided."


print(str1)


You are analyzing a tutorial video for Making Muffins.

You are given a list of pieces of information from a tutorial (recipe, SOP, repair guide, etc.) along with a list of possible subgoals involved in the task. A subgoal is an intermediate objective or subtask that helps in achieving the final goal.

Your task is to read through the pieces of information sequentially and label the pieces of information with the appropriate subgoal.

The subgoals may not be in order in the list, and some subgoals may not be used at all. Only assign a subgoal when the content clearly matches the subgoal. If it does not match any subgoal, leave it as empty string `""`.

Here is the list of subgoals:
```
[L1] Prepare Dry Ingredients
Definition: Mix dry ingredients like flour, baking powder, salt in a separate bowl
Examples: 
	- Context 0: Add 2 teaspoons of baking powder and 1 teaspoon of salt to the mixing bowl
	- Content 0: Adding baking powder and salt

	- Context 1: Add 2 cups of all-purpose flour i