### Video Processing

In [1]:
FRAMEWORK_PATH = "./static/results/framework/"
MIN_VIDEOS = 20

import json
import os

from preprocess import pre_process_videos

def get_muffin_video_transcripts():
    library_metadata = {}
    with open("./metadata.json") as f:
        library_metadata = json.load(f)

    task_metadata = library_metadata["muffins"]

    muffin_videos = pre_process_videos(task_metadata["videos"])
    
    transcripts = []
    for video in muffin_videos:
        url = f"https://www.youtube.com/watch?v={video.video_id}"
        title = video.metadata["title"]
        content = ""
        transcript = []
        for sentence in video.sentences:
            if sentence['text'].strip() == "":
                continue
            content += f"{sentence['text']}\n"
            transcript.append({
                "text": sentence['text'],
                "start": sentence['start'],
                "end": sentence['finish'],
            })

        transcripts.append({
            "url": url,
            "title": "Making Muffins",
            "original_title": title,
            "content": content,
            "transcript": transcript,
        })
    return transcripts

def get_muffin_articles():
    database_path = "./static/datasets/muffin_articles/"
    articles = []
    
    for filename in os.listdir(database_path):
        with open(database_path + filename) as f:
            ### read line-by-line
            url = f.readline()
            title = f.readline()
            content = ""
            transcript = []
            for idx, line in enumerate(f):
                if line.strip() == "":
                    continue
                content += line
                transcript.append({
                    "text": line.strip(),
                    "start": idx,
                    "end": idx + 1,
                })

            articles.append({
                "url": url,
                "original_title": title,
                "title": "Making Muffins",
                "content": content,
                "transcript": transcript,
            })
    return articles

def get_dataset_muffins(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset

    dataset = get_muffin_articles()
    dataset = dataset + get_muffin_video_transcripts()
    print(f"Number of articles: {len(dataset)}")

    # dataset = add_info_labels_to_dataset(dataset, task)

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset

### Handle CrossTask data;
import csv
from helpers.video_scripts import extract_transcript

def library_cross_task():
    library = []
    PATH = "./static/datasets/crosstask/"
    library_path = os.path.join(PATH, "library.json")
    
    if os.path.exists(library_path):
        with open(library_path, "r") as f:
            library = json.load(f)
            return library

    tasks_path = os.path.join(PATH, "crosstask_release/tasks_primary.txt")
    videos_path = os.path.join(PATH, "crosstask_release/videos.csv")
    videos_val_path = os.path.join(PATH, "crosstask_release/videos_val.csv")

    """
    Task ID
    Task name
    URL of corresponding WikiHow page
    Number of steps
    Ordered list of comma-separated steps of the task
    """
    task_obj_ids = ["task_id", "task_name", "url", "num_steps", "steps"]

    with open(tasks_path) as f:
        lines = f.readlines()
        for start_idx in range(0, len(lines), 6):
            cur_task = {}
            finished = False
            for idx, task_obj_id in enumerate(task_obj_ids):
                if start_idx + idx >= len(lines):
                    finished = True
                    break
                cur_task[task_obj_id] = lines[start_idx + idx].strip()
            if finished is False:
                library.append(cur_task)

    for task in library:
        task["steps"] = task["steps"].split(",")
        task["videos"] = []

    for videos_path in [videos_path, videos_val_path]:
        with open(videos_path) as f:
            reader = csv.reader(f)
            for row in reader:
                task_id = row[0]
                video_id = row[1]
                video_url = row[2]
                for task in library:
                    if task["task_id"] == task_id:
                        task["videos"].append({
                            "video_id": video_id,
                            "video_url": video_url,
                        })

    def get_language(video_subtitles_path):
        with open(video_subtitles_path) as f:
            lines = f.readlines()
            for line in lines:
                if "Language:" in line:
                    return line.split(":")[1].strip()
        return None


    SUBTITLES_PATH = os.path.join(PATH, "subtitles")
    for task in library:
        for video in task["videos"]:
            video_id = video["video_id"]
            video_subtitles_path = os.path.join(SUBTITLES_PATH, f"{video_id}.vtt")
            video["subtitles"] = []

            language = get_language(video_subtitles_path)
            if language == "en":
                video["subtitles"] = extract_transcript(video_subtitles_path, None)

    ANNOTATIONS_PATH = os.path.join(PATH, "crosstask_release/annotations/")

    for task in library:
        for video in task["videos"]:
            video["annotations"] = []
            annotation_path = os.path.join(ANNOTATIONS_PATH, f"{task['task_id']}_{video['video_id']}.csv")
            if os.path.exists(annotation_path):
                with open(annotation_path) as f:
                    reader = csv.reader(f)
                    for row in reader:
                        video["annotations"].append({
                            "step": float(row[0]),
                            "start": float(row[1]),
                            "end": float(row[2]),
                        })
            else:
                print(f"No annotation found for {task['task_id']}_{video['video_id']}")

    ### label subtitles with step
    for task in library:
        for video in task["videos"]:
            annotated_subtitles = []
            for subtitle in video["subtitles"]:
                cur_step = None
                for annotation in video["annotations"]:
                    if subtitle["start"] >= annotation["start"] and subtitle["finish"] <= annotation["end"]:
                        cur_step = task["steps"][int(annotation["step"]) - 1]
                        break
                annotated_subtitles.append({
                    **subtitle,
                    "step": cur_step,
                })
            video["subtitles"] = annotated_subtitles

    ### restructure to be similar to the `dataset`

    ### save library as json
    with open(library_path, "w") as f:
        json.dump(library, f, indent=4)


def get_dataset_cross_task(task):
    """
    return dataset with the given task with a structure similar to the `dataset`
    """
    library = library_cross_task()
    dataset = []
    for _task in library:
        if _task["task_name"] == task:
            for video in _task["videos"]:
                content = ""
                transcript = []
                for subtitle in video["subtitles"]:
                    content += f"{subtitle['text']} "
                    transcript.append({
                        "text": subtitle['text'],
                        "start": subtitle['start'],
                        "end": subtitle['finish'],
                    })
                dataset.append({
                    "id": video["video_id"],
                    "url": video["video_url"],
                    "title": task,
                    "original_title": video["task"],
                    "content": content,
                    "transcript": transcript,
                    "steps": [],
                    "ipo": [],
                    "processed_ipos": [],
                })

    ### check if content is enough
    filtered_dataset = []
    for article in dataset:
        if len(article["content"]) < 100:
            continue
        filtered_dataset.append(article)
    dataset = filtered_dataset

    return dataset

def preprocess_cross_task(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset


    dataset = get_dataset_cross_task(task)
    print(f"Dataset for {task}: {len(dataset)}")

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset

def preprocess_custom_dataset(task, dummy=""):
    dataset_filepath = f"{FRAMEWORK_PATH}{task.replace(' ', '_').lower()}_{dummy}.json"
    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as f:
            dataset = json.load(f)
        return dataset
    
    custom_tasks_path = "./static/datasets/custom-dataset/videos_tasks_per_category.json"
    
    with open(custom_tasks_path) as f:
        custom_tasks = json.load(f)

    videos = []
    category = None
    
    for _category in custom_tasks:
    
        for task_info in custom_tasks[_category]:
            if len(task_info["videos"]) < MIN_VIDEOS:
                continue
            _task = task_info["task_details"]["title"]
    
            if _task == task:
                videos.extend(task_info["videos"])
                category = _category
    

    if category is None:
        raise ValueError(f"Task {task} not found in any category")

    dataset = []
    for video in videos:
        content = ""
        for subtitle in video["transcript"]:
            content += f"{subtitle['text']} "
        dataset.append({
            "id": video["id"],
            "url": "https://www.youtube.com/watch?v=" + video["id"],
            "title": task,
            "original_title": video["title"],
            "category": category,
            "content": content,
            "transcript": video["transcript"],
            "steps": [],
            "ipo": [],
            "processed_ipos": [],
        })

    ### check if content is enough
    filtered_dataset = []
    for article in dataset:
        if len(article["content"]) < 100:
            continue
        filtered_dataset.append(article)
    dataset = filtered_dataset

    print(f"Dataset for {task}: {len(dataset)}")

    with open(dataset_filepath, "w") as f:
        json.dump(dataset, f, indent=4)
    return dataset    

In [2]:
import os
import json
from prompts.stupid_experiment_3 import segment_transcript_stupid



def segment_videos(task, dataset, dummy = ""):
    path = os.path.join(FRAMEWORK_PATH, f'{task.replace(" ", "_").lower()}_segmentation_{dummy}.json')
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
        
    for video in dataset:
        video['segmentation'] = segment_transcript_stupid(video['content'], TAXONOMY)

    with open(path, 'w') as f:
        json.dump(dataset, f, indent=4)
    return dataset


### Describing Scope (attempt 1)

In [None]:
from helpers.bert import bert_embedding, clustering_custom
from prompts.stupid_experiment_3 import form_information_units_few_shot

def build_information_units_v0(dataset, taskname, information_unit_similarity_threshold=0.8):
    parent_path = os.path.join(FRAMEWORK_PATH, f'{taskname}')
    if not os.path.exists(parent_path):
        os.makedirs(parent_path)

    path = os.path.join(parent_path, "information_units_v0.json")
    if os.path.exists(path):
        with open(path) as f:
            all_data = json.load(f)
            dataset = all_data["dataset"]
            information_units = all_data["information_units"]
            return dataset, information_units
    
    for video_idx, video in enumerate(dataset):
        if "pieces" in video:
            pieces = video["pieces"]
        else:
            ### forming the information units (conceptually should be easily redefinable)
            pieces = form_information_units_few_shot(video['title'], video['transcript'])
            video['pieces'] = []
            for i, piece in enumerate(pieces):
                video['pieces'].append({
                    "piece_id": f"{video_idx}_{i}",
                    "url": video['url'],
                    **piece,
                })

    all_pieces = []
    for video in dataset:
        for piece in video['pieces']:
            all_pieces.append(piece)

    #### cluster similar pieces in `all_pieces`
    information_units = {}

    unit_labels = clustering_custom([piece["content"] for piece in all_pieces], information_unit_similarity_threshold)
    for i, piece in enumerate(all_pieces):
        cur_unit_id = f"unit_{unit_labels[i]}"
        piece["unit_id"] = cur_unit_id
        if cur_unit_id not in information_units:
            ### first piece is the representative of the cluster (IU)
            information_units[cur_unit_id] = {
                "content": piece["content"],
                "content_type": piece["content_type"],
                "instances": [piece["piece_id"]],
            }
        else:
            information_units[cur_unit_id]["instances"].append(piece["piece_id"])

    with open(path, "w") as f:
        json.dump({"dataset": dataset, "information_units": information_units}, f, indent=4)

    return dataset, information_units

def build_codebook_v0(dataset, taskname, context_schema):
    """
    analyze each video and extract candidate labels for the schema
    cluster candidate labels to form codebook
    return codebook
    """
    path = os.path.join(FRAMEWORK_PATH, f'{taskname}', "codebook_v0.json")
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)

    ### TODO: code the `what_is_missing` parts to recognize candidate schemas and update the `context_schema`

    codebook = {}
    return codebook

def label_according_to_codebook_v0(dataset, codebook):
    """
    label information pieces in each video according to the codebook and return `piece-label` dictionary
    """
    cur_cim = {}
    return cur_cim

def critic_cim_v0(dataset, information_units, codebook, cim):
    """
    sample X labeled pieces and run a critic LLM to check if the context labels are sufficient. Provide the full video context to the critic LLM.
    """
    critic_results = {}
    return critic_results

def update_context_schema_v0(critic_results, context_schema, updated_schema):
    """
        critic_results = {
            "schema_name": {
                "piece_id": "label_name",
                "is_sufficient": "Yes/No",
                "what_is_missing": "missing_context",
            },
            ...
        }
        TODO: could consider other schemas beyond "what is missing" (e.g., interchangable? is contradicting?)
    """
    ### need to code the `what_is_missing` parts to recognize candidate schemas and update the `context_schema`
    last_update_proportion = 0
    new_context_schema = context_schema
    updated_schema = []

    ### TODO: code the `what_is_missing` parts to recognize candidate schemas and update the `context_schema`

    return last_update_proportion, new_context_schema, updated_schema

In [10]:
def process_videos_approach_1(task, dataset, stopping_criteria=0.1, dummy=""):
    taskname = f'{task.replace(" ", "_").lower()}_{dummy}'
    path = os.path.join(FRAMEWORK_PATH, taskname)
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    
    """
    approach_1_results = {
        "information_units": {
            "unit_id": {
                "content": "content_text",
                "content_type": "content_type",
                "instances": [piece_id_1, piece_id_2, ...],
            }
        },
        "context_schema": {
            "schema_name": {
                "schema": "schema_name",
                "definition": "definition",
                "format": "format",
                "prompt": "prompt",
                "examples": [
                    {
                        "content": "content_text",
                        "label": "label_name",
                        "definition": "label_definition",
                    }
                ]
            },
        },
        "codebook": {
            "schema_name": [
                {
                    "label": "label_name",
                    "definition": "label_definition",
                    "examples": [content_1, content_2, ...]
                },
            ]
        },
        "cim": {
            "schema_name": {
                "piece_id": "label_name",
            }
            ...
        }
    }
    """

    ### build the `information units`
    dataset, information_units = build_information_units_v0(dataset, taskname)

    ### build the `context_schema`
    context_schema = {
        ### simple initial schema
        "action": {
            "schema": "action",
            "definition": "The action that is being performend in the the tutorial.",
            "format": "verb",
            "prompt": "Identify the all the actions that are being performed in the tutorial following the format {format}. The actions should be in the same language as the tutorial.",
            "examples": [
                {
                    "content": "Whisk well until the ingredients are well combined.",
                    "label": "whisk",
                    "definition": "The action of mixing the ingredients together.",
                },
                {
                    "content": "Sift in the flour with baking powder and a pinch of salt.",
                    "label": "sift in",
                    "definition": "The action of sifting the dry ingredients.",
                }
            ]
        }
    }
    updated_schema = ["action"]
    codebook = {}
    cim = {}

    last_update_proportion = 1

    # while last_update_proportion > stopping_criteria:
    #     ### build the codebook for each schema
    #     for schema in updated_schema:
    #         codebook[schema] = build_codebook_v0(dataset, context_schema[schema], taskname)
    #         cim[schema] = label_according_to_codebook_v0(dataset, codebook[schema], taskname)
        
    #     last_update_proportion = 0
    #     updated_schema = []
    #     critic_results = critic_cim_v0(dataset, information_units, codebook, cim)
    #     last_update_proportion, new_context_schema, updated_schema = update_context_schema_v0(critic_results, context_schema, updated_schema)

    #     context_schema = new_context_schema

    approach_1_results = {
        "dataset": dataset,
        "information_units": information_units,
        "context_schema": context_schema,
        "codebook": codebook,
        "cim": cim,
    }    

    with open(path, 'w') as f:
        json.dump(approach_1_results, f, indent=4)
    return approach_1_results

### Main

In [11]:
import random
import numpy as np
import json

def stats(dataset):
    non_zero_content_types = ["Method", "Description", "Explanation", "Supplementary"]

    per_content_type = {}
    for video in dataset:
        for i, piece in enumerate(video['baseline_results']):
            content_type = piece['content_type']
            if content_type not in per_content_type:
                per_content_type[content_type] = {
                    "count": 0,
                    "count_p_s": [],
                }
            per_content_type[content_type]["count"] += 1
            per_content_type[content_type]["count_p_s"].append(len(piece["procedure_segments"]))
            if content_type in non_zero_content_types and len(piece["procedure_segments"]) > 1 and content_type != "Method":
                ### print prev 2 and the next 2 pieces
                to_print = []
                for j in range(max(0, i - 2), min(len(video['baseline_results']), i + 3)):
                    if i == j:
                        to_print.append(f"[({video['baseline_results'][j]['content_type']}) {video['baseline_results'][j]['content']}]")
                        to_print.append(str(video['baseline_results'][j]['procedure_segments']))
                    else:
                        to_print.append(f"({video['baseline_results'][j]['content_type']}) {video['baseline_results'][j]['content']}")
                        to_print.append(str(video['baseline_results'][j]['procedure_segments']))
                # print("\n".join(to_print))
                # print("-" * 100)
    
    to_print = []
    for video in dataset:
        for piece in video['baseline_results']:
            if piece['content_type'] in non_zero_content_types and piece["content_type"] != "Method":
                print_str = ""
                print_str += piece['content_type'] + "\n"
                print_str += piece['content'] + "\n"
                print_str += str(piece['procedure_segments']) + "\n"
                print_str += str(piece['procedure_segments_clustered']) + "\n"
                print_str += "\n"
                to_print.append(print_str)
    
    ### shuffle to_print
    random.shuffle(to_print)
    print("\n".join(to_print))

In [12]:
MUFFIN_TASK = "Making Muffins"

"""
Make French Toast			10 steps / 272 videos
Make Irish Coffee			5 steps / 248 videos
Change a Tire				11 steps / 119 videos
Build (sim.) Floating Shelves		5 steps / 173 videos
"""
CROSS_TASK_TASKS = [
    "Change a Tire",
    "Build (sim.) Floating Shelves",
    "Make French Toast",
    "Make Irish Coffee",
]

CUSTOM_TASKS = [
    ### Food and Entertaining
    "How to Make a Sushi Roll",
    "How to Make Caramel Apples",
    "How to Make a Milkshake Without Ice Cream",
    "How to Grill Steak",
    "How to Make Scrambled Eggs in a Microwave",

    ### Home and Garden
    "How to Grow Hydrangea from Cuttings",
    "How to Grow a Pumpkin",
    "How to Clean Bathroom Tile",
    "How to Polish Stainless Steel",
    "How to Clean a Glass Top Stove",
    "How to Get Rid of a Wasp's Nest",

    # Holidays and Traditions
    "How to Plant a Living Christmas Tree",

    # Sports and Fitness
    "How to Wrap Your Hands for Boxing",
    "How to Catch Trout",

    # Arts and Entertainment
    "How to Make a Paper Hat",
]


def get_dataset(task):
    if task == MUFFIN_TASK:
        return get_dataset_muffins(task, "framework_raw")
    elif task in CROSS_TASK_TASKS:
        return preprocess_cross_task(task, "framework_raw")
    elif task in CUSTOM_TASKS:
        return preprocess_custom_dataset(task, "framework_raw")

def print_csv(dataset, ann_key="baseline_results", info_types=[]):
    filename = f"framework_raw_{'_'.join(ann_key.split(' '))}.csv"
    with open(filename, "w") as f:
        f.write("video_id,url,original_title,transcript_id,content_type,content,context\n")
        for v_id, video in enumerate(dataset):
            if ann_key not in video:
                continue
            for i, transcript in enumerate(video[ann_key]):
                cur_str = f"{v_id},{video['url'].strip()},{video['original_title'].strip()},{i},{transcript['content_type'].strip()},\"{transcript['content'].strip()}\","
                if "context_step" in transcript:
                    cur_str += f"\"{transcript['context_step'].strip()}\""
                else:
                    if transcript['content_type'] in info_types:
                        cur_str += "\"<empty>\""
                    else:
                        cur_str += "\"<not-assigned>\""
                cur_str += "\n"
                f.write(cur_str)

def main(task):
    dataset = get_dataset(task)

### Fragmentation Analysis

In [13]:
from helpers.bert import bert_embedding
import numpy as np
import matplotlib.pyplot as plt

def shannon_entropy(distribution):
    """
    Calculate the Shannon entropy of a distribution.
    """
    return -np.sum(distribution * np.log(distribution + 1e-10))

def trace_of_covariance(pieces):
    if len(pieces) == 0:
        return 0
    trace = 0
    mean = np.mean(pieces, axis=0)
    for i in range(len(pieces)):
        trace += (pieces[i] - mean) @ (pieces[i] - mean).T
    trace /= len(pieces)
    return trace

def compare_fragmentation(pieces, labels, label_vocab, distance_delta=0.01):
    """
    Check if global fragmentation is higher or lower than the labeled fragmentation.
    """
    similarity_matrix = np.zeros((len(pieces), len(pieces)))
    for i in range(len(pieces)):
        for j in range(i+1):
            similarity_matrix[i, j] = np.dot(pieces[i], pieces[j])
            similarity_matrix[j, i] = similarity_matrix[i, j]

    distance_buckets = np.arange(0, 1+distance_delta, distance_delta)
    
    global_distance_distribution = np.zeros(len(distance_buckets))
    for i in range(len(pieces)):
        for j in range(len(pieces)):
            if i == j:
                continue
            distance = similarity_matrix[i, j]
            bucket = 0
            for i in range(len(distance_buckets)):
                if distance >= distance_buckets[i]:
                    bucket = i
            global_distance_distribution[bucket] += 1
    
    ### print # of global distances
    print(f"# of global distances: {np.sum(global_distance_distribution)}")
    print(f"Trace of covariance: {trace_of_covariance(pieces)}")

    ### normalize global distance distribution
    global_distance_distribution = global_distance_distribution / np.sum(global_distance_distribution) * 100
    
    distance_distribution_per_label = {}

    for label in label_vocab:
        labeled_distance_distribution = np.zeros(len(distance_buckets))
        cur_pieces = []
        for i in range(len(pieces)):
            if labels[i] != label:
                continue
            cur_pieces.append(pieces[i])
            for j in range(len(pieces)):
                if labels[j] != label:
                    continue
                if i == j:
                    continue
                distance = similarity_matrix[i, j]
                bucket = 0
                for i in range(len(distance_buckets)):
                    if distance >= distance_buckets[i]:
                        bucket = i
                labeled_distance_distribution[bucket] += 1
        
        ### print # of labeled distances
        print(f"# of labeled distances for {label}: {np.sum(labeled_distance_distribution)}")
        print(f"Trace of covariance for {label}: {trace_of_covariance(cur_pieces)}")

        distance_distribution_per_label[label] = labeled_distance_distribution / np.sum(labeled_distance_distribution) * 100

    ### plot the distance distribution in the same plot
    plt.figure(figsize=(10, 5))
    plt.plot(distance_buckets * 100, global_distance_distribution, label="Global", linestyle="--")
    for label in label_vocab:
        plt.plot(distance_buckets * 100, distance_distribution_per_label[label], label=label)
    plt.legend()
    plt.show()

custom_labels = {
    0: "350F",
    1: "350F",
    2: "375F",
    3: "375F",
    4: "375F",
    5: "350F",
    6: "??",
    7: "??",
    8: "425F",
    9: "??",
    10: "425F",
}

def experiment_analysis(dataset):
    """
    Embed all information pieces in the dataset;
    """
    # content_types_to_include = ["Method", "Description", "Explanation", "Supplementary"]
    content_types_to_include = ["Method"]
    pieces = []
    labels = []
    label_vocab = []
    for idx, video in enumerate(dataset):
        label = custom_labels[idx]
        if label not in label_vocab:
            label_vocab.append(label)
        for piece in video['baseline_results']:
            if piece['content_type'] not in content_types_to_include:
                continue
            # label = piece['content_type']
            # if label not in label_vocab:
            #     label_vocab.append(label)
            pieces.append(piece['content'])
            labels.append(label)
    
    pieces = bert_embedding(pieces)
    compare_fragmentation(pieces, labels, label_vocab, 0.05)

#### output
```
# of global distances: 110555.0
Trace of covariance: 0.6744245930655941
# of labeled distances for 350F: 11341.0
Trace of covariance for 350F: 0.7017171226929282
# of labeled distances for 375F: 8099.0
Trace of covariance for 375F: 0.6654950787623723
# of labeled distances for ??: 2600.0
Trace of covariance for ??: 0.5681160431282193
# of labeled distances for 425F: 7224.0
Trace of covariance for 425F: 0.6282378954045913
```
![image](/home/bekzat/starlab/video-analysis/probe/server/static/results/frag-analysis-1.png)

### OUTPUT

In [None]:
import os

task = MUFFIN_TASK
# task = CUSTOM_TASKS[14]
# task = CROSS_TASK_TASKS[0]
# task = CROSS_TASK_TASKS[1]
# task = CUSTOM_TASKS[13]

### `Greeting`, `Overview`, `Method`, `Supplementary`, `Explanation`, `Description`, `Conclusion`, and `Miscellaneous`

important_information_types = ["Method", "Supplementary", "Explanation", "Description"]

dataset = get_dataset(task)
dataset, information_units = build_information_units_v0(dataset, "muffins_experiment_1")

context_schema = {
    ### simple initial schema
    "action": {
        "schema": "action",
        "definition": "The action that is being performend in the the tutorial.",
        "format": "verb",
        "prompt": "Identify the all the actions that are being performed in the tutorial following the format {format}. The actions should be in the same language as the tutorial.",
        "examples": [
            {
                "content": "Whisk well until the ingredients are well combined.",
                "label": "whisk",
                "definition": "The action of mixing the ingredients together.",
            },
            {
                "content": "Sift in the flour with baking powder and a pinch of salt.",
                "label": "sift in",
                "definition": "The action of sifting the dry ingredients.",
            }
        ]
    }
}


