### Data handling

In [1]:
# Directory prefix for cached pipeline outputs. It is concatenated directly
# with file names in f-strings (no os.path.join), so the trailing slash matters.
RESULTS_PATH = "./static/results/representation/"

### Handle muffins data;

import json
import os
import pandas as pd

from preprocess import pre_process_videos

from prompts.stupid_experiment_2 import extract_steps

def process_muffins_gt_csv():
    """Parse the muffins ground-truth spreadsheet into a flat DataFrame.

    The CSV at ``./static/gts/muffins_gt.csv`` is read without a header row.
    Its first row holds subgoal names and its second row holds the IPO
    sub-headers; every later row is one data row.

    Columns without a sub-header are row-level metadata: a column that has a
    header is stored under that header (for example ``URL``), and a column
    with neither header nor sub-header is stored under ``VARIATION``. Every
    other non-empty cell becomes one output record that carries the row
    metadata plus ``subgoal``, ``ipo``, ``category``, ``information``,
    ``detail`` and ``raw_content``.

    Rows without a ``URL`` inherit the URL of the closest previous row that
    had one, and records without a ``VARIATION`` get ``"base"``.

    Returns:
        pandas.DataFrame: one row per non-empty (data row, IPO column) cell.
    """
    subgoals_row = 0
    ipo_row = 1
    url_id = "URL"
    var_id = "VARIATION"
    base = "base"

    gt_path = "./static/gts/"
    # Read the CSV file
    df = pd.read_csv(gt_path + 'muffins_gt.csv', header=None)

    # Keep the subgoal row and the IPO row aside as a two-level header
    new_headers = df.iloc[[subgoals_row, ipo_row]]
    df = df.drop([subgoals_row, ipo_row])

    dataset = []
    last_url = None
    for _, row in df.iterrows():
        general = {}
        cur_dataset = []
        cur_subgoal = None
        for ci in row.index:
            cur_header = new_headers.iloc[0, ci]
            # A subgoal header spans the following header-less columns, so it
            # has to be tracked for every column, including those whose cell
            # is empty in this row; otherwise a later column would be filed
            # under a stale (or missing) subgoal.
            if not pd.isna(cur_header):
                cur_subgoal = cur_header

            col = row[ci]
            if pd.isna(col):
                continue
            cur_subheader = new_headers.iloc[1, ci]

            if pd.isna(cur_subheader):
                # Metadata column: keyed by its header, or VARIATION if unnamed
                if pd.isna(cur_header):
                    general[var_id] = col
                else:
                    general[cur_header] = col
            else:
                cur_dataset.append({
                    "subgoal": cur_subgoal,
                    "ipo": cur_subheader,
                    "content": col,
                })

        # Rows without their own URL belong to the most recent URL seen
        if url_id not in general or general[url_id] is None:
            general[url_id] = last_url
        else:
            last_url = general[url_id]
        for data in cur_dataset:
            dataset.append({
                **general,
                **data,
            })

    for data in dataset:
        if var_id not in data:
            data[var_id] = base

        ### reformat the "content"
        raw_content = data.pop("content")
        information_list = []
        detail_list = []
        category_list = []

        for line in raw_content.split("\n"):
            # Classify and slice on the stripped text so that indented
            # bullets do not keep their leading dash.
            stripped = line.strip()
            if stripped == "":
                continue

            if stripped.startswith("- -"):
                ### detail
                detail_list.append(stripped[3:].strip())
            elif stripped.startswith("-"):
                ### information
                ### for process (the instructions are roughly divided with `-->`)
                information_list.append(stripped[1:].strip())
            else:
                ### category
                category_list.append(stripped)

        data["category"] = category_list
        data["information"] = information_list
        data["detail"] = detail_list
        data["raw_content"] = raw_content

    # reformat to pandas
    return pd.DataFrame(dataset)

def get_muffin_video_transcripts():
    """Return one ``{"url", "title", "content"}`` record per muffin video.

    The video list comes from the ``"muffins"`` entry of ``./metadata.json``
    and is passed through ``pre_process_videos``. ``content`` is the text of
    every sentence of the video, each followed by a newline.
    """
    with open("./metadata.json") as metadata_file:
        library_metadata = json.load(metadata_file)

    videos = pre_process_videos(library_metadata["muffins"]["videos"])

    transcripts = []
    for video in videos:
        record = {
            "url": f"https://www.youtube.com/watch?v={video.video_id}",
            "title": video.metadata["title"],
            "content": "".join(f"{sentence['text']}\n" for sentence in video.sentences),
        }
        transcripts.append(record)
    return transcripts

def get_muffin_articles():
    """Load every ``muffin_articles_*.txt`` file from ``./static/database/``.

    Each file is expected to hold the URL on its first line, the title on its
    second line and the article body on the remaining lines.

    Returns:
        list[dict]: ``{"url", "title", "content"}`` records in the order
        ``os.listdir`` yields the files.
    """
    database_path = "./static/database/"
    articles = []

    for filename in os.listdir(database_path):
        is_article = filename.startswith("muffin_articles_") and filename.endswith(".txt")
        if not is_article:
            continue
        with open(database_path + filename) as handle:
            # NOTE(review): readline() keeps the trailing newline, so "url"
            # and "title" end with "\n" -- confirm downstream code expects it.
            url = handle.readline()
            title = handle.readline()
            body = handle.read()
        articles.append({
            "url": url,
            "title": title,
            "content": body,
        })
    return articles

def add_steps_to_dataset(dataset, task):
    """Attach extracted steps to every record of ``dataset`` in place.

    For each record, the title and content are joined with a newline and
    handed to ``extract_steps`` together with ``task``; the result is stored
    under ``"steps"``. The same (mutated) ``dataset`` object is returned.
    """
    for entry in dataset:
        tutorial_text = "\n".join([entry["title"], entry["content"]])
        entry["steps"] = extract_steps(task, tutorial_text)
    return dataset

def get_dataset_muffins(task, dummy=""):
    """Return the muffins dataset with extracted steps, using a JSON cache.

    The cache file lives under ``RESULTS_PATH`` and is named after the
    lower-cased, underscore-separated ``task`` plus the ``dummy`` suffix. On a
    cache miss the articles and video transcripts are combined, steps are
    extracted, and the result is written to the cache before being returned.
    """
    slug = task.replace(' ', '_').lower()
    dataset_filepath = f"{RESULTS_PATH}{slug}_{dummy}.json"

    if not os.path.exists(dataset_filepath):
        # Articles first, then video transcripts
        dataset = get_muffin_articles() + get_muffin_video_transcripts()
        print(f"Number of articles: {len(dataset)}")

        dataset = add_steps_to_dataset(dataset, task)

        with open(dataset_filepath, "w") as cache_file:
            json.dump(dataset, cache_file, indent=4)
        return dataset

    with open(dataset_filepath) as cache_file:
        return json.load(cache_file)

In [2]:
### Handle CrossTask data;

import os
import csv
import json
from helpers.video_scripts import extract_transcript

def library_cross_task():
    """Build the CrossTask library, or load it from its JSON cache.

    The library is a list of task dicts with the keys ``task_id``,
    ``task_name``, ``url``, ``num_steps``, ``steps`` (list of step names) and
    ``videos``. Each video has ``video_id``, ``video_url``, ``subtitles``
    (English only, each labelled with the annotated step that fully contains
    it, or ``None``) and ``annotations`` (``step``/``start``/``end`` floats).

    The result is cached at ``./static/datasets/crosstask/library.json``.

    Returns:
        list[dict]: the library, both on a cache hit and after a fresh build.
    """
    PATH = "./static/datasets/crosstask/"
    library_path = os.path.join(PATH, "library.json")

    if os.path.exists(library_path):
        with open(library_path, "r") as f:
            return json.load(f)

    tasks_path = os.path.join(PATH, "crosstask_release/tasks_primary.txt")
    videos_path = os.path.join(PATH, "crosstask_release/videos.csv")
    videos_val_path = os.path.join(PATH, "crosstask_release/videos_val.csv")

    # Each task occupies five consecutive lines, read in groups of six lines:
    #   Task ID
    #   Task name
    #   URL of corresponding WikiHow page
    #   Number of steps
    #   Ordered list of comma-separated steps of the task
    task_obj_ids = ["task_id", "task_name", "url", "num_steps", "steps"]

    library = []
    with open(tasks_path) as f:
        lines = f.readlines()
    for start_idx in range(0, len(lines), 6):
        # Drop a trailing record that does not have all five fields
        if start_idx + len(task_obj_ids) > len(lines):
            break
        library.append({
            task_obj_id: lines[start_idx + idx].strip()
            for idx, task_obj_id in enumerate(task_obj_ids)
        })

    for task in library:
        task["steps"] = task["steps"].split(",")
        task["videos"] = []

    # Rows are (task id, video id, video url); train and validation lists are merged
    for cur_videos_path in [videos_path, videos_val_path]:
        with open(cur_videos_path) as f:
            for row in csv.reader(f):
                task_id = row[0]
                video_id = row[1]
                video_url = row[2]
                for task in library:
                    if task["task_id"] == task_id:
                        task["videos"].append({
                            "video_id": video_id,
                            "video_url": video_url,
                        })

    def get_language(video_subtitles_path):
        """Return the value of the first ``Language:`` line, or None."""
        with open(video_subtitles_path) as f:
            for line in f:
                if "Language:" in line:
                    return line.split(":")[1].strip()
        return None

    SUBTITLES_PATH = os.path.join(PATH, "subtitles")
    for task in library:
        for video in task["videos"]:
            video_id = video["video_id"]
            video_subtitles_path = os.path.join(SUBTITLES_PATH, f"{video_id}.vtt")
            video["subtitles"] = []

            # Report a missing subtitle file the same way a missing
            # annotation file is reported, instead of aborting the build.
            if not os.path.exists(video_subtitles_path):
                print(f"No subtitles found for {video_id}")
                continue

            language = get_language(video_subtitles_path)
            if language == "en":
                video["subtitles"] = extract_transcript(video_subtitles_path, None)

    ANNOTATIONS_PATH = os.path.join(PATH, "crosstask_release/annotations/")

    for task in library:
        for video in task["videos"]:
            video["annotations"] = []
            annotation_path = os.path.join(ANNOTATIONS_PATH, f"{task['task_id']}_{video['video_id']}.csv")
            if os.path.exists(annotation_path):
                with open(annotation_path) as f:
                    for row in csv.reader(f):
                        video["annotations"].append({
                            "step": float(row[0]),
                            "start": float(row[1]),
                            "end": float(row[2]),
                        })
            else:
                print(f"No annotation found for {task['task_id']}_{video['video_id']}")

    ### label subtitles with step
    for task in library:
        for video in task["videos"]:
            annotated_subtitles = []
            for subtitle in video["subtitles"]:
                cur_step = None
                for annotation in video["annotations"]:
                    # A subtitle is labelled only when it lies fully inside the annotation
                    if subtitle["start"] >= annotation["start"] and subtitle["finish"] <= annotation["end"]:
                        # Annotation step numbers are 1-based
                        cur_step = task["steps"][int(annotation["step"]) - 1]
                        break
                annotated_subtitles.append({
                    **subtitle,
                    "step": cur_step,
                })
            video["subtitles"] = annotated_subtitles

    ### save library as json
    with open(library_path, "w") as f:
        json.dump(library, f, indent=4)

    # Return the freshly built library so that the first (uncached) call
    # behaves like every later cached call.
    return library


def get_dataset_cross_task(task):
    """Return the CrossTask videos of ``task`` in the common dataset layout.

    Every video of the library task whose ``task_name`` equals ``task``
    becomes one record whose ``content`` is the video's subtitle texts, each
    followed by a newline. Records with fewer than 100 characters of content
    are dropped.
    """
    min_content_length = 100

    dataset = []
    for entry in library_cross_task():
        if entry["task_name"] != task:
            continue
        for video in entry["videos"]:
            record = {
                "id": video["video_id"],
                "url": video["video_url"],
                "title": task,
                "content": "".join(f"{subtitle['text']}\n" for subtitle in video["subtitles"]),
                "steps": [],
                "ipo": [],
                "processed_ipos": [],
            }
            ### check if content is enough
            if len(record["content"]) >= min_content_length:
                dataset.append(record)

    return dataset

def preprocess_cross_task(task, dummy=""):
    """Return the CrossTask dataset of ``task`` with steps, using a JSON cache.

    The cache file lives under ``RESULTS_PATH`` and is named after the
    lower-cased, underscore-separated ``task`` plus the ``dummy`` suffix. On a
    cache miss the dataset is built, steps are extracted, and the result is
    written to the cache before being returned.
    """
    slug = task.replace(' ', '_').lower()
    dataset_filepath = f"{RESULTS_PATH}{slug}_{dummy}.json"

    if not os.path.exists(dataset_filepath):
        dataset = get_dataset_cross_task(task)
        print(f"Dataset for {task}: {len(dataset)}")
        dataset = add_steps_to_dataset(dataset, task)

        with open(dataset_filepath, "w") as cache_file:
            json.dump(dataset, cache_file, indent=4)
        return dataset

    with open(dataset_filepath) as cache_file:
        return json.load(cache_file)

### Global Vars

In [3]:
# Keys of a per-step record that the pipeline iterates over when collecting
# elements.
IPO_KEYS = ["inputs", "outputs", "methods"]
# INFORMATION_KEYS = ["description", "explanation", "tips", "alternatives"]
# Kinds of information looked up on each element; a value may be a single
# item or a list of items.
INFORMATION_KEYS = ["description", "instruction", "explanations", "tips"]

# Split of INFORMATION_KEYS into two groups (not referenced in the visible
# part of this file).
PRIMARY_INFORMATION_KEYS = ["description", "instruction"]
SUPP_INFORMATION_KEYS = ["explanations", "tips"]

# Depth of each level of the dotted statistics keys
# ("r.<phase>.<step>.<ipo_key>.<element>"); the analysis code compares the
# value with the number of dots in a key.
KEY_LEVELS = {
    "r": 0,
    "phase": 1,
    "step": 2,
    "ipo": 3,
    "element": 4,
}

In [4]:
### Pipeline

from helpers.bert import hierarchical_clustering

from prompts.stupid_experiment_2 import aggregate_steps_stupid
from prompts.stupid_experiment_2 import extract_ipos
from prompts.stupid_experiment_2 import taxonomize_ipos_stupid
from prompts.stupid_experiment_2 import extract_information_per_ipo_stupid
from prompts.stupid_experiment_2 import cluster_information_stupid

def aggregate_hierarchical(items, task, distance_threshold=0.2):
    """Cluster ``items`` with BERT embeddings and average linkage.

    ``task`` is accepted for signature compatibility but is not used. The
    number of clusters is left open and controlled by ``distance_threshold``.
    The return value of ``hierarchical_clustering`` is passed through as is;
    callers in this file treat it as one cluster label per item.
    """
    clustering_options = {
        "embedding_method": "bert",
        "linkage": "average",
        "n_clusters": None,
        "distance_threshold": distance_threshold,
    }
    return hierarchical_clustering(items, **clustering_options)

def construct_step_taxonomy(dataset, task, dummy=""):
    """Aggregate the steps of all tutorials into a taxonomy, using a JSON cache.

    On a cache miss every step of every record is collected together with the
    URL of its tutorial and handed to ``aggregate_steps_stupid``; the result
    is written to ``<task>_taxonomy_<dummy>.json`` under ``RESULTS_PATH``.
    """
    slug = task.replace(' ', '_').lower()
    step_taxonomy_filepath = f"{RESULTS_PATH}{slug}_taxonomy_{dummy}.json"

    if os.path.exists(step_taxonomy_filepath):
        with open(step_taxonomy_filepath) as cache_file:
            return json.load(cache_file)

    all_steps = [
        {
            "step": step["step"],
            "description": step["description"],
            "original_tutorial": article["url"],
        }
        for article in dataset
        for step in article["steps"]
    ]

    ### LLM-based aggregation (hierarchical clustering was tried as an alternative)
    taxonomy = aggregate_steps_stupid(task, all_steps)

    with open(step_taxonomy_filepath, "w") as cache_file:
        json.dump(taxonomy, cache_file, indent=4)
    return taxonomy

def extract_ipos_stupid(dataset, taxonomy, task, dummy=""):
    """Attach IPO extractions to every record, using a JSON cache.

    On a cache hit the cached dataset is returned instead of the one passed
    in. Otherwise each record's title and content are joined with a newline,
    passed to ``extract_ipos`` with the step taxonomy, and the result is
    stored under ``"ipo"``; the updated dataset is then cached and returned.

    TODO: can potentially simplify the input-output-instruction extraction,
    since we are taxonomizing anyway...
    """
    slug = task.replace(' ', '_').lower()
    ipos_filepath = f"{RESULTS_PATH}{slug}_ipos_{dummy}.json"

    if os.path.exists(ipos_filepath):
        with open(ipos_filepath) as cache_file:
            return json.load(cache_file)

    for entry in dataset:
        tutorial_text = "\n".join([entry["title"], entry["content"]])
        entry["ipo"] = extract_ipos(task, taxonomy, tutorial_text)

    with open(ipos_filepath, "w") as cache_file:
        json.dump(dataset, cache_file, indent=4)
    return dataset

# def reduce_variance_ipos(dataset, taxonomy, task):
#     """
#     just reduce the variance with hierarchical clustering
#     """

#     ipo_per_step = {}
#     for step in taxonomy:
#         ipo_per_step[step["step"]] = {
#             "phase": step["phase"],
#             "tutorials": [],
#         }

#     for article in dataset:
#         for step_info in article["ipo"]:
#             step = step_info["step"]
#             if step not in ipo_per_step:
#                 print(f"Error: Step {step} not found in taxonomy")
#                 continue
#             cur_entry = {}
#             for ipo_key in IPO_KEYS:
#                 if ipo_key not in step_info:
#                     print(f"Error: {ipo_key} not found in step {step}")
#                     continue
#                 cur_entry[ipo_key] = step_info[ipo_key]
#             ipo_per_step[step]["tutorials"].append(cur_entry)

#     ipo_taxonomy = {}

#     for step in ipo_per_step:
#         subtask = ipo_per_step[step]["phase"] + ": " + step
#         tutorials = ipo_per_step[step]["tutorials"]
#         # ipo_taxonomy[step] = taxonomize_ipos_stupid(task, tutorials, subtask)

#         objects = []
#         instruction_sets = []
#         for tutorial in tutorials:
#             if len(tutorial["inputs"]) > 0:
#                 objects.extend(tutorial["inputs"])
#             if len(tutorial["outputs"]) > 0:
#                 objects.extend(tutorial["outputs"])
#             if len(tutorial["methods"]) > 0:
#                 instruction_sets.append("; ".join(tutorial["methods"]))

#         aggregate_hierarchical(objects, subtask, distance_threshold=0.2)
#         aggregate_hierarchical(instruction_sets, subtask, distance_threshold=0.2)

def taxonomize_ipos(dataset, taxonomy, task, dummy=""):
    """Build one IPO taxonomy per taxonomy step, using a JSON cache.

    The IPO extractions of all tutorials are grouped by step. Steps that are
    not in ``taxonomy`` and IPO keys missing from an extraction are reported
    with a printed message and skipped. Each group is passed to
    ``taxonomize_ipos_stupid`` with the subtask label ``"<phase>: <step>"``.

    Returns:
        dict: step name -> result of ``taxonomize_ipos_stupid``.
    """
    slug = task.replace(' ', '_').lower()
    ipo_taxonomy_filepath = f"{RESULTS_PATH}{slug}_ipo_taxonomy_{dummy}.json"

    if os.path.exists(ipo_taxonomy_filepath):
        with open(ipo_taxonomy_filepath) as cache_file:
            return json.load(cache_file)

    # One bucket per taxonomy step, filled with the per-tutorial IPO entries
    ipo_per_step = {
        tax_step["step"]: {"phase": tax_step["phase"], "tutorials": []}
        for tax_step in taxonomy
    }

    for article in dataset:
        for step_info in article["ipo"]:
            step = step_info["step"]
            if step not in ipo_per_step:
                print(f"Error: Step {step} not found in taxonomy")
                continue
            cur_entry = {}
            for ipo_key in IPO_KEYS:
                if ipo_key in step_info:
                    cur_entry[ipo_key] = step_info[ipo_key]
                else:
                    print(f"Error: {ipo_key} not found in step {step}")
            ipo_per_step[step]["tutorials"].append(cur_entry)

    ipo_taxonomy = {}
    for step, bucket in ipo_per_step.items():
        subtask = bucket["phase"] + ": " + step
        ipo_taxonomy[step] = taxonomize_ipos_stupid(task, bucket["tutorials"], subtask)

    with open(ipo_taxonomy_filepath, "w") as cache_file:
        json.dump(ipo_taxonomy, cache_file, indent=4)
    return ipo_taxonomy


    ### TODO: Try each component separately
    # for article in dataset:
    #     for step_info in article["ipo"]:
    #         step = step_info["step"]
    #         if step not in ipo_per_step:
    #             print(f"Error: Step {step} not found in taxonomy")
    #             continue
    #         for ipo_key in IPO_KEYS:
    #             if ipo_key not in step_info:
    #                 print(f"Error: {ipo_key} not found in step {step}")
    #                 continue
    #             if ipo_key not in ipo_per_step[step]:
    #                 ipo_per_step[step][ipo_key] = []
    #             ipo_per_step[step][ipo_key].append({
    #                 "present": step_info["present"],
    #                 "set": step_info[ipo_key],
    #                 "original_tutorial": article["url"],
    #             })
    
    # ipo_taxonomy_per_step = {}

    # for step in ipo_per_step:
    #     if step not in ipo_taxonomy_per_step:
    #         ipo_taxonomy_per_step[step] = {
    #             "phase": ipo_per_step[step]["phase"],
    #         }

    #     # for ipo_key in IPO_KEYS:
    #     #     if ipo_key not in ipo_taxonomy_per_step[step]:
    #     #         ipo_taxonomy_per_step[step][ipo_key] = []
    #     #     if ipo_key not in ipo_per_step[step]:
    #     #         ipo_per_step[step][ipo_key] = []
    #     #         continue
    #     #     ### taxonomize the sets: enter the entire lists with some context and try to aggregate wrt other parts of the IPO

def extract_information_per_ipo(dataset, step_taxonomy, ipo_taxonomy, task, dummy=""):
    """Attach per-step IPO information to every record, using a JSON cache.

    On a cache hit the cached dataset is returned instead of the one passed
    in. Otherwise, for every IPO extraction of every record whose step is
    known to both ``ipo_taxonomy`` and ``step_taxonomy``,
    ``extract_information_per_ipo_stupid`` is called and its result is stored
    in the record's ``"ipo_information"`` list together with ``step`` and
    ``subtask``. Unknown steps are reported with a printed message and skipped.
    """
    slug = task.replace(' ', '_').lower()
    dataset_filepath = f"{RESULTS_PATH}{slug}_ipo_information_{dummy}.json"

    if os.path.exists(dataset_filepath):
        with open(dataset_filepath) as cache_file:
            return json.load(cache_file)

    # Subtask label ("<phase>: <step>") for every step of the taxonomy
    step_to_subtask = {
        tax_step["step"]: f"{tax_step['phase']}: {tax_step['step']}"
        for tax_step in step_taxonomy
    }

    # NOTE: an earlier draft collected these calls and ran them with
    # asyncio.gather; they are issued sequentially here.
    for article in dataset:
        collected = []
        article["ipo_information"] = collected
        for step_info in article["ipo"]:
            step = step_info["step"]
            if step not in ipo_taxonomy:
                print(f"Error: Step {step} not found in ipo_taxonomy")
                continue
            if step not in step_to_subtask:
                print(f"Error: Step {step} not found in step_to_subtask")
                continue

            tutorial_text = article["title"] + "\n" + article["content"]
            subtask = step_to_subtask[step]
            cur_information = extract_information_per_ipo_stupid(
                task, ipo_taxonomy[step], tutorial_text, subtask
            )
            collected.append({
                "step": step,
                ### ideally add the phase?
                "subtask": subtask,
                **cur_information,
            })

    with open(dataset_filepath, "w") as cache_file:
        json.dump(dataset, cache_file, indent=4)
    return dataset

def extract_task_representation(dataset, step_taxonomy, ipo_taxonomy, task, agg_approach, agg_distance_threshold, dummy=""):
    """Build the task representation, or load it from its JSON cache.

    The representation is a nested dict
    ``phase -> step -> ipo_key -> element name -> list of clusters``. Each
    cluster is ``{"type", "content", "items"}``: ``type`` is the information
    key, ``items`` are the ``{"url", "content"}`` pieces grouped together and
    ``content`` is the content of the first item. The clusters of an element
    are sorted by decreasing number of items.

    Args:
        dataset: records with ``"url"`` and ``"ipo_information"``; a URL that
            occurs more than once is only processed the first time.
        step_taxonomy: list of ``{"step", "phase"}`` dicts used to map each
            step to its phase.
        ipo_taxonomy: accepted for signature compatibility; not used here.
        task: task name, used for the cache file name and the LLM call.
        agg_approach: ``"clustering"`` (``aggregate_hierarchical``) or
            ``"llm-based"`` (``cluster_information_stupid``).
        agg_distance_threshold: distance threshold forwarded to
            ``aggregate_hierarchical`` when ``agg_approach == "clustering"``.
        dummy: suffix of the cache file name.

    Returns:
        dict: the representation described above.

    Raises:
        ValueError: if information has to be aggregated and ``agg_approach``
            is neither ``"clustering"`` nor ``"llm-based"``.
    """
    representation_filepath = f"{RESULTS_PATH}{task.replace(' ', '_').lower()}_representation_{dummy}.json"

    if os.path.exists(representation_filepath):
        with open(representation_filepath) as f:
            return json.load(f)

    step_to_phase = {step["step"]: step["phase"] for step in step_taxonomy}

    ### covered articles; todo: remove later
    covered_urls = set()

    representation = {}  # phase&step --> ipo_taxonomy --> elements --> information

    ### extract the task representation
    for article in dataset:
        url = article["url"]
        if url in covered_urls:
            continue
        covered_urls.add(url)
        for step_info in article["ipo_information"]:
            step = step_info["step"]
            phase = step_to_phase[step]
            step_entry = representation.setdefault(phase, {}).setdefault(step, {})
            for ipo_key in IPO_KEYS:
                if ipo_key not in step_info:
                    continue
                elements = step_entry.setdefault(ipo_key, {})
                for element in step_info[ipo_key]:
                    element_name = element["name"]
                    # Only an explicit False marks the element as absent
                    if element["present"] is False:
                        continue
                    pieces = elements.setdefault(element_name, [])
                    for information_key in INFORMATION_KEYS:
                        if information_key not in element:
                            continue
                        ### some are str and some are list[str]
                        values = element[information_key]
                        if not isinstance(values, list):
                            values = [values]
                        for text in values:
                            pieces.append({
                                "url": url,
                                "content": text,
                                "type": information_key,
                            })

    def aggregate_information(items, key, context):
        """Group ``items`` of information kind ``key`` into clusters."""
        # Nothing to cluster: skip the clustering/LLM call altogether
        if not items:
            return []

        contents = [item["content"] for item in items]
        if agg_approach == "clustering":
            # Use the threshold supplied by the caller
            clusters = aggregate_hierarchical(contents, context, distance_threshold=agg_distance_threshold)
        elif agg_approach == "llm-based":
            clusters = cluster_information_stupid(task, contents, context, key)
        else:
            raise ValueError(
                f"Unknown agg_approach {agg_approach!r}; expected 'clustering' or 'llm-based'"
            )

        # clusters[i] is the label of items[i]; labels keep first-seen order
        information_taxonomy = {}
        for idx, cluster in enumerate(clusters):
            information_taxonomy.setdefault(cluster, []).append(items[idx])

        return [
            {
                "type": key,
                "content": cur_items[0]["content"],
                "items": cur_items,
            }
            for cur_items in information_taxonomy.values()
        ]

    ### cluster similar element information
    for phase in representation:
        for step in representation[phase]:
            for ipo_key in representation[phase][step]:
                elements = representation[phase][step][ipo_key]
                for element in elements:
                    items = elements[element]
                    ### TODO: can do this across each level for each type of info and see the overlap???
                    context = f"{phase} -> {step} -> {ipo_key} -> {element}"
                    library = []
                    for information_key in INFORMATION_KEYS:
                        actual_items = [
                            {"url": item["url"], "content": item["content"]}
                            for item in items
                            if item["type"] == information_key
                        ]
                        library.extend(
                            aggregate_information(actual_items, information_key, context)
                        )
                    ### sort library elements in the decreasing order of the number of items
                    library.sort(key=lambda x: len(x["items"]), reverse=True)
                    elements[element] = library

    ### save the representation
    with open(representation_filepath, "w") as f:
        json.dump(representation, f, indent=4)
    return representation

### Distribution

In [5]:
### Analysis

import numpy as np
import matplotlib.pyplot as plt

def analysis_count_summary(
    representation, tutorial_urls, url_to_nice_name,
    key_levels_to_draw=[],
    information_keys_to_draw=[],
):
    def get_statistics_per_information_key(counter_list):
        """
        calculate the average and std
        """
        results = {}
        for information_key in INFORMATION_KEYS:
            results[information_key] = {}
            statistics = {}
            for counter in counter_list:
                if information_key not in counter:
                    continue
                for key in counter[information_key]:
                    if key not in statistics:
                        statistics[key] = []
                    statistics[key].append(counter[information_key][key])

            for key in statistics:
                results[information_key][key] = {
                    "sum": sum(statistics[key]),
                    "average": np.mean(statistics[key]),
                    "std": np.std(statistics[key]),
                }
        return results
    
    all_statistics = []
    procedure_counter = []
    for phase in representation:
        phase_counter = []
        for step in representation[phase]:
            step_counter = []
            for ipo_key in representation[phase][step]:
                ipo_key_counter = []
                for element in representation[phase][step][ipo_key]:
                    element_counter = {}
                    for information_key in INFORMATION_KEYS:
                        per_information_key = []
                        for info in representation[phase][step][ipo_key][element]:
                                if info["type"] == information_key:
                                    per_information_key.append(info)

                        clustered = len(per_information_key)
                        
                        ### total number of information pieces
                        total = 0
                        for rep in per_information_key:
                            total += len(rep["items"])

                        ### presence of information in each tutorial
                        tutorial_coverages = {}
                        for url in tutorial_urls:
                            tutorial_coverages[url] = 0
                        for rep in per_information_key:
                            for item in rep["items"]:
                                if item["url"] not in tutorial_urls:
                                    continue
                                tutorial_coverages[item["url"]] = 1
                            
                        element_counter[information_key] = {
                            "clustered": clustered,
                            "total": total,
                            **tutorial_coverages,
                        }
                    cur_key = f"r.{phase}.{step}.{ipo_key}.{element}"
                    all_statistics.append({
                        "key": cur_key,
                        "statistics": get_statistics_per_information_key([element_counter]),
                    })
                    ipo_key_counter.append(element_counter)
                cur_key = f"r.{phase}.{step}.{ipo_key}"
                all_statistics.append({
                    "key": cur_key,
                    "statistics": get_statistics_per_information_key(ipo_key_counter),
                })
                step_counter.extend(ipo_key_counter)
            cur_key = f"r.{phase}.{step}"
            all_statistics.append({
                "key": cur_key,
                "statistics": get_statistics_per_information_key(step_counter),
            })
            phase_counter.extend(step_counter)
        cur_key = f"r.{phase}"
        all_statistics.append({
            "key": cur_key,
            "statistics": get_statistics_per_information_key(phase_counter),
        })
        procedure_counter.extend(phase_counter)
    all_statistics.append({
        "key": "r",
        "statistics": get_statistics_per_information_key(procedure_counter),
    })

    meaningful_stats = []
    for information_key in INFORMATION_KEYS:
        for all_stat in all_statistics:
                statistics = all_stat["statistics"]
                key = all_stat["key"]
                if information_key not in statistics:
                    continue
                
                cur_stats = {}
                for url in tutorial_urls:
                    total_info = statistics[information_key]["total"]["sum"]
                    clustered_info = statistics[information_key]["clustered"]["sum"]
                    if url not in statistics[information_key] or statistics[information_key][url]["sum"] == 0:
                        cur_stats[url] = {
                            "coverage": 0,
                            "of_total": 0,
                            "total": total_info,
                            "of_clustered": 0,
                            "clustered": clustered_info,
                        }
                        continue

                    point_coverage = statistics[information_key][url]["sum"]
                    cur_stats[url] = {
                        "coverage": point_coverage,
                        "of_total": round(point_coverage / total_info * 100, 2),
                        "total": total_info,
                        "of_clustered": round(point_coverage / clustered_info * 100, 2),
                        "clustered": clustered_info,
                    } 
                meaningful_stats.append({
                    "key": key,
                    "information_key": information_key,
                    "coverages": cur_stats,
                })

    def filter_by_key_level(key_level, information_key):
        num_of_dots = KEY_LEVELS[key_level]
        filtered = []
        for stat in meaningful_stats:
            if len(stat["key"].split(".")) != num_of_dots + 1:
                continue
            if information_key != stat["information_key"]:
                continue
            filtered.append(stat)
        return filtered

    ### draw bar chart for each key level & information key where each bar is a url and the height is the coverage `of_clustered`

    data_to_draw = {}
    for key_level in KEY_LEVELS:
        for information_key in INFORMATION_KEYS:
            filtered = filter_by_key_level(key_level, information_key)
            per_tutorial = {}
            max_clustered = []
            for stat in filtered:
                clustered = None
                for url in tutorial_urls:
                    if url not in stat["coverages"]:
                        continue
                    cropped_url = url_to_nice_name[url]
                    if cropped_url not in per_tutorial:
                        per_tutorial[cropped_url] = []
                    per_tutorial[cropped_url].append(stat["coverages"][url]["of_clustered"])
                    if clustered is not None and clustered != stat["coverages"][url]["clustered"]:
                        print(f"clustered is not the same for {url}")
                    clustered = stat["coverages"][url]["clustered"]
                max_clustered.append(clustered)
            for url in per_tutorial:
                per_tutorial[url] = np.mean(per_tutorial[url])
                
            
            data_to_draw[f"{key_level}.{information_key}"] = {
                "data": per_tutorial,
                "type": "percentage",
                "total": 100,
                # "type": "count",
                # "total": max_clustered,
                "add": f"L={len(max_clustered)}; {round(np.mean(max_clustered), 0)} Â± {round(np.std(max_clustered), 0)}",
            }

    fig, axs = plt.subplots(len(key_levels_to_draw), len(information_keys_to_draw), figsize=(15, 15))
    ### add padding between plots
    fig.subplots_adjust(wspace=0.5, hspace=1)

    for lvl_idx, key_level in enumerate(key_levels_to_draw):
        for info_idx, information_key in enumerate(information_keys_to_draw):
            key = f"{key_level}.{information_key}"
            data = data_to_draw[key]["data"]
            bar_type = data_to_draw[key]["type"]
            total = data_to_draw[key]["total"]
            add = data_to_draw[key]["add"]

            # bar_keys = list(data.keys())
            # bar_keys = sorted(bar_keys, key=lambda x: int(x[1:]))
            # bar_values = [data[bar_key] for bar_key in bar_keys]
            # axs[lvl_idx, info_idx].bar(bar_keys, bar_values)


            ### probably better to draw the distribution of # of tutorials for a particular coverage bracket
            bracket_size = 1 #percent
            bracketed_data = {}
            for bracket_idx in range(0, 20):
                bracketed_data[bracket_idx * bracket_size] = 0
            for url in data:
                coverage = data[url]
                bracket_id = int(coverage / bracket_size) * bracket_size
                if bracket_id not in bracketed_data:
                    bracketed_data[bracket_id] = 0
                bracketed_data[bracket_id] += 1

            bracketed_keys = list(bracketed_data.keys())
            bracketed_keys = sorted(bracketed_keys)
            bracketed_values = [bracketed_data[bracketed_key] for bracketed_key in bracketed_keys]

            axs[lvl_idx, info_idx].bar(bracketed_keys, bracketed_values)
            axs[lvl_idx, info_idx].set_title(key)
            axs[lvl_idx, info_idx].set_ylim(0, len(tutorial_urls))
            # axs[lvl_idx, info_idx].set_ylim(0, 5)

            axs[lvl_idx, info_idx].text(0.95, 0.95, f"{add}", ha="right", va="top", transform=axs[lvl_idx, info_idx].transAxes)
            ### reduce the size of x labels
            axs[lvl_idx, info_idx].tick_params(axis='x', labelsize=10)
            ### make x label intervals bracket_size
            axs[lvl_idx, info_idx].set_xticks(bracketed_keys)
            axs[lvl_idx, info_idx].set_xticklabels(bracketed_keys)
            axs[lvl_idx, info_idx].set_xlabel(f"{bar_type} coverage")
            

    fig.show()

    return meaningful_stats

In [6]:
### MAIN
def analyze_distribution(representation, tutorial_urls, url_to_nice_name):
    """
    Run `analysis_count_summary` for every key level, once with the primary
    information keys and once with the supplementary ones.

    The returned statistics are discarded; the function is called for the
    figures that `analysis_count_summary` shows and returns None.
    """
    for keys_to_draw in (PRIMARY_INFORMATION_KEYS, SUPP_INFORMATION_KEYS):
        analysis_count_summary(
            representation,
            tutorial_urls,
            url_to_nice_name,
            key_levels_to_draw=list(KEY_LEVELS.keys()),
            information_keys_to_draw=keys_to_draw,
        )

### Visualize

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import networkx as nx

from sklearn.metrics import pairwise_distances
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from umap import UMAP

def extract_subset_tutorial_representation(representation, urls):
    """
    Return a copy of the phase -> step -> ipo -> element nesting in which each
    element keeps only the representations that have at least one item whose
    "url" is in `urls`.

    Every phase/step/ipo/element key is preserved, even when its list of
    representations ends up empty. The kept representation dicts are the same
    objects as in the input (no deep copy).
    """
    def mentions_any_url(rep):
        return any(item["url"] in urls for item in rep["items"])

    return {
        phase: {
            step: {
                ipo_key: {
                    element: [rep for rep in reps if mentions_any_url(rep)]
                    for element, reps in elements.items()
                }
                for ipo_key, elements in ipo_groups.items()
            }
            for step, ipo_groups in steps.items()
        }
        for phase, steps in representation.items()
    }

def extract_tutorial_representation(representation, url):
    """
    Restrict `representation` to a single tutorial: the one-URL case of
    `extract_subset_tutorial_representation`.
    """
    single_url = [url]
    return extract_subset_tutorial_representation(representation, single_url)

def edit_distance(reps_1, reps_2, information_key):
    """
    Count the information pieces of type `information_key` that appear in
    exactly one of the two representation lists.

    Pieces are compared by their "content" value after de-duplication, so the
    result is the size of the symmetric difference of the two content sets.
    A message is printed when `reps_1` contains duplicate contents (only the
    first list is checked).
    """
    def contents_of(reps):
        return [rep["content"] for rep in reps if rep["type"] == information_key]

    raw_first = contents_of(reps_1)
    raw_second = contents_of(reps_2)

    unique_first = set(raw_first)
    if len(unique_first) != len(raw_first):
        print("Found duplicate info pieces when calculating edit distance")
    unique_second = set(raw_second)

    if not unique_first and not unique_second:
        return 0

    shared = len(unique_first & unique_second)
    return len(unique_first) + len(unique_second) - 2 * shared


def tutorial_distance(t1_rep, t2_rep, weights_map):
    """
    Weighted sum of per-element edit distances between two tutorial
    representations.

    Both representations must share the same phase/step/ipo/element keys
    (the keys of `t1_rep` are used to index `t2_rep`). For each element the
    edit distance is taken per information key; primary keys are multiplied by
    weights_map["primary"] and supplementary keys by weights_map["supp"].
    Phases, steps and ipo groups are all weighted equally.
    """
    weighted_groups = (
        ("primary", PRIMARY_INFORMATION_KEYS),
        ("supp", SUPP_INFORMATION_KEYS),
    )

    total = 0
    for phase, steps in t1_rep.items():
        for step, ipo_groups in steps.items():
            for ipo_key, elements in ipo_groups.items():
                for element, reps_1 in elements.items():
                    reps_2 = t2_rep[phase][step][ipo_key][element]
                    for weight_name, information_keys in weighted_groups:
                        for information_key in information_keys:
                            total += edit_distance(reps_1, reps_2, information_key) * weights_map[weight_name]
    return total

def tutorial_distance_matrix(representation, tutorial_urls, weights_map):
    """
    Build the symmetric matrix of pairwise tutorial distances.

    Returns a list of lists where entry [i][j] is the `tutorial_distance`
    between tutorial_urls[i] and tutorial_urls[j]; the diagonal is 0.
    """
    per_tutorial = [
        extract_tutorial_representation(representation, url)
        for url in tutorial_urls
    ]

    size = len(tutorial_urls)
    matrix = [[0] * size for _ in range(size)]

    # only the upper triangle is computed; each value is mirrored below it
    for row, rep_a in enumerate(per_tutorial):
        for col in range(row + 1, size):
            value = tutorial_distance(rep_a, per_tutorial[col], weights_map)
            matrix[row][col] = value
            matrix[col][row] = value
    return matrix

def visualize_tutorial_comparison(representation, url_1, url_2, bar_chart_level="phase", only_primary=True):
    """
    Compare the representations of two tutorials and show a stacked bar chart.

    Every information piece (a representation's "content", per information
    key) of every element is classified as:
        0 -> absent from both tutorials
        1 -> only in tutorial 1 (`url_1`)
        2 -> only in tutorial 2 (`url_2`)
        3 -> present in both
    The classes are counted per group, where the group is chosen by
    `bar_chart_level` ("phase", "step", "ipo", "element" or "information"),
    and each group's counts are normalised to fractions of its total. Only
    classes 3, 1 and 2 are drawn, so a bar does not reach 1.0 when some pieces
    are absent from both tutorials.

    With `only_primary=True` information keys outside
    PRIMARY_INFORMATION_KEYS are skipped. An unrecognised `bar_chart_level`
    puts everything into a single group keyed by None.

    Returns None; the chart is shown with `plt.show()`.
    """
    t1_rep = extract_tutorial_representation(representation, url_1)
    t2_rep = extract_tutorial_representation(representation, url_2)

    ABSENT, ONLY_FIRST, ONLY_SECOND, BOTH = 0, 1, 2, 3

    ### group key -> [count of class 0, class 1, class 2, class 3]
    bar_chart_data = {}
    for phase in representation:
        for step in representation[phase]:
            for ipo_key in representation[phase][step]:
                for element in representation[phase][step][ipo_key]:
                    reps = representation[phase][step][ipo_key][element]
                    reps_1 = t1_rep[phase][step][ipo_key][element]
                    reps_2 = t2_rep[phase][step][ipo_key][element]

                    level_to_key = {
                        "phase": phase,
                        "step": step,
                        "ipo": ipo_key,
                        "element": element,
                    }

                    for information_key in INFORMATION_KEYS:
                        if only_primary is True and information_key not in PRIMARY_INFORMATION_KEYS:
                            continue

                        ### classify every piece of this information key
                        per_piece = {}
                        for rep in reps:
                            if rep["type"] == information_key:
                                per_piece[rep["content"]] = ABSENT
                        for rep in reps_1:
                            if rep["type"] == information_key:
                                per_piece[rep["content"]] = ONLY_FIRST
                        for rep in reps_2:
                            if rep["type"] != information_key:
                                continue
                            if per_piece.get(rep["content"]) == ONLY_FIRST:
                                per_piece[rep["content"]] = BOTH
                            else:
                                per_piece[rep["content"]] = ONLY_SECOND

                        if bar_chart_level == "information":
                            bc_key = information_key
                        else:
                            bc_key = level_to_key.get(bar_chart_level)

                        ### the group is registered even when it has no pieces,
                        ### so it still gets a (blank) slot on the x axis
                        counts = bar_chart_data.setdefault(bc_key, [0, 0, 0, 0])
                        for status in per_piece.values():
                            counts[status] += 1

    ### draw a composition bar chart
    # 0 -> black (absent from both),
    # 1 -> blue (only in tutorial 1),
    # 2 -> red (only in tutorial 2),
    # 3 -> yellow (present in both)
    plt.figure(figsize=(5, 5))
    keys = list(bar_chart_data.keys())
    values = []
    for key in keys:
        counts = bar_chart_data[key]
        total = sum(counts)
        if total == 0:
            ### a group without any information piece: draw nothing instead of dividing by zero
            values.append([0.0, 0.0, 0.0, 0.0])
        else:
            values.append([count / total for count in counts])
    ### reshape keeps the array 2-D even when there are no groups at all
    values = np.array(values, dtype=float).reshape(-1, 4)
    bar_width = 0.35
    x = np.arange(len(keys))
    plt.bar(x, values[:, 3], width=bar_width, color='yellow', label='both')
    plt.bar(x, values[:, 1], width=bar_width, color='blue', label='only in 1', bottom=values[:, 3])
    plt.bar(x, values[:, 2], width=bar_width, color='red', label='only in 2', bottom=values[:, 3] + values[:, 1])
    # plt.bar(x, values[:, 0], width=bar_width, color='black', label='0', bottom=values[:, 3] + values[:, 1] + values[:, 2])
    plt.ylim(0, 1.0)
    plt.xticks(x, keys, rotation=45)
    plt.xlabel(bar_chart_level)
    ### the bars are fractions of each group's total, not raw counts
    plt.ylabel('Proportion')
    plt.legend()
    plt.show()

    print()

def visualize_tutorial_embeddings(distance_matrix, labels, labels_to_show, top_k=3, k_means_clusters_n=5, method="tsne"):
    """
    Visualize the tutorials in 2D space based on the pairwise distances.

    distance_matrix: a matrix of size (n, n) where n is the number of
        tutorials and values are floats; it is symmetrised before use.
    labels: currently unused (points are annotated with their index).
    labels_to_show: indices of the tutorials to annotate in the plot and to
        pick comparison candidates for.
    top_k: -1 returns every candidate (unsorted); otherwise candidates are
        sorted by 2D distance and the `top_k` closest plus the `top_k`
        farthest are returned (all of them when there are fewer than
        2 * top_k). top_k == 0 returns an empty list.
    k_means_clusters_n: number of k-means clusters fitted on the 2D points.
    method: "tsne", "umap" or "mds".

    Returns a list of tuples (i, j, label, dist, pos_i, pos_j) where label is
    'closest' or 'farthest' and dist is the Euclidean distance between the
    two embedded points.

    Raises ValueError for an unknown `method`.
    """
    if method not in ("tsne", "umap", "mds"):
        ### previously an unknown method reached KMeans with an empty result
        ### and failed there with an unrelated-looking error
        raise ValueError(f"Invalid method: {method}")

    # Make symmetric by averaging with transpose
    distance_matrix = np.array(distance_matrix)
    distance_matrix = (distance_matrix + distance_matrix.T) / 2

    # Scale distances to [0,1] range
    max_dist = np.max(distance_matrix)
    if max_dist > 0:  # Avoid division by zero
        scaled_distances = distance_matrix / max_dist
    else:
        scaled_distances = distance_matrix

    if method == "tsne":
        # NOTE(review): unlike the UMAP and MDS branches, t-SNE is not told the
        # input is a precomputed distance matrix, so each row is treated as a
        # feature vector. Confirm this is intended.
        tsne = TSNE(n_components=2, random_state=42, method="exact",)
        result = tsne.fit_transform(scaled_distances)
    elif method == "umap":
        umap = UMAP(n_components=2, random_state=42, metric="precomputed")
        result = umap.fit_transform(scaled_distances)
    else:
        ### MDS is fed the unscaled (but symmetrised) distances
        mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
        result = mds.fit_transform(distance_matrix)

    # k-means clustering
    kmeans = KMeans(n_clusters=k_means_clusters_n, random_state=42)
    kmeans.fit(result)
    k_means_labels = kmeans.labels_
    colors = matplotlib.colormaps.get_cmap("tab10")

    clusters = {}
    for i in range(len(result)):
        if k_means_labels[i] not in clusters:
            clusters[k_means_labels[i]] = []
        clusters[k_means_labels[i]].append(i)
    for cluster in clusters:
        print(f"Cluster {cluster}: {clusters[cluster]}")

    shown = set(labels_to_show)

    ## visualize
    plt.figure(figsize=(5, 5))
    plt.title(f"{method}")
    ### one scatter call per cluster so that the legend has an entry per cluster
    for cluster, members in clusters.items():
        plt.scatter(
            [result[i][0] for i in members],
            [result[i][1] for i in members],
            color=colors(cluster),
            label=f"cluster {cluster}",
        )
    for i in range(len(result)):
        if i in shown:
            plt.annotate(i, (result[i][0], result[i][1]), fontsize=8)
    plt.legend(loc="upper right", fontsize=8)
    plt.show()

    def eucledian(a, b):
        return np.sqrt(np.sum((a - b) ** 2))

    comparison_candidates = []
    ### for every shown tutorial, pick its closest and farthest neighbour in the 2D embedding
    pick = [(0, 'closest'), (-1, 'farthest')]
    # pick = [(6, "7th closest"), (8, '9th closest'), (9, '10th closest'), (10, 'farthest')]
    for i in range(len(result)):
        if i not in shown:
            continue
        distances = sorted(
            ((j, eucledian(result[i], result[j])) for j in range(len(result)) if j != i),
            key=lambda x: x[1],
        )
        if not distances:
            ### a single embedded point has no neighbours to compare with
            continue
        for idx, label in pick:
            j = distances[idx][0]
            dist = distances[idx][1]
            comparison_candidates.append((i, j, label, dist, result[i], result[j]))

    if top_k == -1:
        return comparison_candidates
    if top_k == 0:
        ### lst[-0:] is the whole list, so slicing would return everything here
        return []
    comparison_candidates = sorted(comparison_candidates, key=lambda x: x[3])
    if len(comparison_candidates) < top_k * 2:
        return comparison_candidates
    return comparison_candidates[0:top_k] + comparison_candidates[-top_k:]


def visualize_representation(representation, information_keys, tutorial_urls, show_all=False, optimize_sum_of_edges=False):
    """
    Draw the representation as a layered graph.

    Nodes: for every element (phase/step/ipo/element) one node per
    representation whose "type" is in `information_keys`, plus one "empty"
    node listing the tutorials in `tutorial_urls` that have no representation
    for that element. Representations not used by any tutorial in
    `tutorial_urls` are dropped unless `show_all` is True, and elements that
    end up with no representation node are skipped entirely.

    Layers: each element is one layer (x position), in order of first
    appearance; the nodes of an element are stacked along y.

    Edges: only between nodes of adjacent layers; the weight is the number of
    tutorials that use both nodes.

    `optimize_sum_of_edges` is currently unused (see the TODO below).
    Returns None; the figure is shown with `plt.show()`.
    """
    cooc = []
    count_labels = []
    for pidx, phase in enumerate(representation.keys()):
        for sidx, step in enumerate(representation[phase].keys()):
            for ipo in ["inputs", "methods", "outputs"]:
                for eidx, element in enumerate(representation[phase][step][ipo].keys()):
                    ### number of matching representations each tutorial has for this element
                    coverage = {}
                    for url in tutorial_urls:
                        coverage[url] = 0
                    cur_count_labels = []
                    for ridx, rep in enumerate(representation[phase][step][ipo][element]):
                        if rep["type"] not in information_keys:
                            continue
                        # for item in rep["items"]:
                        #     count_labels.append({
                        #         "idx": len(count_labels),
                        #         "phase": phase,
                        #         "step": step,
                        #         "ipo": ipo,
                        #         "element": element,
                        #         "type": rep["type"],
                        #         "urls": [item["url"]],
                        #         "content": item["content"]
                        #     })
                        ### tutorials (restricted to `tutorial_urls`) that use this representation
                        cur_urls = []
                        for item in rep["items"]:
                            if item['url'] in tutorial_urls:
                                cur_urls.append(item['url'])
                                coverage[item['url']] += 1
                        if len(cur_urls) == 0 and show_all is False:
                            continue
                        ### node id: phase idx - step idx - first letter of ipo - element idx - representation idx
                        cur_idx = f"{pidx}-{sidx}-{ipo[0]}-{eidx}-{ridx}"
                        cur_count_labels.append({
                            "idx": cur_idx,
                            "phase": phase,
                            "step": step,
                            "ipo": ipo,
                            "element": element,
                            "type": rep["type"],
                            "urls": cur_urls,
                            "content": rep["content"]
                        })
                    if len(cur_count_labels) == 0:
                        continue
                    empty_urls = []
                    for url in coverage.keys():
                        ### only reported, not treated as fatal
                        if coverage[url] > 1:
                            print("Error: multiple representations for the same url:", url)
                        if coverage[url] == 0:
                            empty_urls.append(url)
                    ### the "empty" node (suffix `e`) comes first within its layer
                    cur_idx = f"{pidx}-{sidx}-{ipo[0]}-{eidx}-e"
                    count_labels.append({
                        "idx": cur_idx,
                        "phase": phase,
                        "step": step,
                        "ipo": ipo,
                        "element": element,
                        "type": "empty",
                        "urls": empty_urls,
                        "content": "<empty>"
                    })
                    count_labels.extend(cur_count_labels)
            # ## todo: remove this
            # break
        # ### todo: remove this
        # break

    ### group the nodes per element and give every element a layer id in order of first appearance
    per_element_cooc = {}
    per_element_layer_ids = {}
    for l in count_labels:
        label = f"{l['phase']}-{l['step']}-{l['ipo']}-{l['element']}"
        if label not in per_element_cooc:
            per_element_cooc[label] = []
            per_element_layer_ids[label] = len(per_element_layer_ids)
        per_element_cooc[label].append(l)
        l["layer"] = per_element_layer_ids[label]

    ### cooc[i][j] = (number of shared urls, "+"-joined url suffixes); non-zero only when
    ### node j is in the layer directly after node i
    for l1 in count_labels:
        cur_cooc = []
        for l2 in count_labels:
            if l1["idx"] == l2["idx"]:
                cur_cooc.append((0, ""))
                continue
            ### continue if layers are not adjacent
            if l1["layer"] != l2["layer"] - 1:
                cur_cooc.append((0, ""))
                continue
            ### co-occurrences = # of urls in both l1 and l2
            cur = 0
            url_concat = ""
            for url in l1["urls"]:
                if url in l2["urls"]:
                    cur += 1
                    url_concat += f"{url.split('=')[-1]}+"
            cur_cooc.append((cur, url_concat))
        cooc.append(cur_cooc)


    ### TODO: optimize the sum of edges (i.e., technically we only preserve the edges between adjacent layers, so we need to choose the most optimal arrangement of layers to maximize the sum of edges). 
    ### one restriction is that we can only swap layers that belong to the same `phase`-`step`-`ipo` and global arrangement of layer blocks should be preserved.
    ### we can brute force all possible arrangements since the number of permutations within a phase-step-ipo group is at most 6 (24)
    ### NOTE(review): `optimize_sum_of_edges` is presumably the switch for this; it is not read anywhere yet.


    node_color_map = {
        "inputs": (0, 0.5, 0), # green
        "methods": (0, 0, 0.5), # blue
        "outputs": (0.5, 0, 0), # red
        "empty": (0.5, 0.5, 0.5), # gray
    }

    plt.figure(figsize=(150, 20))
    G = nx.Graph()
    for element_label in per_element_cooc.keys():
        ### x = layer of the element, y = position of the node within the layer
        x = per_element_layer_ids[element_label]
        for y, l in enumerate(per_element_cooc[element_label]):
            ### share of the selected tutorials attached to this node; drives node size and the % label
            ratio_of_urls = len(l["urls"]) / len(tutorial_urls)
            label = f"{l['ipo'][0]}-{l['element']}-{l['content']}"
            node_color = node_color_map[l['ipo']]
            if l['type'] == "empty":
                node_color = node_color_map["empty"]
            ### displayed label: the last three parts of the node id (ipo letter, element idx, representation idx / `e`)
            label_dis = "".join(l["idx"].split("-")[-3:])
            G.add_node(
                l["idx"], 
                label=label, 
                pos=(x, y), 
                label_dis=label_dis, 
                node_color=node_color, 
                # node_linewidth=ratio_of_urls, 
                node_size=ratio_of_urls*2000,
                percentage=ratio_of_urls*100
            )
            ### write the `percentage` at the top of the node
            plt.text(x, y + 0.1, f"{(ratio_of_urls*100):.1f}%", fontsize=10, ha='center', va='center')
        ### write the phase / step / ipo / element names below the layer
        ### NOTE(review): splitting on "-" assumes none of these names contains a hyphen - confirm
        lines = element_label.split("-")
        plt.text(x, -0.8, lines[0], fontsize=7, ha='center', va='center')
        plt.text(x, -0.6, lines[1], fontsize=7, ha='center', va='center')
        plt.text(x, -0.4, lines[2], fontsize=10, ha='center', va='center')
        plt.text(x, -0.2, lines[3], fontsize=7, ha='center', va='center')

    ### only the upper triangle is scanned: nodes are ordered by layer, so the
    ### adjacent-layer pairs recorded in `cooc` always have i < j
    max_weight = 0
    for i in range(len(cooc)):
        for j in range(i + 1, len(cooc)):
            if cooc[i][j][0] > 0:
                G.add_edge(
                    count_labels[i]["idx"],
                    count_labels[j]["idx"],
                    weight=cooc[i][j][0],
                    label_dis=f"{cooc[i][j][0]}",
                    label=f"{cooc[i][j][0]/len(tutorial_urls)*100:.1f}%",
                )
                max_weight = max(max_weight, cooc[i][j][0])

    ### adjust edge width based on weight (1 for a single shared tutorial, growing with the weight)
    for i, j, data in G.edges(data=True):
        data["label_dis"] = f"{data['label']}"
        data['width'] = (data['weight']-1) / max_weight * 10 + 1

    ### the attribute lists below follow the graph's node / edge iteration order
    nx.draw(G,
        pos=nx.get_node_attributes(G, 'pos'),
        with_labels=True,
        labels=nx.get_node_attributes(G, 'label_dis'),
        node_color=list(nx.get_node_attributes(G, 'node_color').values()),
        width=list(nx.get_edge_attributes(G, 'width').values()),
        node_size=list(nx.get_node_attributes(G, 'node_size').values()),
        # node_linewidth=list(nx.get_node_attributes(G, 'node_linewidth').values())
    )
    # nx.draw_networkx_edge_labels(G,
    #     edge_labels=nx.get_edge_attributes(G, 'label_dis'),
    #     pos=nx.get_node_attributes(G, 'pos'),
    # )

    plt.show()

In [8]:
### MAIN
### Visualization Experiments

def visualization_experiments(representation, tutorial_urls):
    """
    Scratch entry point for the visualization experiments.

    As written it computes the tutorial distance matrix and then draws the
    layered graph of the representation for the primary information keys.
    `distance_matrix`, `to_show`, `n_clusters` and `comparison_pairs` are only
    consumed by the commented-out experiments below.
    """
    weights_map = {
        ### info
        "primary": 1, ### weight of the edit distance over primary information keys
        "supp": 0, ### weight of the edit distance over supplementary information keys (0 = ignored)
        ### NOTE(review): `edit_distance` returns a raw count; it is not normalized by (len(t1) + len(t2))
    }

    distance_matrix = tutorial_distance_matrix(representation, tutorial_urls, weights_map)
    ### indices into `tutorial_urls`; the alternatives below are other hand-picked groups
    to_show = [31, 42, 46, 47, 63, 91, 92]
    # empties = [16, 17, 21, 23, 32, 48, 50, 51, 57, 93]
    # close_to_empty = [31, 42, 46, 47, 63, 91, 92]
    # to_show = [8, 13, 14, 41, 65, 69, 73, 77]
    # to_show = [0, 10, 15, 22, 29, 36, 52, 58, 74, 86, 95, 97]
    # to_show = [1, 4, 26, 44, 55, 56, 60, 61, 75, 85, 89, 96, 98]
    # to_show = [2, 6, 11, 12, 40, 53, 64, 67, 68, 79, 90, 94]
    # to_show = [7, 19, 33, 34, 35, 38, 39, 62, 66, 71, 72, 78, 80, 83, 84]
    # far_from_empty = [3, 9, 18, 20, 24, 27, 37, 43, 45, 49, 54, 70, 82, 87]

    n_clusters = 3
    comparison_pairs = []
    ### NOTE(review): the commented-out calls below use `visualize_tutorials`,
    ### `compare_subset_tutorial_representations` and `compare_tutorial_representations`;
    ### presumably these are older names of `visualize_tutorial_embeddings` /
    ### `visualize_tutorial_comparison` - verify before re-enabling.
    # comparison_pairs = visualize_tutorials(distance_matrix, tutorial_urls, to_show, -1, n_clusters, method="tsne")
    # comparison_pairs = visualize_tutorials(distance_matrix, tutorial_urls, to_show, -1, n_clusters, method="mds")
    # comparison_pairs = visualize_tutorials(distance_matrix, tutorial_urls, to_show, -1, n_clusters, method="umap")

    # close_to_empty = [31, 42, 46, 47, 63, 91, 92]
    # compare_subset_tutorial_representations(representation, [tutorial_urls[i] for i in to_show_1], "step", True)

    # subset_rep = extract_subset_tutorial_representation(representation, [tutorial_urls[i] for i in to_show])
    # print(json.dumps(subset_rep, indent=4))

    # for pair in comparison_pairs[:10]:
    #     t1 = tutorial_urls[pair[0]]
    #     t2 = tutorial_urls[pair[1]]
    #     kind = pair[2]
    #     dist = pair[3]
    #     pos1 = pair[4]
    #     pos2 = pair[5]
    #     print (f"Comparing {pair[0]} and {pair[1]} ({kind}) with distance {dist} at position {pos1} and {pos2}")
    #     compare_tutorial_representations(representation, t1, t2, "element", True)

    ### the only experiment currently enabled
    visualize_representation(representation, PRIMARY_INFORMATION_KEYS, tutorial_urls)
    # # visualize_representation(representation, [PRIMARY_INFORMATION_KEYS[0]], tutorial_urls)
    # visualize_representation(representation, [PRIMARY_INFORMATION_KEYS[1]], tutorial_urls)

    # for url in tutorial_urls:
    #     print(f"showing {url}")
    #     visualize_representation(representation, PRIMARY_INFORMATION_KEYS, [url], show_all=True)

    # for i in range(len(tutorial_urls)):
    #     for j in range(i + 1, len(tutorial_urls)):
    #         print(f"Comparing {tutorial_urls[i]} and {tutorial_urls[j]}")
    #         visualize_representation(representation, PRIMARY_INFORMATION_KEYS, [tutorial_urls[i], tutorial_urls[j]], show_all=True)
    #     break
    

### Fragmentation

In [9]:
from helpers.bert import bert_embedding

def print_md_table(data):
    """
    Print a markdown table with one row per focus.

    data is a dict of form fragmentation[focus][distance_type]: every entry
    must have numeric "fine" and "coarse" values (printed with two decimals)
    and may have a "breadth" value (printed as-is, blank when missing).
    """
    print("| Focus | Fine | Coarse | Breadth |")
    print("|-------|------|--------|--------|")
    for focus, row in data.items():
        fine = row['fine']
        coarse = row['coarse']
        breadth = row.get("breadth", "")
        ### close the last cell so data rows have the same shape as the header row
        print(f"| {focus} | {fine:.2f} | {coarse:.2f} | {breadth} |")

def calc_fragmentation_coarse(representation, tutorial_urls, ipo_weights):
    """
    Placeholder: not implemented yet. Ignores its arguments and returns None.
    """
    return None

def calc_fragmentation_pairwise(representation, tutorial_urls, ipo_weights, information_keys, distance_type="fine"):
    """
    Weighted average pairwise distance between tutorials, over all elements.

    For every element each tutorial in `tutorial_urls` gets one "option": the
    content it uses for that element among the representations whose "type"
    is in `information_keys` ("" when it has none; the last matching
    representation wins). Items from URLs outside `tutorial_urls` are ignored.

    distance_type:
        "coarse" - options are the representations' "content"; the distance
                   is 0 when two options are equal and 1 otherwise.
        "fine"   - options are the items' own "content", embedded with
                   `bert_embedding`; the value is the dot product of the two
                   embeddings.

    Every ordered pair of distinct tutorials contributes its distance times
    ipo_weights[ipo]; the result is the weighted sum divided by the total
    weight. Returns 0.0 when nothing contributes (no elements, fewer than two
    tutorials, or only zero weights).

    Raises ValueError for an unknown `distance_type`.
    """
    if distance_type not in ("fine", "coarse"):
        raise ValueError(f"Invalid distance type: {distance_type}")
    use_embeddings = distance_type == "fine"

    ### the same text (notably "") shows up for many tutorials and elements;
    ### embed each distinct text once. Assumes `bert_embedding` is deterministic.
    embedding_cache = {}

    def embed(text):
        if text not in embedding_cache:
            embedding_cache[text] = bert_embedding([text])[0]
        return embedding_cache[text]

    def calc_option_distance(option_1, option_2):
        if use_embeddings:
            # NOTE(review): a dot product grows with similarity, while the
            # coarse branch returns 0 for identical options. Confirm whether
            # "fine" should be a dissimilarity (e.g. 1 - cosine similarity).
            return np.dot(option_1, option_2)
        return 0 if option_1 == option_2 else 1

    selected_urls = set(tutorial_urls)
    total = 0
    fragmentation = 0
    for phase in representation:
        for step in representation[phase]:
            for ipo in representation[phase][step]:
                weight = ipo_weights[ipo]
                if weight == 0:
                    ### contributes nothing to either sum; skipping avoids needless embedding calls
                    continue
                for element in representation[phase][step][ipo]:
                    options = {tutorial_url: "" for tutorial_url in tutorial_urls}
                    for rep in representation[phase][step][ipo][element]:
                        if rep["type"] not in information_keys:
                            continue
                        for item in rep["items"]:
                            if item["url"] not in selected_urls:
                                continue
                            options[item["url"]] = item["content"] if use_embeddings else rep["content"]
                    if use_embeddings:
                        options = {url: embed(text) for url, text in options.items()}
                    ### calculate pairwise distances (both orders of every pair)
                    for ti in options:
                        for tj in options:
                            if ti == tj:
                                continue
                            dist = calc_option_distance(options[ti], options[tj])
                            total += weight
                            fragmentation += dist * weight
    if total == 0:
        return 0.0
    return fragmentation / total

def calc_breadth(representation, tutorial_urls, ipo_weights, information_keys):
    """
    Return (average, standard deviation) of the per-step breadth.

    A step's breadth starts at 1 and is multiplied, for every element of every
    ipo group whose weight is at least 1, by
        (number of representations of the element)
        + (number of tutorials left without content for the element).
    A tutorial has content for an element when some representation whose
    "type" is in `information_keys` lists it among its items.

    Breadths are keyed by step name only, so equally named steps in different
    phases share one entry.

    NOTE(review): the representation count is not filtered by
    `information_keys`, and every uncovered tutorial adds 1 on its own;
    confirm both are intended.
    """
    per_step_breadth = {}
    for steps in representation.values():
        for step, ipo_groups in steps.items():
            per_step_breadth.setdefault(step, 1)
            for ipo, elements in ipo_groups.items():
                if ipo_weights[ipo] < 1:
                    continue
                for reps in elements.values():
                    chosen = dict.fromkeys(tutorial_urls, "")
                    for rep in reps:
                        if rep["type"] not in information_keys:
                            continue
                        for item in rep["items"]:
                            chosen[item["url"]] = rep["content"]
                    uncovered = sum(1 for content in chosen.values() if content == "")
                    per_step_breadth[step] *= len(reps) + uncovered

    breadths = np.array(list(per_step_breadth.values()))
    return np.average(breadths), np.std(breadths)

In [10]:
### MAIN
def analyze_fragmentation(representation, tutorial_urls):
    """Print fragmentation and breadth statistics for a task representation.

    Three passes are made, in this order:

    1. For every non-empty combination of inputs/methods/outputs (weights of
       0 or 1), ``calc_fragmentation_pairwise`` is called with
       ``PRIMARY_INFORMATION_KEYS`` for the "fine" and "coarse" distance
       types. The result is printed as JSON.
    2. With only "methods" weighted, the same call is made for
       ``SUPP_INFORMATION_KEYS`` as a whole and for its first and second
       entries on their own. The result is printed as JSON.
    3. For every weight combination again, ``calc_breadth`` is called and its
       result is stored as a formatted "breadth" string next to the numbers
       from pass 1.

    Finally both tables are printed through ``print_md_table``.
    """

    def weight_combinations():
        # Yields (focus label, fresh weights dict) for each non-empty subset
        # of the three IPO categories, outputs varying fastest.
        for i_weights in (0, 1):
            for m_weights in (0, 1):
                for o_weights in (0, 1):
                    weights = {
                        "inputs": i_weights,
                        "methods": m_weights,
                        "outputs": o_weights,
                    }
                    label = "".join(
                        f"{name}-" for name, weight in weights.items() if weight == 1
                    )
                    if label:
                        yield label, weights

    distance_types = ("fine", "coarse")

    # Pass 1: fragmentation per IPO focus on the primary information keys.
    proc_fragmentation = {}
    for focus, ipo_weights in weight_combinations():
        bucket = proc_fragmentation.setdefault(focus, {})
        for distance_type in distance_types:
            bucket[distance_type] = calc_fragmentation_pairwise(
                representation,
                tutorial_urls,
                ipo_weights,
                PRIMARY_INFORMATION_KEYS,
                distance_type,
            )

    print(json.dumps(proc_fragmentation, indent=4))

    # Pass 2: fragmentation per supplementary information key set,
    # restricted to methods.
    info_fragmentation = {}
    key_sets = [
        SUPP_INFORMATION_KEYS,
        [SUPP_INFORMATION_KEYS[0]],
        [SUPP_INFORMATION_KEYS[1]],
    ]
    for information_keys in key_sets:
        ipo_weights = {"inputs": 0, "methods": 1, "outputs": 0}
        focus = "".join(key + "-" for key in information_keys)
        if not focus:
            continue
        bucket = info_fragmentation.setdefault(focus, {})
        for distance_type in distance_types:
            bucket[distance_type] = calc_fragmentation_pairwise(
                representation,
                tutorial_urls,
                ipo_weights,
                information_keys,
                distance_type,
            )

    print(json.dumps(info_fragmentation, indent=4))

    # Pass 3: breadth per IPO focus, added to the pass-1 table.
    for focus, ipo_weights in weight_combinations():
        bucket = proc_fragmentation.setdefault(focus, {})
        avg, std = calc_breadth(
            representation, tutorial_urls, ipo_weights, PRIMARY_INFORMATION_KEYS
        )
        bucket["breadth"] = f"avg={avg:.2f}, std={std:.2f}"

    print_md_table(proc_fragmentation)
    print()
    print_md_table(info_fragmentation)

# Main

In [11]:
# Task name for the muffins dataset; get_dataset() compares against this
# value to choose which loader to call.
MUFFIN_TASK = "Making Muffins"

# Note on the other tasks listed below: steps / videos per task.
"""
Make French Toast			10 steps / 272 videos
Make Irish Coffee			5 steps / 248 videos
Change a Tire				11 steps / 119 videos
Build (sim.) Floating Shelves		5 steps / 173 videos

"""
# Task names other than MUFFIN_TASK; the driver cell selects one of them by
# index (e.g. CROSS_TASK_TASKS[0]).
CROSS_TASK_TASKS = [
    "Change a Tire",
    "Build (sim.) Floating Shelves",
    "Make French Toast",
    "Make Irish Coffee",
]

def get_dataset(task):
    """Load the raw dataset for ``task``.

    The muffins task is loaded through ``get_dataset_muffins``; every other
    task name is handed to ``preprocess_cross_task``. Both are called with
    the task name and the version tag ``"raw"``.
    """
    loader = get_dataset_muffins if task == MUFFIN_TASK else preprocess_cross_task
    return loader(task, "raw")

def main(task, agg_approach="llm-based", agg_distance_threshold=0.3):
    """Run the representation pipeline for ``task``.

    The stages run in sequence: dataset loading, step taxonomy, IPO
    extraction, IPO taxonomy, per-IPO information extraction, and finally
    aggregation into the task representation.

    Args:
        task: task name passed to every pipeline stage.
        agg_approach: aggregation approach forwarded to
            ``extract_task_representation``; also appended to the version tag.
        agg_distance_threshold: forwarded to ``extract_task_representation``.

    Returns:
        A tuple ``(representation, tutorial_urls, url_to_nice_name)`` where
        ``tutorial_urls`` lists the distinct article URLs in first-seen order
        and ``url_to_nice_name`` maps each URL to ``"T<index>"`` (the index
        of the last article carrying that URL).
    """
    version = "stupid"

    dataset = get_dataset(task)
    taxonomy = construct_step_taxonomy(dataset, task, version)
    dataset = extract_ipos_stupid(dataset, taxonomy, task, version)
    ipo_taxonomy = taxonomize_ipos(dataset, taxonomy, task, version)
    dataset = extract_information_per_ipo(dataset, taxonomy, ipo_taxonomy, task, version)
    representation = extract_task_representation(
        dataset,
        taxonomy,
        ipo_taxonomy,
        task,
        agg_approach,
        agg_distance_threshold,
        version + "-" + agg_approach,
    )

    print("pre-process done!")

    ## todo: remove (since don't want to rerun the entire representation building again)
    # Order each element's entries by how many items support them, largest first.
    for steps in representation.values():
        for ipo_groups in steps.values():
            for elements in ipo_groups.values():
                for element in elements:
                    elements[element] = sorted(
                        elements[element],
                        key=lambda entry: len(entry["items"]),
                        reverse=True,
                    )
    ### end-todo

    url_to_content = {}
    url_to_nice_name = {}
    for idx, article in enumerate(dataset):
        url = article["url"]
        url_to_content[url] = article["content"]
        url_to_nice_name[url] = f"T{idx}"

    tutorial_urls = list(url_to_content)
    return representation, tutorial_urls, url_to_nice_name

In [14]:
import random

# Task analysed in this run; uncomment one of the alternatives below to switch.
task = MUFFIN_TASK
# task = CROSS_TASK_TASKS[0]
# task = CROSS_TASK_TASKS[1]

# Build the representation once; the cells below reuse these three globals.
# The commented call is the clustering-based alternative to the LLM-based
# aggregation.
representation, tutorial_urls, url_to_nice_name = main(task, agg_approach="llm-based")
# representation, tutorial_urls, url_to_nice_name = main(task, agg_approach="clustering", agg_distance_threshold=0.3)

# Optional analyses, currently switched off.
# analyze_distribution(representation, tutorial_urls, url_to_nice_name)
# analyze_fragmentation(representation, tutorial_urls)
# visualization_experiments(representation, tutorial_urls)

pre-process done!


In [27]:
### sample subgoal-ipo pairs for analysis
# Writes two CSV files under RESULTS_PATH with randomly sampled parts of
# `representation` for manual (qualitative) inspection:
#   qual_subgoal_list_<task>.csv - up to 3 random steps, one row per
#       (step, element, ipo / information type) with its contents.
#   qual_element_list_<task>.csv - up to 3 random elements per ipo, one row
#       per entry with the content of every supporting item.
import csv

# subgoal_list: step -> ipo -> information type -> element -> ["<content> (<#items>)", ...]
# element_list: ipo -> [[phase, step, element, entries], ...]
subgoal_list = {}
element_list = {}
for phase in representation:
    for step in representation[phase]:
        # setdefault instead of a plain assignment: a step name that appears
        # under more than one phase keeps what was collected earlier.
        step_bucket = subgoal_list.setdefault(step, {})
        for ipo in representation[phase][step]:
            ipo_bucket = step_bucket.setdefault(ipo, {})
            ipo_elements = element_list.setdefault(ipo, [])
            for element in representation[phase][step][ipo]:
                entries = representation[phase][step][ipo][element]
                ipo_elements.append([phase, step, element, entries])

                for entry in entries:
                    per_type = ipo_bucket.setdefault(entry["type"], {})
                    per_type.setdefault(element, []).append(
                        entry["content"] + f" ({len(entry['items'])})"
                    )

### sample 3 steps
# Cap the sample size so a representation with fewer than 3 steps does not
# make random.sample raise ValueError.
all_steps = list(subgoal_list.keys())
selected_steps = random.sample(all_steps, min(3, len(all_steps)))
csv_lines = []
max_info_len = 0
for step in selected_steps:
    for ipo in subgoal_list[step]:
        for info_type in subgoal_list[step][ipo]:
            for element in subgoal_list[step][ipo][info_type]:
                csv_lines.append([step, element, f"{ipo} / {info_type}"])
                csv_lines[-1].extend(subgoal_list[step][ipo][info_type][element])
                max_info_len = max(max_info_len, len(csv_lines[-1]))

csv_output_path = RESULTS_PATH + f"qual_subgoal_list_{task}.csv"
# newline="" is what the csv module asks for; csv.writer takes care of
# escaping quotes embedded in the content, which manual '"' + x + '"'
# wrapping does not.
with open(csv_output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
    col_titles = ["Step", "Class", "Ipo / Info Type"]
    while len(col_titles) < max_info_len:
        col_titles.append(f"Item {len(col_titles) - 3}")
    writer.writerow(col_titles)
    for line in csv_lines:
        # Pad short rows so every row has the same number of columns.
        if len(line) < max_info_len:
            line += [""] * (max_info_len - len(line))
        writer.writerow(line)


csv_lines = []
output = []  # not used in this cell; kept in case a later cell reads it
# Start the width over for the second file; otherwise it would inherit the
# column count of the first file and get padded with empty columns.
max_info_len = 0
for ipo in element_list:
    candidates = element_list[ipo]
    selected = random.sample(candidates, min(3, len(candidates)))
    for selected_element in selected:
        # selected_element is [phase, step, element, entries]
        prefix = [selected_element[1], selected_element[2]]
        for entry in selected_element[3]:
            csv_lines.append(prefix + [f"{ipo} / {entry['type']}", entry['content']])
            for item in entry['items']:
                csv_lines[-1].append(item['content'])
            max_info_len = max(max_info_len, len(csv_lines[-1]))

csv_output_path = RESULTS_PATH + f"qual_element_list_{task}.csv"
with open(csv_output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
    col_titles = ["Step", "Class", "Ipo / Info Type", "Item"]
    while len(col_titles) < max_info_len:
        col_titles.append(f"Info {len(col_titles) - 4}")
    writer.writerow(col_titles)
    for line in csv_lines:
        if len(line) < max_info_len:
            line += [""] * (max_info_len - len(line))
        writer.writerow(line)
