In [1]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt

from helpers import TASK_DESCRIPTIONS
from preprocess import setup_ds
from helpers.clip import clip_similar_per_text

# Work on the first task listed in TASK_DESCRIPTIONS.
task_id = list(TASK_DESCRIPTIONS)[0]

# Dataset for that task; the cells below read `ds.videos` and the
# `ds.alignments*` collections from it.
ds = setup_ds(task_id)

def _candidate_frame_paths(video, title):
    """Collect the frame paths lying inside every subgoal of ``video`` named ``title``.

    For each entry of ``video.common_subgoals`` whose ``"title"`` matches, the
    integer timestamps in ``[start, finish)`` are looked up (as strings) in
    ``video.frames``; timestamps without a frame are skipped.
    """
    paths = []
    for subgoal in video.common_subgoals:
        if subgoal["title"] != title:
            continue
        start = int(subgoal["start"])
        finish = int(subgoal["finish"])
        for timestamp in range(start, finish):
            key = str(timestamp)
            if key in video.frames:
                paths.append(video.frames[key]["path"])
    return paths


def checkout_subgoal_summaries(ds):
    """Show, for every subgoal summary, its current frame next to CLIP's best matches.

    For each video in ``ds.videos`` and each of its ``subgoal_summaries`` this
    prints the title/outcome, gathers the frames that fall inside the matching
    subgoal's time range, asks ``clip_similar_per_text`` for the ``top_k=3``
    frames most similar to the outcome text, and plots the summary's first
    frame (``frame_paths[0]``) alongside those candidates. When no candidate
    frames exist only the current frame is plotted.

    Args:
        ds: dataset object exposing ``videos``; each video must provide
            ``video_id``, ``subgoal_summaries``, ``common_subgoals`` and
            ``frames`` as used below.

    Returns:
        None. Output is printed text and matplotlib figures.
    """
    for video in ds.videos:
        print("VIDEO: ", video.video_id)
        for subgoal_summary in video.subgoal_summaries:
            print(subgoal_summary["title"], "--", subgoal_summary["outcome"])
            potential_frame_paths = _candidate_frame_paths(video, subgoal_summary["title"])

            cur_frame = subgoal_summary["frame_paths"][0]
            print("Current Frame: ", cur_frame)

            best_frames = []
            if len(potential_frame_paths) > 0:
                best_frames = clip_similar_per_text(
                    [subgoal_summary["outcome"]],
                    potential_frame_paths,
                    top_k=3,
                )[0]
                print(best_frames)

            # squeeze=False keeps `axs` a 2-D array even for a single column, so
            # indexing works whether or not any best frames were found (with the
            # default squeeze a 1x1 grid comes back as a bare Axes object).
            _, axs = plt.subplots(1, 1 + len(best_frames), squeeze=False)
            row = axs[0]

            row[0].imshow(Image.open(cur_frame))
            row[0].set_title("Current Frame")
            for idx, best_frame in enumerate(best_frames):
                print(f"Best Frame {idx}: ", best_frame)
                row[idx + 1].imshow(Image.open(best_frame))
                row[idx + 1].set_title(f"Best Frame {idx}")
            plt.show()

        print()
        print()

[REDACTED: an API secret key was printed in this cell's output. Revoke and rotate the key, clear the output before sharing, and load credentials from an environment variable instead of printing them.]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def checkout_alignments(ds):
    """Print every record of ``ds.alignments`` grouped by subgoal title.

    Each record is a dict with ``'new_video'``, ``'prev_video'``,
    ``'alignments'`` and ``'title'``. Records keep their original position in
    ``ds.alignments`` as the printed index, and subgoals appear in the order
    their title is first seen. For every difference inside a record the
    description and the new/previous quotes (joined with ``"; "``) are printed.

    Returns:
        None. The report is written to stdout.
    """
    grouped = {}
    for position, record in enumerate(ds.alignments):
        entry = (
            position,
            record['new_video'],
            record['prev_video'],
            record['alignments'],
        )
        grouped.setdefault(record['title'], []).append(entry)

    for title in grouped:
        print("## SUBGOAL: ", title)
        for position, new_video, prev_video, diffs in grouped[title]:
            print(f"{position}. **(new) {new_video} -- (prev) {prev_video}**")
            for diff in diffs:
                new_quotes = "; ".join(diff['new_quotes'])
                prev_quotes = "; ".join(diff['prev_quotes'])
                print("\t- **Description**:", diff['description'])
                print("\t\t- **Quotes (new)**:", new_quotes)
                print("\t\t- **Quotes (prev)**:", prev_quotes)
                print()
        print()

In [6]:
import json
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from helpers.embed_and_draw import generate_embeddings, draw_embeddings

def k_means_clustering(
        embeddings,
        n_clusters=3,
        n_init=3,
        random_state=10,
):
    """Cluster ``embeddings`` with k-means and score the resulting partition.

    Args:
        embeddings: sample-by-feature data accepted by ``KMeans.fit``.
        n_clusters: number of clusters to fit.
        n_init: number of k-means restarts (forwarded to ``KMeans``).
        random_state: seed forwarded to ``KMeans`` so repeated runs give the
            same labels.

    Returns:
        A tuple ``(labels, inertia, silhouette)`` where ``labels`` is
        ``kmeans.labels_``, ``inertia`` is ``kmeans.inertia_`` and
        ``silhouette`` is the Euclidean silhouette score of the labelling.

    Raises:
        ValueError: propagated from ``silhouette_score`` when the number of
            distinct labels is not between 2 and ``n_samples - 1``.
    """
    kmeans = KMeans(
        n_clusters=n_clusters,
        n_init=n_init,
        random_state=random_state,
    ).fit(embeddings)

    s_score = silhouette_score(embeddings, kmeans.labels_, metric='euclidean')

    return kmeans.labels_, kmeans.inertia_, s_score

def get_representation(
    items=None,
):
    """Split alignment items into the parallel lists used for embedding/plotting.

    Args:
        items: iterable of dicts, each with ``"description"``,
            ``"new_quotes"`` and ``"prev_video"`` keys. ``None`` (the default)
            is treated as an empty collection.

    Returns:
        dict with four equally long lists:
            ``"embeddings"``: the items' description values. Despite the name
                these are the raw values to be embedded, not vectors.
            ``"contents"``: the items' ``new_quotes``.
            ``"labels"``: the items' ``prev_video`` ids.
            ``"colors"``: one random RGB triple per item; items sharing a
                label share the same colour object.
    """
    # `None` instead of a mutable `[]` default, which would be shared across calls.
    if items is None:
        items = []

    embeddings = []
    contents = []
    labels = []
    for item in items:
        embeddings.append(item["description"])
        contents.append(item["new_quotes"])
        labels.append(item["prev_video"])

    # Draw a colour only the first time a label is seen so that every item of
    # the same previous video is painted identically.
    color_per_label = {}
    colors = []
    for label in labels:
        if label not in color_per_label:
            color_per_label[label] = np.random.rand(3,)
        colors.append(color_per_label[label])

    return {
        "embeddings": embeddings,
        "contents": contents,
        "labels": labels,
        "colors": colors,
    }


def checkout_k_means_clusters(video_mapping, alignments, max_clusters=6, show_summaries=False):
    """Cluster the differences of each (subgoal, new video) pair and print a report.

    The alignment records are regrouped by subgoal title and new-video id. For
    every group the difference descriptions are embedded with
    ``generate_embeddings(..., method="bert")`` and k-means is run for
    ``k = 2 .. max_clusters``; the labelling with the highest silhouette score
    is kept. Groups too small to be clustered (k must stay below the number of
    embeddings) are reported as one single cluster instead of being dropped
    from the output.

    Args:
        video_mapping: mapping from video id to video object. Only consulted
            when ``show_summaries`` is true, in which case each object must
            provide ``get_subgoal_summary_contents(title)`` returning dicts
            with a ``"text"`` key.
        alignments: iterable of dicts with ``'new_video'``, ``'prev_video'``,
            ``'title'`` and ``'alignments'``; every entry of ``'alignments'``
            is a dict providing ``'title'``, ``'description'``,
            ``'different_aspects'``, ``'new_quotes'`` and ``'prev_quotes'``.
        max_clusters: largest number of clusters to try (inclusive).
        show_summaries: when true, also print the subgoal summary text of the
            new video and of each item's previous video.

    Returns:
        None. The report is written to stdout.
    """
    # title -> new video id -> differences, each tagged with the previous
    # video it was compared against.
    per_subgoal_and_video = {}
    for alignment in alignments:
        new_video = alignment['new_video']
        prev_video = alignment['prev_video']
        diffs = alignment['alignments']
        title = alignment['title']
        bucket = per_subgoal_and_video.setdefault(title, {}).setdefault(new_video, [])
        for diff in diffs:
            bucket.append({
                "prev_video": prev_video,
                **diff,
            })

    for title, per_video in per_subgoal_and_video.items():
        for new_video, video_diffs in per_video.items():
            # The descriptions are passed for both of the first two arguments,
            # exactly as before; the second return value is not needed here.
            representation = get_representation(video_diffs)
            embeddings, _ = generate_embeddings(
                representation["embeddings"],
                representation["embeddings"],
                method="bert",
                truncate=0,
            )
            print(f"## SUBGOAL: {title} -- VIDEO: {new_video}")

            # Try every admissible k and keep the labelling with the best
            # silhouette score. The two leading zeros are placeholders,
            # presumably so that list position equals the cluster count.
            labels = []
            inertia = [0, 0]
            s_scores = [0, 0]
            best_score = None
            for n_clusters in range(2, max_clusters + 1):
                # The silhouette score needs fewer clusters than samples.
                if n_clusters >= len(embeddings):
                    break
                candidate_labels, candidate_inertia, candidate_score = k_means_clustering(
                    embeddings, n_clusters
                )
                inertia.append(candidate_inertia)
                s_scores.append(candidate_score)
                if best_score is None or candidate_score > best_score:
                    labels = candidate_labels
                    best_score = candidate_score

            if best_score is None:
                # No k was admissible: keep the items visible in the report
                # by placing them all in one cluster.
                labels = [0] * len(video_diffs)

            mapping = {}
            for idx, label in enumerate(labels):
                mapping.setdefault(label, []).append(video_diffs[idx])

            print("**# Alignments**:", len(video_diffs))
            print("**# K-Means Clusters**:", len(mapping))
            print(f"**Inertia**: `{', '.join(str(i) for i in inertia)}`")
            print(f"**Silhouette Score**: `{', '.join(str(i) for i in s_scores)}`")

            if show_summaries:
                new_video_subgoal_summary = video_mapping[new_video].get_subgoal_summary_contents(title)
                new_video_subgoal_summary = "\n".join([text["text"] for text in new_video_subgoal_summary])
                print(f"\n**Subgoal Summary**\n\n{new_video_subgoal_summary}")

            for label, items in mapping.items():
                print(f"#### **Cluster {label}**")

                for item in items:
                    if show_summaries:
                        prev_video_subgoal_summary = video_mapping[item["prev_video"]].get_subgoal_summary_contents(title)
                        prev_video_subgoal_summary = "\n".join([text["text"] for text in prev_video_subgoal_summary])
                        print(f"**Prev Video Summary**\n\n{prev_video_subgoal_summary}")
                    print(f"**Title**: {item['title']}\n")
                    print(f"- **Description**: {item['description']}")
                    print(f"- **Aspects**: {', '.join(item['different_aspects'])}")
                    print(f"- **New Quotes**: {', '.join(item['new_quotes'])}")
                    print(f"- **Prev Quotes**: {', '.join(item['prev_quotes'])}")
                    print()
            print()

# Index the dataset's videos by id so the report can look them up from the
# video ids stored in the alignment records.
video_mapping = {video.video_id: video for video in ds.videos}

print("# OUR APPROACH")
checkout_k_means_clusters(video_mapping, ds.alignments)

# OUR APPROACH
## SUBGOAL: Prepare Ingredients -- VIDEO: 75p4UHRIMcU
**# Alignments**: 9
**# K-Means Clusters**: 3
**Inertia**: `0, 0, 1.0669958591461182, 0.028345540165901184, 0.01669555902481079, 0.007820307277143002, 0.0051097869873046875`
**Silhouette Score**: `0, 0, 0.60617775, 0.9006552, 0.667927, 0.5032475, 0.29950002`
#### **Cluster 2**
**Title**: Include Egg Yolks

- **Description**: The current video includes the use of egg yolks in the preparation of ingredients, which is not mentioned in the previous video. This is new because the previous video focused on bacon and cheese without mentioning eggs.
- **Aspects**: materials, instructions
- **New Quotes**: Carbonara is typically made with egg yolks, which is irrelevant, since that's not what we're making.
- **Prev Quotes**: 

**Title**: Add Pepper

- **Description**: The current video includes adding pepper to the egg mixture, which is not present in the previous video. This is a new step in the preparation process.
- **Aspect

In [7]:
# Same clustering report as for `ds.alignments`, run on `ds.alignments_baseline_1`.
print("# BASELINE 1 (PER SUBGOAL)")
checkout_k_means_clusters(video_mapping, ds.alignments_baseline_1)

# BASELINE 1 (PER SUBGOAL)
## SUBGOAL: Prepare Ingredients -- VIDEO: 75p4UHRIMcU
**# Alignments**: 8
**# K-Means Clusters**: 3
**Inertia**: `0, 0, 1.386399507522583, 0.2182372659444809, 0.04262109473347664, 0.01639719307422638, 0.007727698422968388`
**Silhouette Score**: `0, 0, 0.50015146, 0.7555429, 0.6819223, 0.5424305, 0.3319847`
#### **Cluster 1**
**Title**: Egg yolk preparation

- **Description**: The current video introduces a humorous and unconventional method for separating egg yolks, which involves holding the yolk with your thumb or using a back-and-forth motion. This is new because the previous video does not mention egg yolk preparation at all, focusing instead on the choice of meat for carbonara.
- **Aspects**: instructions, tips
- **New Quotes**: Carbonara is typically made with egg yolks, which is irrelevant, since that's not what we're making. Don't accidentally use egg yolkles, because banjo music really ruins this dish. We'll just do the classic hold the yolk with you

In [8]:
# Same clustering report as for `ds.alignments`, run on `ds.alignments_baseline_2`.
print("# BASELINE 2 (PER Video)")
checkout_k_means_clusters(video_mapping, ds.alignments_baseline_2)

# BASELINE 2 (PER Video)
## SUBGOAL: $meta$ -- VIDEO: 75p4UHRIMcU
**# Alignments**: 17
**# K-Means Clusters**: 3
**Inertia**: `0, 0, 2.232497453689575, 0.4840500056743622, 0.2640607953071594, 0.15823876857757568, 0.09931569546461105`
**Silhouette Score**: `0, 0, 0.54968715, 0.7229291, 0.64751315, 0.53495616, 0.5016527`
#### **Cluster 0**
**Title**: Humorous Approach

- **Description**: The current video takes a humorous and satirical approach to making carbonara, which is not present in the previous video. It includes jokes about using bacon instead of guanciale, and humorous commentary on the process, such as the idea of using a drill to grate cheese or the notion of breaking spaghetti being a crime in Italy. This comedic style is a new element compared to the straightforward instructional style of the previous video.
- **Aspects**: context, rationale, tips
- **New Quotes**: Fai schifo cucinare, ja totalmente schifo!, You wouldn't make a hamburger with tiger meat, would you?, If you w