# YouTube Comments Analysis

## Imports

In [1]:
from tqdm import tqdm
from typing import List
import numpy as np


In [2]:
# My own modules
from models.text_models import TextModelManager
from models.math_funcs import cos_sim
from models.llm_api import LLM
from api.youtube_api import YoutubeAPI
from analysis.classification_analysis import ClassificationAnalyzer
from analysis.statements_analysis import StatementsAnalyzer
from analysis.clustering import ClusteringAnalyzer

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Logging setup: INFO level, one line per record, timestamp with milliseconds
import logging
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',  # asctime is rendered down to the second only ...
    format='%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s',  # ... and the milliseconds are appended here
    level=logging.INFO,
)

## Load Models

In [4]:
# Initialize classification models
text_model_manager = TextModelManager()

2024-07-30 09:45:33.491 - models.text_models - INFO - Instantiating TextModelManager.


## Set up LLM

In [5]:
llm = LLM()

2024-07-30 09:45:33.499 - models.llm_api - INFO - Instantiating LLM.


## YouTube API

In [6]:
youtube = YoutubeAPI()

2024-07-30 09:45:33.532 - api.youtube_api - INFO - Instantiating YoutubeAPI.
2024-07-30 09:45:33.535 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [7]:
# IDs of YouTube videos used for testing.
# NOTE(review): the name suffixes presumably give the rough number of comments on each video — confirm.
yt_video_test_id_tomato = "9WQnap-UAiQ"
yt_video_test_id_10k_comments = "2-XxbdR3Nik"
yt_video_test_id_4500_comments = "-ih0B9yn32Q"
yt_video_test_id_4k_comments_beard_meets_schnitzel = "qPd9qPUR2_U"
yt_video_test_id_2000_comments = "rX2tK-qSVpk"
yt_video_test_id_700_comments = "VCXqELB3UPg"
yt_video_test_id_300_comments = "yQqJafC7xv0"
yt_video_test_id_25_comments = "kiF0wgM8zGc"
yt_video_test_id_50_comments = "LHQMIuzjl48"

# The video analysed by the rest of the notebook; change the right-hand side to analyse another one.
yt_video_id = yt_video_test_id_50_comments
# Select the video on the API wrapper; the argument-less calls below (get_title, get_creator_name) rely on it.
youtube.set_current_video(yt_video_id)

In [8]:
youtube.get_title()

'Perfect OFFICE Custom Keyboard!'

In [9]:
youtube.get_creator_name()

'Lewis Toh'

In [10]:
# Get comments (for testing)
comments = youtube.get_comments(yt_video_id)

2024-07-30 09:45:33.749 - api.youtube_api - INFO - Starting raw comment retrieval.
2024-07-30 09:45:33.867 - api.youtube_api - INFO - Received 31 top-level comments.
2024-07-30 09:45:33.868 - api.youtube_api - INFO - Finished raw comment retrieval of 31 top-level comments.


Starting comments retrieval for video ID LHQMIuzjl48 ('Perfect OFFICE Custom Keyboard!')


Getting replies for comments with missing replies ...: 100%|██████████| 31/31 [00:00<00:00, 183908.66it/s]
Converting comments to our own class ...: 100%|██████████| 31/31 [00:00<00:00, 9055.82it/s]
Deduplicating comments ...: 100%|██████████| 31/31 [00:00<00:00, 24823.11it/s]


## Clustering

Here, our goal is to identify trends and common themes in the comments.

In [11]:
clustering_analyzer = ClusteringAnalyzer(video_id=yt_video_id, comments=comments)

2024-07-30 09:45:33.888 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [12]:
clus = clustering_analyzer.cluster()

  return self.fget.__get__(instance, owner)()
Calculating embeddings ...: 100%|██████████| 51/51 [00:05<00:00,  8.96it/s]
Clustering ...: 100%|██████████| 36/36 [00:11<00:00,  3.03it/s]
2024-07-30 09:45:51.627 - analysis.clustering - INFO - Best clustering out of 29 is with 3 clusters, with a mean Silhouette coefficient of 0.06048740819096565 (function was <function cluster_spectral_clustering at 0x7f6b435bba30>).
Find cluster topics ...: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]


In [13]:
clus.topics

{0: 'Keyboard and Fish',
 1: 'Custom Keyboard Experiences and Recommendations',
 2: 'Comparing Custom Keyboard Options'}

In [14]:
# NOTE(review): deliberate hard stop — presumably so that "Run All" halts before the unfinished cluster-fusion cells below; confirm before removing.
assert False

AssertionError: 

In [None]:
# TODO: Find out what the "keyboard and fish" cluster is about
# TODO: Then, fuse the clusters by topic

### Fuse clusters based on topic

In [None]:
# Groups of (label, topic) tuples. Seeded with one empty group so that the grouping loop
# below has a group to put the very first topic into.
cluster_groups = [[]]

In [None]:
# Greedily assign every cluster topic to the first group it is similar enough to.
# NOTE(review): `cluster_topics` is not assigned in any visible cell; it presumably
# refers to the label -> topic mapping shown by `clus.topics` — confirm.

# A topic joins a group when its mean cosine similarity to the group's topics exceeds this value
TOPIC_SIMILARITY_THRESHOLD = 0.55

for lab, topic in cluster_topics.items():
    # Store this cluster label and topic as a tuple
    tup = (lab, topic)

    # Embedding of this topic: computed at most once, and only if a comparison is needed
    # (previously it was recomputed for every member of every group)
    topic_emb = None

    # Try to find a spot for this topic in one of the groups
    for group in cluster_groups:

        # If the group is empty, add the cluster (this only happens at the beginning)
        if len(group) == 0:
            group.append(tup)
            break

        if topic_emb is None:
            topic_emb = text_model_manager.embed(topic)

        # Compare this cluster's embedding with every topic already in the group
        mean_sim = np.mean([cos_sim(text_model_manager.embed(top), topic_emb) for (_, top) in group])
        if mean_sim > TOPIC_SIMILARITY_THRESHOLD:
            group.append(tup)
            break
    else:
        # The loop ended without `break`: no group was similar enough, so start a new one
        cluster_groups.append([tup])

In [None]:
def build_prompt_fuse_topics(video_info, topics: List[str]):
    """Build the LLM prompt that asks for one topic summarising several comment topics.

    Args:
        video_info: Passed unchanged to ``get_title`` to obtain the video title.
        topics: Topic descriptions to fuse. Each one becomes a ``- topic`` bullet
            line; the topics are only iterated, so any iterable of strings works.

    Returns:
        The prompt as one newline-joined string. It instructs the model to answer
        with JSON of the form ``{"topic": ...}``.
    """
    # NOTE(review): `get_title` is neither defined nor imported in the visible notebook — confirm where it comes from.
    title = get_title(video_info)

    lines = [
        "You are a professional YouTube comment analyst. Given a video title and some comment topics, find a new description of the topic that reflects the core concept of the listed topics.",
        f"Video title: {title}",
        "\nComment topics:",
    ]
    lines.extend(f"- {t}" for t in topics)

    # Adjacent literals are concatenated, so every sentence must carry its own trailing space
    # (the space after the "Choosing a keyboard" sentence was missing before).
    lines.append(
        "\nExtract a single, coherent topic that describes all these topics. The topic you find can also be about the style or mood of the comments. "
        "A topic should be a simple notion, e.g., \"Jokes\" or \"Choosing a keyboard\". "
        "There is no need to repeat the video title in your assessment. The topic shouldn't be, e.g., \"Reactions to Video\" or anything generic of that sort. Provide your assessment in the form of JSON such as {\"topic\": your_topic_goes_here}."
    )

    return "\n".join(lines)

In [None]:
# Fuse groups we found by finding a new topic
# NOTE(review): `info` and `post_process_single_entry_json` are not defined in any visible cell — confirm where they come from.
fused_groups = []
for group in tqdm(cluster_groups, desc="Fusing groups ..."):
    # The seed group stays empty when there were no topics at all; zip(*[]) yields
    # nothing and the two-name unpacking below would raise ValueError.
    if not group:
        continue

    labs, topics = zip(*group)

    if len(topics) > 1:
        # Several topics ended up in one group: ask the LLM for a single topic covering them
        prompt = build_prompt_fuse_topics(info, topics)
        res_raw = llm.chat(prompt)
        topic = post_process_single_entry_json(res_raw)
    else:
        # A group of one keeps its original topic
        topic = topics[0]

    fused_groups.append((labs, topic))

In [None]:
# Rewrite the cluster labels so that every fused group shares one label
for label_group, topic in fused_groups:
    # Only groups with several labels were actually fused; the others keep their label
    if len(label_group) > 1:
        # The first label of the group becomes the label of the whole group
        lab_first, *labs_rest = label_group
        for lab in labs_rest:
            labels[np.where(labels == lab)] = lab_first

        # Store the fused topic under the surviving label
        cluster_topics[lab_first] = topic

In [None]:
labs_unique = list(np.unique(labels))

In [None]:
plot(emb_matrix, labs_unique, labels)

In [None]:
describe_clusters()

In [None]:
# NOTE(review): deliberate hard stop — presumably so that "Run All" halts before the statement-extraction and classification sections below; confirm before removing.
assert False

## LLM Statement Extraction

In [None]:
statements_analyzer = StatementsAnalyzer(
    video_id=yt_video_id,
    comments=comments
)

In [None]:
statements_analyzer.run_analysis(
    limit_statements=2  # For testing, limit number of statements
)

## Classification Analysis

In [None]:
classification_analyzer = ClassificationAnalyzer(comments)
print(classification_analyzer.run_all_analyses())