# YouTube Comments Analysis

## Imports

In [1]:
import os
from collections import defaultdict
from tqdm import tqdm
from typing import List
import numpy as np
import pandas as pd
from glob import glob
import json
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, HDBSCAN, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples
import umap.umap_ as umap
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# My own modules
from util.string_utils import split_text_if_long
from models.text_models import TextModelManager
from models.computations import ClassificationType
from models.math_funcs import cos_sim
from models.llm_api import LLM
from api.youtube_api import YoutubeAPI
from analysis.classification_analysis import ClassificationAnalyzer
from analysis.statements_analysis import StatementsAnalyzer

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Logging: configure the root logger once, at INFO level, so that log messages
# emitted while the following cells run are shown with a timestamp, the logger
# name and the level.
import logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s',  # Define the log format with milliseconds
    datefmt='%Y-%m-%d %H:%M:%S'  # Define the date and time format without milliseconds
)

## Load Models

In [4]:
# Initialize classification models
text_model_manager = TextModelManager()

2024-07-27 23:53:21.988 - models.text_models - INFO - Instantiating TextModelManager.


## Set up LLM

In [5]:
llm = LLM()

2024-07-27 23:53:21.999 - models.llm_api - INFO - Instantiating LLM.


## YouTube API

In [6]:
youtube = YoutubeAPI()

2024-07-27 23:53:22.038 - api.youtube_api - INFO - Instantiating YoutubeAPI.
2024-07-27 23:53:22.042 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [7]:
# IDs of videos used for testing. The variable names appear to record the
# approximate number of comments (or a nickname) of each video -- TODO confirm.
yt_video_test_id_tomato = "9WQnap-UAiQ"
yt_video_test_id_10k_comments = "2-XxbdR3Nik"
yt_video_test_id_4500_comments = "-ih0B9yn32Q"
yt_video_test_id_4k_comments_beard_meets_schnitzel = "qPd9qPUR2_U"
yt_video_test_id_2000_comments = "rX2tK-qSVpk"
yt_video_test_id_700_comments = "VCXqELB3UPg"
yt_video_test_id_300_comments = "yQqJafC7xv0"
yt_video_test_id_25_comments = "kiF0wgM8zGc"
yt_video_test_id_50_comments = "LHQMIuzjl48"

# Select the video analysed by the rest of the notebook and register it with
# the API wrapper.
yt_video_id = yt_video_test_id_50_comments
youtube.set_current_video(yt_video_id)

In [8]:
youtube.get_title()

'Perfect OFFICE Custom Keyboard!'

In [9]:
youtube.get_creator_name()

'Lewis Toh'

In [10]:
# Get comments (for testing)
comments = youtube.get_comments(yt_video_id)

2024-07-27 23:53:22.271 - api.youtube_api - INFO - Starting raw comment retrieval.
2024-07-27 23:53:22.411 - api.youtube_api - INFO - Received 31 top-level comments.
2024-07-27 23:53:22.412 - api.youtube_api - INFO - Finished raw comment retrieval of 31 top-level comments.


Starting comments retrieval for video ID LHQMIuzjl48 ('Perfect OFFICE Custom Keyboard!')


Getting replies for comments with missing replies ...: 100%|██████████| 31/31 [00:00<00:00, 137155.51it/s]
Converting comments to our own class ...: 100%|██████████| 31/31 [00:00<00:00, 4851.26it/s]
Deduplicating comments ...: 100%|██████████| 31/31 [00:00<00:00, 9276.79it/s]


## LLM Statement Extraction

In [11]:
statements_analyzer = StatementsAnalyzer(
    video_id=yt_video_id,
    comments=comments
)

2024-07-27 23:53:22.445 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [12]:
statements_analyzer.run_analysis(
    limit_statements=2  # For testing, limit number of statements
)

Grouping by sentiment ...: 100%|██████████| 31/31 [00:01<00:00, 20.36it/s]
Measuring statement agreement with comments ...: 100%|██████████| 124/124 [03:40<00:00,  1.78s/it]
2024-07-27 23:57:08.472 - analysis.statements_analysis - INFO - Score for statement 'The video makes me want to try out other keyboards.' -> 0.31
2024-07-27 23:57:08.472 - analysis.statements_analysis - INFO - Score for statement 'The keyboard is a good value for its price.' -> 0.63
2024-07-27 23:57:08.473 - analysis.statements_analysis - INFO - Score for statement 'The black version of the keyboard looks great.' -> 1.04
2024-07-27 23:57:08.473 - analysis.statements_analysis - INFO - Score for statement 'Some viewers are experiencing issues with their keyboard, such as ticking noises or double spacing.' -> -0.25
2024-07-27 23:57:08.474 - analysis.statements_analysis - INFO - Statement 'The video makes me want to try out other keyboards.'->  8.82% are discussing this, out of those 100% agree
2024-07-27 23:57:08.474 

## Embedding and Clustering

Here, our goal is to find trends and common themes in the comments.

In [None]:
comments_for_clustering = flatten_comments(comments)

In [None]:
# One embedding vector per comment, in the same order as `comments_for_clustering`.
emb_vecs = [
    comment.get_embedding()
    for comment in tqdm(comments_for_clustering, desc="Calculating embeddings ...")
]

In [None]:
emb_matrix = np.stack(emb_vecs)
emb_matrix.shape

Let's cluster.

In [None]:
def cluster_kmeans(matrix, n=5, random_state=None):
    """Cluster the rows of `matrix` into `n` groups with k-means.

    Args:
        matrix: Samples to cluster, in a form accepted by `KMeans.fit`.
        n: Number of clusters to form.
        random_state: Optional seed forwarded to `KMeans`. k-means starts from a
            random initialisation, so pass an integer to get repeatable labels.
            The default (`None`) keeps the previous, unseeded behaviour.

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    clustering_method = KMeans(n_clusters=n, random_state=random_state)
    clustering_method.fit(matrix)
    return clustering_method.labels_

In [None]:
def cluster_spectral_clustering(matrix, n=5, random_state=None):
    """Cluster the rows of `matrix` into `n` groups with spectral clustering.

    Args:
        matrix: Samples to cluster, in a form accepted by `SpectralClustering.fit`.
        n: Number of clusters to form.
        random_state: Optional seed forwarded to `SpectralClustering`, whose
            result depends on random initialisation. Pass an integer to get
            repeatable labels. The default (`None`) keeps the previous,
            unseeded behaviour.

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    clustering_method = SpectralClustering(n_clusters=n, random_state=random_state)
    clustering_method.fit(matrix)
    return clustering_method.labels_

In [None]:
def cluster_dbscan(matrix, n=5):
    """Cluster `matrix` with DBSCAN, using the estimator's default settings.

    `n` is accepted but ignored: DBSCAN is constructed without a cluster count.

    Returns:
        The labels produced by fitting the estimator on `matrix`.
    """
    return DBSCAN().fit_predict(matrix)

In [None]:
def cluster_optics(matrix, n=5):
    """Cluster `matrix` with OPTICS, using the estimator's default settings.

    `n` is accepted but ignored: OPTICS is constructed without a cluster count.

    Returns:
        The labels produced by fitting the estimator on `matrix`.
    """
    return OPTICS().fit_predict(matrix)

In [None]:
def cluster_hdbscan(matrix, n=5):
    """Cluster `matrix` with HDBSCAN, using the estimator's default settings.

    `n` is accepted but ignored: HDBSCAN is constructed without a cluster count.

    Returns:
        The labels produced by fitting the estimator on `matrix`.
    """
    return HDBSCAN().fit_predict(matrix)

In [None]:
def cluster_gmm(matrix, n=5, random_state=None):
    """Cluster the rows of `matrix` with a Gaussian mixture of `n` components.

    Args:
        matrix: Samples to cluster, in a form accepted by `GaussianMixture.fit`.
        n: Number of mixture components.
        random_state: Optional seed forwarded to `GaussianMixture`, whose
            parameter initialisation is random. Pass an integer to get
            repeatable labels. The default (`None`) keeps the previous,
            unseeded behaviour.

    Returns:
        The component index predicted for each row of `matrix`.
    """
    clustering_method = GaussianMixture(n_components=n, random_state=random_state)
    clustering_method.fit(matrix)
    labels = clustering_method.predict(matrix)
    return labels

In [None]:
def eval_clustering(matrix, labels):
    """Score a clustering by the mean silhouette coefficient of each cluster.

    Args:
        matrix: The samples that were clustered; passed to `silhouette_samples`.
        labels: NumPy array holding the cluster label of each sample.

    Returns:
        A tuple `(labs_unique, sil_for_labels)`: the distinct labels as a list
        (in `np.unique` order) and, aligned with it, the mean silhouette
        coefficient of the samples carrying each label.
    """
    distinct_labels = [*np.unique(labels)]

    # Per-sample (i.e., per-comment) silhouette coefficients.
    try:
        per_sample = silhouette_samples(matrix, labels)
    except ValueError:
        # May happen if there is only one label: give every sample the worst
        # possible value instead.
        per_sample = np.full_like(labels, -1)

    # Aggregate the per-sample values cluster by cluster.
    per_cluster = []
    for cluster_label in distinct_labels:
        member_mask = labels == cluster_label
        per_cluster.append(np.mean(per_sample[member_mask]))

    return distinct_labels, per_cluster

In [None]:
# Collects one tuple per clustering run (filled by the clustering loop).
clusterings = []
# Cluster counts to try.
n_range = [2, 3, 4, 5, 6, 7, 8, 16, 32, 64]
# Clustering functions to try; each is called as `clus_fun(matrix, n=n)`.
# NOTE(review): `cluster_dbscan` and `cluster_optics` are defined but not listed here.
clus_funs = [cluster_kmeans, cluster_gmm, cluster_spectral_clustering, cluster_hdbscan]

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every result.
clusterings = []

# A cluster count larger than the number of samples cannot be satisfied (the
# estimators raise ValueError), so such counts are skipped. The functions that
# ignore `n` lose nothing: they produce the same labels for every `n`.
n_samples = emb_matrix.shape[0]

# Every (cluster count, clustering function) combination, built with a plain
# comprehension in the same order `itertools.product` would give; `itertools`
# is not imported in this notebook.
candidates = [
    (n_candidate, fun_candidate)
    for n_candidate in n_range
    if n_candidate <= n_samples
    for fun_candidate in clus_funs
]

for n, clus_fun in tqdm(candidates, desc="Clustering ..."):
    # Cluster
    labels = clus_fun(emb_matrix, n=n)

    # Evaluate
    labs_unique, sil_for_labels = eval_clustering(emb_matrix, labels)

    clusterings.append((labels, labs_unique, sil_for_labels, n, clus_fun))

In [None]:
# Drop degenerate clusterings, i.e. those where a single cluster holds the
# majority of the points.

# Share of all points that falls into each cluster, one list per clustering.
cluster_sizes = [
    [np.count_nonzero(labels == lab) / len(labels) for lab in labs_unique]
    for (labels, labs_unique, *_rest) in clusterings
]

# Attach the clustering's index and the largest share any one cluster may
# have: 2 / (number of clusters), capped at 0.8.
cluster_sizes = [
    (idx, shares, min(2 / len(shares), 0.8))
    for idx, shares in enumerate(cluster_sizes)
]

# Keep only the clusterings whose biggest cluster stays within that limit.
legal_indices = [idx for idx, shares, limit in cluster_sizes if max(shares) <= limit]
clusterings = [clusterings[idx] for idx in legal_indices]

In [None]:
print(f"After filtering out degenerate clusterings, proceeding with {len(clusterings)} clusterings.")

In [None]:
# Sort by mean of Silhouette coefficient: largest first
clusterings.sort(key=lambda t: np.mean(t[2]), reverse=True)

In [None]:
# Find the best clustering: the first entry of `clusterings`, which is expected
# to be ordered best-first at this point.
if not clusterings:
    # Without this check an empty list would surface as a bare IndexError.
    raise RuntimeError("No clustering is left to choose from; all candidates were filtered out.")

labels, labs_unique, sil_for_labels, n, clus_fun = clusterings[0]
# Print the function's name rather than its repr, which contains a memory
# address that changes from run to run.
print(f"Best clustering out of {len(clusterings)} is with n = {n}, with a mean Silhouette coefficient of {np.mean(sil_for_labels):0.8f} (function was {getattr(clus_fun, '__name__', clus_fun)}).")

In [None]:
# Prepare colormap for plotting
cm_steps = len(labs_unique)

In [None]:
# Build a discrete colormap with `cm_steps` colours sampled from 'hsv' at
# evenly spaced positions in [0, 1); position 1 itself is left out.
hsv = mpl.colormaps['hsv']
cmap = mpl.colors.ListedColormap(hsv(np.linspace(0, 1, cm_steps, endpoint=False)))
cmap

In [None]:
def color_for_idx(idx, colormap):
    """Return entry `idx` of the colormap's `colors` sequence."""
    palette = colormap.colors
    return palette[idx]

In [None]:
def color_for_label(label, labels_unique, colormap):
    """Return the colour assigned to `label`.

    The colour is chosen by the position of `label` within `labels_unique`;
    `list.index` raises `ValueError` if the label is not in that list.
    """
    return color_for_idx(labels_unique.index(label), colormap)

In [None]:
def plot(matrix, labels_unique, labels, use_umap=True, colormap=None):
    """Project `matrix` to two dimensions and scatter-plot it, coloured by label.

    Args:
        matrix: Samples to project; passed to the reducer's `fit_transform`.
        labels_unique: List of the distinct cluster labels. A label's position
            in this list selects its colour.
        labels: Cluster label of each sample in `matrix`.
        use_umap: Reduce with UMAP when true, otherwise with t-SNE.
        colormap: Optional colormap exposing a `colors` sequence with at least
            `len(labels_unique)` entries. When omitted, a colormap with exactly
            one evenly spaced 'hsv' colour per entry of `labels_unique` is
            built here, so the function does not depend on a module-level
            colormap that may have been sized for a different set of labels.
    """
    if colormap is None:
        hsv_map = mpl.colormaps.get_cmap('hsv')
        hue_positions = np.linspace(0, 1, len(labels_unique) + 1)[:-1]
        colormap = mpl.colors.ListedColormap(hsv_map(hue_positions))

    if use_umap:
        # Use UMAP
        reducer = umap.UMAP()
    else:
        # Use t-SNE
        reducer = TSNE(
            n_components=2,
            learning_rate='auto',
            init='random',
            perplexity=3
        )

    # Fit
    matrix_2d = reducer.fit_transform(matrix)

    # Plot: one point per sample, coloured by its cluster label
    point_colors = [color_for_label(lab, labels_unique, colormap) for lab in labels]
    plt.scatter(x=matrix_2d[:, 0], y=matrix_2d[:, 1], c=point_colors)

In [None]:
plot(emb_matrix, labs_unique, labels)

In [None]:
def build_prompt_find_topic(video_info, comments: List["Comment"]):
    """Build the LLM prompt that asks for the common topic of some comments.

    The prompt consists of an instruction, the video title, a sample of the
    comments and a request to answer with JSON of the form `{"topic": ...}`.

    Args:
        video_info: Passed to `get_title` to obtain the video title.
        comments: Comments passed to `sample_from_comments`, whose result is
            added to the prompt as lines.

    Returns:
        The prompt as a single newline-joined string.

    NOTE(review): `get_title`, `sample_from_comments` and `Comment` are not
    defined or imported in the visible part of the notebook -- confirm where
    they come from. `Comment` is written as a string annotation so that this
    definition does not raise `NameError` when the name is not in scope.
    """
    title = get_title(video_info)
    lines = [
        "You are a professional YouTube comment analyst. Given a video title and some comments, find the topic of the comments.",
        f"Video title: {title}",
        "\nSample from the comments:",
    ]
    lines += sample_from_comments(comments)

    # Adjacent string literals are concatenated as-is, so every sentence but
    # the last carries its own trailing space.
    lines.append(
        "\nExtract a single, coherent topic that these comments are discussing. The topic you find can also be about the style or mood of the comments. "
        "A topic should be a simple notion, e.g., \"Jokes\" or \"Choosing a keyboard\". "
        "There is no need to repeat the video title in your assessment. The topic should also describe what the comments are saying, so it shouldn't be, e.g., \"Reactions to Video\" or anything generic of that sort. Provide your assessment in the form of JSON such as {\"topic\": your_topic_goes_here}."
    )

    return "\n".join(lines)

In [None]:
# Settings and state read by `describe_clusters`.
divider_width = 100  # total width of the divider lines printed around each cluster report
divider_str = "-"  # fill character of those divider lines
show_random_comments = False  # when true, random comments of each cluster are printed as well
cluster_topics = {}  # cache: cluster label -> topic obtained from the LLM (re-running this cell clears it)

In [None]:
def describe_clusters():
    """Print a short report for every cluster listed in `labs_unique`.

    For each label this prints the cluster size, a topic obtained from the LLM
    (cached in `cluster_topics`, so the LLM is only asked for labels that have
    no topic yet), the comment closest to the cluster's mean embedding and,
    when `show_random_comments` is set, up to five randomly chosen comments.

    Reads the notebook globals `labs_unique`, `labels`, `emb_matrix`,
    `comments_for_clustering`, `info`, `llm`, `divider_width`, `divider_str`
    and `show_random_comments`, and adds entries to `cluster_topics`.

    NOTE(review): `info` and `post_process_single_entry_json` are not defined
    or imported in the visible part of the notebook -- confirm where they come
    from.
    """
    for lab in labs_unique:
        print(f"Cluster Description (Label {lab})".center(divider_width, divider_str))

        # Indices of the comments that belong to this cluster
        clus_indices = np.where(labels == lab)[0]

        # Size
        cluster_size = len(clus_indices)
        print(f"- Cluster size: {cluster_size} ({100 * cluster_size / len(labels):0.2f}%)")

        # Mean embedding of the cluster. The rows of `emb_matrix` are the
        # embeddings of `comments_for_clustering`, so they are reused here
        # instead of asking every comment for its embedding again.
        clus_embs = emb_matrix[clus_indices]
        clus_mean_emb = np.mean(clus_embs, axis=0)

        # Sort comments by L1 distance to the mean embedding, closest first.
        # The absolute value has to be taken of the difference: the previous
        # `np.abs(emb) - mean` was not a distance at all. A stable sort keeps
        # the original order among comments with equal distance.
        distances = np.sum(np.abs(clus_embs - clus_mean_emb), axis=1)
        order = np.argsort(distances, kind="stable")
        clus_comments = [comments_for_clustering[clus_indices[pos]] for pos in order]

        # Find out central topic of cluster (only if not cached yet)
        if lab not in cluster_topics:
            clus_comments_central = clus_comments[:1000]
            prompt = build_prompt_find_topic(info, clus_comments_central)
            res_raw = llm.chat(prompt)
            topic = post_process_single_entry_json(res_raw)
            cluster_topics[lab] = topic
        print(f"- Central topic (LLM): {cluster_topics[lab]}")

        # Show comment closest to the mean
        print("- Comment closest to mean embedding:")
        print(clus_comments[0])

        # Show random comments
        if show_random_comments:
            rnd_indices = np.random.choice(clus_indices, size=min(5, cluster_size), replace=False)
            print()
            print(f"- {len(rnd_indices)} random comments from this cluster: ")
            for idx in rnd_indices:
                print(f"- {comments_for_clustering[idx]}")

        print("".center(divider_width, divider_str))

        print()

In [None]:
describe_clusters()

### Fuse clusters based on topic

In [None]:
cluster_groups = [[]]

In [None]:
# Minimum mean cosine similarity between a topic and the topics already in a
# group for the topic to join that group.
TOPIC_SIMILARITY_THRESHOLD = 0.55

for lab, topic in cluster_topics.items():
    # Store this cluster label and topic as a tuple
    tup = (lab, topic)

    # Embedding of this topic; computed lazily and at most once, instead of
    # once per member of every group it is compared with.
    topic_emb = None

    # Try to find a spot for this topic in one of the groups
    for group in cluster_groups:

        # If the group is empty, add the cluster (this only happens at the beginning)
        if len(group) == 0:
            group.append(tup)
            break

        if topic_emb is None:
            topic_emb = text_model_manager.embed(topic)

        # Compare this cluster's topic with the topics of the group
        mean_sim = np.mean([cos_sim(text_model_manager.embed(top), topic_emb) for (_, top) in group])
        if mean_sim > TOPIC_SIMILARITY_THRESHOLD:
            group.append(tup)
            break
    else:
        # No existing group was similar enough: start a new group
        cluster_groups.append([tup])

In [None]:
def build_prompt_fuse_topics(video_info, topics: List[str]):
    """Build the LLM prompt that asks for one topic summarising several topics.

    The prompt consists of an instruction, the video title, the given topics as
    a bullet list and a request to answer with JSON of the form
    `{"topic": ...}`.

    Args:
        video_info: Passed to `get_title` to obtain the video title.
        topics: The topics to fuse; each becomes one "- ..." line.

    Returns:
        The prompt as a single newline-joined string.

    NOTE(review): `get_title` is not defined or imported in the visible part
    of the notebook -- confirm where it comes from.
    """
    title = get_title(video_info)
    lines = [
        "You are a professional YouTube comment analyst. Given a video title and some comment topics, find a new description of the topic that reflects the core concept of the listed topics.",
        f"Video title: {title}",
        "\nComment topics:",
    ]
    lines.extend(f"- {t}" for t in topics)

    # Adjacent string literals are concatenated as-is, so every sentence but
    # the last carries its own trailing space.
    lines.append(
        "\nExtract a single, coherent topic that describes all these topics. The topic you find can also be about the style or mood of the comments. "
        "A topic should be a simple notion, e.g., \"Jokes\" or \"Choosing a keyboard\". "
        "There is no need to repeat the video title in your assessment. The topic shouldn't be, e.g., \"Reactions to Video\" or anything generic of that sort. Provide your assessment in the form of JSON such as {\"topic\": your_topic_goes_here}."
    )

    return "\n".join(lines)

In [None]:
# Fuse groups we found by finding a new topic
fused_groups = []
for group in tqdm(cluster_groups, desc="Fusing groups ..."):
    # `cluster_groups` is seeded with one empty group, which stays empty when
    # there are no topics at all; unpacking `zip(*[])` would raise ValueError.
    if not group:
        continue

    labs, topics = zip(*group)

    if len(topics) > 1:
        # Several topics in one group: ask the LLM for a common description
        prompt = build_prompt_fuse_topics(info, topics)
        res_raw = llm.chat(prompt)
        topic = post_process_single_entry_json(res_raw)
    else:
        # A single topic needs no fusing
        topic = topics[0]

    fused_groups.append((labs, topic))

In [None]:
# Change labeling of clustering to reflect group fusions
for label_group, topic in fused_groups:
    # A "group" with a single label needs no relabeling
    if len(label_group) < 2:
        continue

    # Repaint every other label of the group with the group's first label
    lab_first, *other_labs = label_group
    for lab in other_labs:
        labels[labels == lab] = lab_first

    # Remember the fused topic under the surviving label
    cluster_topics[lab_first] = topic

In [None]:
labs_unique = list(np.unique(labels))

In [None]:
plot(emb_matrix, labs_unique, labels)

In [None]:
describe_clusters()

## Classification Analysis

In [None]:
classification_analyzer = ClassificationAnalyzer(comments)
print(classification_analyzer.run_all_analyses())