# YouTube Comments Analysis

## Imports

In [1]:
from tqdm import tqdm
from typing import List
import numpy as np


In [2]:
# My own modules
from models.text_models import TextModelManager
from models.llm_api import LLM
from api.youtube_api import YoutubeAPI
from analysis.classification_analysis import ClassificationAnalyzer
from analysis.statements_analysis import StatementsAnalyzer
from analysis.clustering import ClusteringAnalyzer

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Logging setup: INFO-level records, timestamped down to the millisecond
import logging
logging.basicConfig(
    # datefmt renders %(asctime)s at seconds resolution; %(msecs)03d appends the milliseconds
    format='%(asctime)s.%(msecs)03d - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

## Load Models

In [4]:
# Initialize classification models
# NOTE(review): `text_model_manager` is not referenced again in the visible cells —
# presumably it is created up front so the models are set up once; confirm it is still needed.
text_model_manager = TextModelManager()

2024-07-31 09:59:44.326 - models.text_models - INFO - Instantiating TextModelManager.


## Set up LLM

In [5]:
# Create the LLM object (class imported from models.llm_api).
# NOTE(review): `llm` is not passed to any analyzer in the visible cells — confirm it is still needed here.
llm = LLM()

2024-07-31 09:59:44.336 - models.llm_api - INFO - Instantiating LLM.


## YouTube API

In [6]:
# Create the YouTube API helper; the video selection, title, creator and comment
# lookups in the following cells all go through this object.
youtube = YoutubeAPI()

2024-07-31 09:59:44.374 - api.youtube_api - INFO - Instantiating YoutubeAPI.
2024-07-31 09:59:44.376 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [7]:
# Candidate test videos. The suffix of each name presumably gives the rough comment
# count of the video ("tomato" / "beard_meets_schnitzel" look like title hints) — TODO confirm.
yt_video_test_id_25_comments = "kiF0wgM8zGc"
yt_video_test_id_50_comments = "LHQMIuzjl48"
yt_video_test_id_300_comments = "yQqJafC7xv0"
yt_video_test_id_700_comments = "VCXqELB3UPg"
yt_video_test_id_2000_comments = "rX2tK-qSVpk"
yt_video_test_id_4k_comments_beard_meets_schnitzel = "qPd9qPUR2_U"
yt_video_test_id_4500_comments = "-ih0B9yn32Q"
yt_video_test_id_10k_comments = "2-XxbdR3Nik"
yt_video_test_id_tomato = "9WQnap-UAiQ"

# Video used by the rest of the notebook — change the right-hand side to analyse another one.
yt_video_id = yt_video_test_id_700_comments
youtube.set_current_video(yt_video_id)

In [8]:
# Display the title of the video selected with set_current_video().
youtube.get_title()

'You can mix 10 marbles until they sort themselves. Why not 100?'

In [9]:
# Display the creator name reported for the selected video.
youtube.get_creator_name()

'AlphaPhoenix'

In [10]:
# Get comments (for testing)
# The same `comments` object is handed to the clustering, statements and
# classification analyzers further down.
comments = youtube.get_comments(yt_video_id)

2024-07-31 09:59:44.564 - api.youtube_api - INFO - Starting raw comment retrieval.
2024-07-31 09:59:44.760 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:44.761 - api.youtube_api - INFO - Requesting another page (page 2 of at most 14) ...


Starting comments retrieval for video ID VCXqELB3UPg ('You can mix 10 marbles until they sort themselves. Why not 100?')


2024-07-31 09:59:44.937 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:44.937 - api.youtube_api - INFO - Requesting another page (page 3 of at most 14) ...
2024-07-31 09:59:45.090 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:45.090 - api.youtube_api - INFO - Requesting another page (page 4 of at most 14) ...
2024-07-31 09:59:45.242 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:45.244 - api.youtube_api - INFO - Requesting another page (page 5 of at most 14) ...
2024-07-31 09:59:45.439 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:45.442 - api.youtube_api - INFO - Requesting another page (page 6 of at most 14) ...
2024-07-31 09:59:45.599 - api.youtube_api - INFO - Received 100 top-level comments.
2024-07-31 09:59:45.600 - api.youtube_api - INFO - Requesting another page (page 7 of at most 14) ...
2024-07-31 09:59:45.775 - api.youtube_api - INFO - Received 100 top-le

## Clustering

Here, our goal is to identify trends and common themes in the comments.

In [11]:
# Build the clustering analyzer for the selected video and its comments.
clustering_analyzer = ClusteringAnalyzer(
    video_id=yt_video_id,
    comments=comments,
)

2024-07-31 09:59:47.326 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [12]:
# Run the clustering. In the recorded run this took close to three minutes in total
# (embeddings, clustering, topic finding, group fusing — see the progress bars in the output).
clus = clustering_analyzer.cluster()

  return self.fget.__get__(instance, owner)()
Calculating embeddings ...: 100%|██████████| 1399/1399 [01:22<00:00, 16.97it/s]
Clustering ...: 100%|██████████| 40/40 [01:13<00:00,  1.85s/it]
2024-07-31 10:02:23.871 - analysis.clustering - INFO - Best clustering out of 25 is with 32 clusters, with a mean Silhouette coefficient of 0.08842793107032776 (function was <function cluster_spectral_clustering at 0x7fc30270fd90>).
Find cluster topics ...: 100%|██████████| 32/32 [00:13<00:00,  2.39it/s]
Fusing groups ...: 100%|██████████| 18/18 [00:02<00:00,  8.11it/s]


In [13]:
# Show the topic labels; in the recorded output this is a dict of integer id -> topic string.
# NOTE(review): the ids are non-contiguous there — presumably a result of the group fusing step; confirm.
clus.topics

{0: 'Physical Principles of Random Processes',
 2: 'Pondering Chaos and Uncertainty',
 3: 'Exploring Entropy Concepts',
 6: 'Intellectual Curiosity',
 9: 'Video Appreciation',
 10: 'Nerdy Humor in Science'}

In [14]:
# TODO: Print out cluster topics along with cluster sizes (absolute and in percent)

In [15]:
# NOTE(review): unconditional failure — presumably a deliberate stop so that "Run All" halts
# before the statement-extraction and classification cells below; confirm, and consider
# replacing it with an explicit flag so the notebook can run to completion.
assert False

AssertionError: 

## LLM Statement Extraction

In [None]:
# Set up statement extraction for the selected video and its comments.
statements_analyzer = StatementsAnalyzer(
    video_id=yt_video_id,
    comments=comments,
)

In [None]:
# Testing run: limit the number of statements to 2 via limit_statements.
statements_analyzer.run_analysis(limit_statements=2)

## Classification Analysis

In [None]:
# Run all classification analyses on the comments and print whatever run_all_analyses() returns.
classification_analyzer = ClassificationAnalyzer(comments)
classification_results = classification_analyzer.run_all_analyses()
print(classification_results)