Step 1: Scrapping youtube video's comments using GCP's service - YouTube Data API v3.

In [2]:
import urllib.request
import urllib.parse
import json
import csv
import time
import os
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
VIDEO_ID = "dc5S4IV_NeA"   # from the URL
MAX_RESULTS = 100          # max per page (YouTube allows up to 100)

def build_url(page_token=None):
    base = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": VIDEO_ID,
        "key": API_KEY,
        "textFormat": "plainText",
        "maxResults": str(MAX_RESULTS),
        "order": "time"       # or "relevance"
    }
    if page_token:
        params["pageToken"] = page_token
    return base + "?" + urllib.parse.urlencode(params)

def fetch_page(page_token=None):
    url = build_url(page_token)
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    return json.loads(data)

def scrape_comments():
    all_comments = []
    next_page_token = None
    page = 1

    while True:
        print(f"Fetching page {page}...")
        data = fetch_page(next_page_token)

        for item in data.get("items", []):
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            author = snippet.get("authorDisplayName", "")
            text = snippet.get("textDisplay", "")
            published_at = snippet.get("publishedAt", "")
            like_count = snippet.get("likeCount", 0)

            all_comments.append({
                "author": author,
                "text": text,
                "published_at": published_at,
                "likes": like_count
            })

        next_page_token = data.get("nextPageToken")
        if not next_page_token:
            break

        page += 1
        # polite pause so we don't hammer the API
        time.sleep(0.5)

    return all_comments

def save_to_csv(comments, filename="data/youtube_comments.csv"):
    if not comments:
        return
    fieldnames = ["author", "text", "published_at", "likes"]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in comments:
            writer.writerow(row)


comments = scrape_comments()
print(f"Total comments fetched: {len(comments)}")
save_to_csv(comments)
print("Saved to data/youtube_comments.csv")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Total comments fetched: 857
Saved to data/youtube_comments.csv


Step 2: Data cleaning

In [3]:
import pandas as pd
import re

# 1. Load CSV
df = pd.read_csv("data/youtube_comments.csv")

# ------- CLEANING FUNCTIONS -------

# Remove @username
def remove_usernames(text):
    return re.sub(r"@\w+", "", text)

# Remove URLs
def remove_urls(text):
    return re.sub(r"http\S+|www\.\S+", "", text)

# Remove emojis (optional)
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["  
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U00002500-\U00002BEF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r"", text)

# Remove extra spaces, quotes, newlines
def clean_spacing(text):
    text = text.replace("\n", " ")
    text = text.replace("\r", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# ------- APPLY CLEANING -------

df["clean_text"] = df["text"].astype(str)

df["clean_text"] = df["clean_text"].apply(remove_usernames)
df["clean_text"] = df["clean_text"].apply(remove_urls)
df["clean_text"] = df["clean_text"].apply(remove_emojis)
df["clean_text"] = df["clean_text"].apply(clean_spacing)
df["clean_text"] = df["clean_text"].str.lower()

# 2. Drop duplicates
df = df.drop_duplicates(subset="clean_text")

# 3. Convert timestamp to datetime
df["published_at"] = pd.to_datetime(df["published_at"], errors="coerce")

# 4. Optionally remove meaningless comments (e.g., only emoji, single words)
df = df[df["clean_text"].str.len() > 3]

# Save cleaned file
df.to_csv("data/clean_comments.csv", index=False)

print("Cleaning complete. Saved as data/clean_comments.csv")
df.head(10)

Cleaning complete. Saved as data/clean_comments.csv


Unnamed: 0,author,text,published_at,likes,clean_text
0,@guardiannews,John Lewis ad kickstarts Christmas countdown t...,2025-11-04 09:21:26+00:00,40,john lewis ad kickstarts christmas countdown t...
1,@Gary-s9r6x,So he hugged his son after years just because ...,2025-11-15 01:01:56+00:00,0,so he hugged his son after years just because ...
2,@Jess-qy6pm,Was a black singer!!!!!,2025-11-14 22:30:54+00:00,0,was a black singer!!!!!
3,@hopeack12345,‚ù§‚ù§‚ù§‚ù§ Came here after watching the diabolical m...,2025-11-14 22:23:44+00:00,0,came here after watching the diabolical m&s ad...
4,@LessMarxMoreMises,Worst Christmas ad in history! Was it written ...,2025-11-14 21:01:06+00:00,0,worst christmas ad in history! was it written ...
5,@rosson1983,Excellent advert. As usual JL blows the other ...,2025-11-14 19:24:21+00:00,0,excellent advert. as usual jl blows the other ...
6,@jackieplatts4359,"Oh this is lovely and, yes, I‚Äôm crying ü•π",2025-11-14 17:38:26+00:00,0,"oh this is lovely and, yes, i‚Äôm crying ü•π"
7,@m535i,Well done to all those who put this amazing Jo...,2025-11-14 16:18:35+00:00,0,well done to all those who put this amazing jo...
8,@juliabroadley8411,Beautiful üò¢‚ù§,2025-11-14 14:40:39+00:00,0,beautiful
9,@666kismett,Another disappointing Christmas advert,2025-11-14 12:44:17+00:00,0,another disappointing christmas advert


Step 3: Topic Modeling 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

N_TOPICS = 6          # tweak this after you see results
N_TOP_WORDS = 12

df = pd.read_csv("data/clean_comments.csv")
texts = df["clean_text"].fillna("").tolist()

# 1) TF-IDF
tfidf = TfidfVectorizer(
    max_df=0.9,
    min_df=5,           # ignore very rare words
    stop_words="english"
)
X = tfidf.fit_transform(texts)

# 2) LDA
lda = LatentDirichletAllocation(
    n_components=N_TOPICS,
    random_state=42,
    learning_method="batch"
)
topic_distributions = lda.fit_transform(X)   # shape: (n_docs, N_TOPICS)

# 3) Show top words per topic to help you interpret & label them
feature_names = tfidf.get_feature_names_out()

def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx}: {' | '.join(top_words)}")

print_topics(lda, feature_names, N_TOP_WORDS)

# 4) Assign dominant topic per comment
df["topic_id"] = topic_distributions.argmax(axis=1)


topic_labels = {
    0: "music themes & general praise",
    1: "strong emotional praise",
    2: "nostalgia & memories",
    3: "general christmas advert discussion",
    4: "mixed evaluations (best vs worst)",
    5: "family themes & representation"
}
df["topic_label"] = df["topic_id"].map(topic_labels)

df.to_csv("data/all_comments_with_topics.csv", index=False)
print("Saved with topics: data/all_comments_with_topics.csv")

Topic 0: don | tune | nice | better | absolutely | music | did | love | just | didn | tear | lives
Topic 1: beautiful | crying | amazing | advert | jl | oh | real | got | just | great | emotional | doesn
Topic 2: aftersun | brilliant | song | worst | guardian | hard | right | seen | feels | miss | hit | got
Topic 3: lewis | john | advert | christmas | ad | great | time | best | lovely | brilliant | xmas | adverts
Topic 4: best | just | years | christmas | year | love | advert | dads | people | think | son | getting
Topic 5: love | family | wow | nailed | tears | white | know | video | bring | dad | finally | wonderful
Saved with topics: data/all_comments_with_topics.csv


Step 4: Sentiment Analysis

In [5]:
from transformers import pipeline

df = pd.read_csv("data/all_comments_with_topics.csv")

sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

texts = df["clean_text"].fillna("").tolist()

labels = []
scores = []

# run in small batches so it‚Äôs not super slow
BATCH_SIZE = 32
for i in range(0, len(texts), BATCH_SIZE):
    batch = texts[i:i+BATCH_SIZE]
    results = sentiment_model(batch)
    for r in results:
        labels.append(r["label"])   # POSITIVE / NEGATIVE
        scores.append(r["score"])

df["sentiment_label"] = labels
df["sentiment_score"] = scores

df.to_csv("data/all_comments_with_topics_and_sentiment.csv", index=False)
print("Saved : data/all_comments_with_topics_and_sentiment.csv")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


Saved : data/all_comments_with_topics_and_sentiment.csv


In [6]:
# Load the final file with topics + sentiment
df = pd.read_csv("data/all_comments_with_topics_and_sentiment.csv")

# ------------------------------
# 1. Overall Sentiment Summary
# ------------------------------
print("\n==== Overall Sentiment Distribution ====\n")
print(df["sentiment_label"].value_counts(normalize=True).apply(lambda x: round(x*100, 2)))

# ------------------------------
# 2. Topic Distribution
# ------------------------------
print("\n==== Topic Distribution (Fraction of Comments) ====\n")
print(df["topic_label"].value_counts(normalize=True).apply(lambda x: round(x*100, 2)))

# ------------------------------
# 3. Sentiment Within Each Topic
# ------------------------------
print("\n==== Sentiment Within Each Topic ====\n")
sentiment_within_topics = (
    df.groupby("topic_label")["sentiment_label"]
      .value_counts(normalize=True)
      .rename("percentage")
      .mul(100)
      .round(2)
)

print(sentiment_within_topics)


==== Overall Sentiment Distribution ====

sentiment_label
POSITIVE    66.67
NEGATIVE    33.33
Name: proportion, dtype: float64

==== Topic Distribution (Fraction of Comments) ====

topic_label
general christmas advert discussion    22.88
mixed evaluations (best vs worst)      17.47
music themes & general praise          15.99
nostalgia & memories                   15.50
strong emotional praise                14.15
family themes & representation         14.02
Name: proportion, dtype: float64

==== Sentiment Within Each Topic ====

topic_label                          sentiment_label
family themes & representation       POSITIVE           72.81
                                     NEGATIVE           27.19
general christmas advert discussion  POSITIVE           65.59
                                     NEGATIVE           34.41
mixed evaluations (best vs worst)    POSITIVE           64.08
                                     NEGATIVE           35.92
music themes & general praise        P