# 📓 YouTube Comment Analysis Notebook
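This notebook loads YouTube comment CSVs, assigns each comment to a product feature with zero-shot classification, reranks the comments by a quality score (likes, length, recency), runs sentiment analysis on the top comments per feature, and writes per-product CSVs plus a feature-level sentiment summary.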

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import json
import pandas as pd
from pathlib import Path
from transformers import pipeline
from tqdm.auto import tqdm
import traceback

tqdm.pandas()

In [2]:
# Product being analysed; also used as the session sub-directory name when DATA_DIR is built.
PRODUCT = "wireless over-ear headphones"

In [3]:
# Per-product session directory and the stage-1 metadata file it must contain
# (presumably produced by 1_describe_product.ipynb, per the assertion message).
DATA_DIR = Path("session") / PRODUCT
META_PATH = DATA_DIR / "stage_1.json"
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

# Load feature labels from JSON.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(META_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# The "metrics" entries become the candidate labels for topic classification.
features = list(metadata["metrics"])
display(features)


['Sound Quality',
 'Comfort',
 'Noise Cancellation',
 'Battery Life',
 'Durability']

In [4]:
# Zero-shot model used by classify_topic() to pick one of the `features` labels.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Sentiment model used by classify_sentiment(); its labels are later mapped
# to numeric scores through `sentiment_to_score`.
sentiment_classifier = pipeline(
    task="sentiment-analysis",
    model="tabularisai/multilingual-sentiment-analysis",
)

Device set to use cuda:0
Device set to use cuda:0


In [5]:
def classify_topic(texts, batch_size=16):
    """Assign each text to its best-matching feature label.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Batch size forwarded to ``zero_shot_classifier``.

    Returns:
        A list with one label per input text, chosen from the module-level
        ``features`` list. Empty input returns ``[]`` without calling the
        model.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify; avoid handing the pipeline an empty batch.
        return []

    results = zero_shot_classifier(
        texts,
        candidate_labels=features,
        truncation=True,
        batch_size=batch_size
    )
    # A single input may come back as one dict instead of a list of dicts.
    if isinstance(results, dict):
        results = [results]
    # max() over (score, label) pairs selects the label with the highest score.
    return [max(zip(r["scores"], r["labels"]))[1] for r in results]

def classify_sentiment(texts, batch_size=32):
    """Return the sentiment label for each text.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as one text (not iterated character by character),
            matching ``classify_topic``.
        batch_size: Batch size forwarded to ``sentiment_classifier``.

    Returns:
        A list with one label string per input text. Empty input returns
        ``[]`` without calling the model.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Surrounding whitespace is stripped before the texts reach the model.
    texts = [text.strip() for text in texts]
    if not texts:
        return []
    results = sentiment_classifier(texts, truncation=True, batch_size=batch_size)
    return [r["label"] for r in results]

def chunk_apply(arr, func, chunk_size=64):
    """Apply ``func`` to consecutive slices of ``arr`` and concatenate the results.

    Args:
        arr: A sliceable sequence supporting ``len()``.
        func: Callable that receives one slice and returns an iterable of results.
        chunk_size: Maximum number of items per slice; must be at least 1.

    Returns:
        A single flat list with the results of every slice, in input order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    combined = []
    for start in range(0, len(arr), chunk_size):
        # extend() keeps the concatenation linear, unlike repeated list addition.
        combined.extend(func(arr[start:start + chunk_size]))
    return combined

def rerank_comments_df(df):
    """Score comments by likes, text length and recency, then sort best-first.

    The input frame is left untouched; all derived columns are added to a copy.

    Args:
        df: DataFrame with ``text``, ``likes`` and ``updatedAt`` columns, and
            optionally a ``features`` column.

    Returns:
        A new DataFrame with ``text_length``, ``recency_score``, ``norm_likes``,
        ``norm_length`` and ``quality_score`` columns added. Rows are sorted by
        ``features`` (ascending) then ``quality_score`` (descending) when a
        ``features`` column exists, otherwise by ``quality_score`` descending.
    """
    # Work on a copy so the caller's frame (possibly a slice) is not mutated.
    ranked = df.copy()
    ranked['text_length'] = ranked['text'].astype(str).str.len()
    ranked['likes'] = pd.to_numeric(ranked['likes'], errors='coerce').fillna(0)
    ranked['updatedAt'] = pd.to_datetime(ranked['updatedAt'], errors='coerce')

    def _min_max(series):
        # The small epsilon keeps the division defined when all values are equal.
        return (series - series.min()) / (series.max() - series.min() + 1e-5)

    min_time = ranked['updatedAt'].min()
    max_time = ranked['updatedAt'].max()
    recency = (ranked['updatedAt'] - min_time).dt.total_seconds() / (
        (max_time - min_time).total_seconds() + 1e-5
    )
    # Unparseable timestamps get no recency credit instead of turning the
    # whole quality score into NaN.
    ranked['recency_score'] = recency.fillna(0)
    ranked['norm_likes'] = _min_max(ranked['likes'])
    ranked['norm_length'] = _min_max(ranked['text_length'])
    ranked['quality_score'] = (
        ranked['norm_likes'] + ranked['norm_length'] + ranked['recency_score']
    )

    if 'features' in ranked.columns:
        return ranked.sort_values(by=['features', 'quality_score'], ascending=[True, False])
    return ranked.sort_values(by='quality_score', ascending=False)

# Numeric score for each sentiment label, running from -2 (most negative)
# to +2 (most positive); applied to the `sentiment` column in senti_score().
sentiment_to_score = {
    label: score
    for score, label in enumerate(
        ("Very Negative", "Negative", "Neutral", "Positive", "Very Positive"),
        start=-2,
    )
}

def senti_score(df, path, processed_data):
    """Add numeric sentiment scores to ``df``, save it, and record the result.

    Args:
        df: DataFrame with a ``sentiment`` column; a ``senti_score`` column is
            added in place using ``sentiment_to_score``.
        path: Path of the source CSV. The output keeps the same file name but
            goes into a ``processed_comments`` directory that replaces the
            source file's parent directory.
        processed_data: List that receives the ``(df, path)`` pair.
    """
    df['senti_score'] = df['sentiment'].map(sentiment_to_score)

    # Swap the last directory component (the source folder) for "processed_comments".
    target_dir = path.parent.with_name("processed_comments")
    target_dir.mkdir(parents=True, exist_ok=True)
    out_path = target_dir / path.name

    print("Saving to:", out_path)
    print("DataFrame shape before saving:", df.shape)
    df.to_csv(out_path, index=False)
    print(f"Saved: {out_path.name}")

    processed_data.append((df, path))

def summarize_feature_scores(processed_data):
    """Build a product-by-feature table of mean sentiment scores.

    Args:
        processed_data: Iterable of ``(df, path)`` pairs, where ``df`` has
            ``features`` and ``senti_score`` columns and ``path.stem`` is used
            as the product name.

    Returns:
        DataFrame with a ``product`` column and one column per feature holding
        the mean ``senti_score``. When there is nothing to summarize, an empty
        DataFrame with only the ``product`` column is returned.
    """
    feature_scores = [
        {"product": path.stem, "feature": feature, "avg_senti_score": score}
        for df, path in processed_data
        for feature, score in df.groupby('features')['senti_score'].mean().items()
    ]
    if not feature_scores:
        # pivot() raises KeyError on a frame without the expected columns.
        return pd.DataFrame(columns=['product'])

    summary_df = pd.DataFrame(feature_scores)
    summary_df = summary_df.pivot(index='product', columns='feature', values='avg_senti_score').reset_index()
    return summary_df

In [6]:
# Set data directory (change path as needed)
data_dir = DATA_DIR / "youtube" / "raw_comments"
print("Using data directory:", data_dir)
csv_paths = list(data_dir.glob("**/*.csv"))
print("CSV paths found:", csv_paths)

Using data directory: session/wireless over-ear headphones/youtube/raw_comments
CSV paths found: [PosixPath('session/wireless over-ear headphones/youtube/raw_comments/Bose_QuietComfort_Ultra_Headphones.csv'), PosixPath('session/wireless over-ear headphones/youtube/raw_comments/Sony_WH_1000XM5.csv'), PosixPath('session/wireless over-ear headphones/youtube/raw_comments/Focal_Bathys.csv'), PosixPath('session/wireless over-ear headphones/youtube/raw_comments/Anker_Soundcore_Space_One.csv'), PosixPath('session/wireless over-ear headphones/youtube/raw_comments/Apple_AirPods_Max.csv')]


In [7]:
# Number of top-ranked comments kept per feature before sentiment analysis.
TOP_COMMENTS_PER_FEATURE = 50

# (DataFrame, source path) pairs appended by senti_score() for each processed CSV.
processed_data = []
for path in csv_paths:
    print(f"\nProcessing {path.name}...")
    try:
        df = pd.read_csv(path)
        print("Loaded CSV. Columns:", df.columns)

        required_columns = {'text', 'likes', 'updatedAt'}
        if not required_columns.issubset(df.columns):
            print(f"Missing columns in {path.name}: {required_columns - set(df.columns)}")
            continue

        # .copy() detaches the filtered rows from the original frame so the
        # column assignments below do not raise SettingWithCopyWarning.
        df = df[df['text'].notnull()].copy()
        # The classifiers call string methods on each text; make sure a column
        # parsed as numeric is still handled.
        df['text'] = df['text'].astype(str)
        df['updatedAt'] = pd.to_datetime(df['updatedAt'], errors='coerce')

        print("Classifying topics...")
        df["features"] = chunk_apply(df["text"].tolist(), classify_topic)

        print("Reranking...")
        df = rerank_comments_df(df)

        print(f"Selecting top {TOP_COMMENTS_PER_FEATURE} comments per feature...")
        # Rows are already sorted best-first within each feature, so head()
        # keeps the highest-quality comments.
        df = df.groupby('features').head(TOP_COMMENTS_PER_FEATURE).reset_index(drop=True)

        print("Performing sentiment analysis...")
        df['sentiment'] = chunk_apply(df["text"].tolist(), classify_sentiment)

        senti_score(df, path, processed_data)

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {path.name}: {e}")
        traceback.print_exc()


Processing Bose_QuietComfort_Ultra_Headphones.csv...
Loaded CSV. Columns: Index(['product', 'category', 'id', 'videoId', 'parentId', 'isReply',
       'authorName', 'authorId', 'text', 'likes', 'updatedAt'],
      dtype='object')
Classifying topics...
Reranking...
Selecting top 50 comments per feature...
Performing sentiment analysis...
Saving to: session/wireless over-ear headphones/youtube/processed_comments/Bose_QuietComfort_Ultra_Headphones.csv
DataFrame shape before saving: (187, 19)
Saved: Bose_QuietComfort_Ultra_Headphones.csv

Processing Sony_WH_1000XM5.csv...
Loaded CSV. Columns: Index(['product', 'category', 'id', 'videoId', 'parentId', 'isReply',
       'authorName', 'authorId', 'text', 'likes', 'updatedAt'],
      dtype='object')
Classifying topics...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Reranking...
Selecting top 50 comments per feature...
Performing sentiment analysis...
Saving to: session/wireless over-ear headphones/youtube/processed_comments/Sony_WH_1000XM5.csv
DataFrame shape before saving: (191, 19)
Saved: Sony_WH_1000XM5.csv

Processing Focal_Bathys.csv...
Loaded CSV. Columns: Index(['product', 'category', 'id', 'videoId', 'parentId', 'isReply',
       'authorName', 'authorId', 'text', 'likes', 'updatedAt'],
      dtype='object')
Classifying topics...
Reranking...
Selecting top 50 comments per feature...
Performing sentiment analysis...
Saving to: session/wireless over-ear headphones/youtube/processed_comments/Focal_Bathys.csv
DataFrame shape before saving: (200, 19)
Saved: Focal_Bathys.csv

Processing Anker_Soundcore_Space_One.csv...
Loaded CSV. Columns: Index(['product', 'category', 'id', 'videoId', 'parentId', 'isReply',
       'authorName', 'authorId', 'text', 'likes', 'updatedAt'],
      dtype='object')
Classifying topics...
Reranking...
Selecting top 50 c

In [8]:
print("\nGenerating pivoted feature sentiment summary...")
# One row per product, one column per feature (mean sentiment score).
summary_df = summarize_feature_scores(processed_data)

# The summary is written into the product's "youtube" session folder.
summary_out_path = DATA_DIR.joinpath("youtube", "feature_youtube_comment_summary.csv")
summary_df.to_csv(summary_out_path, index=False)
print("Feature sentiment summary saved to:", summary_out_path)


Generating pivoted feature sentiment summary...
Feature sentiment summary saved to: session/wireless over-ear headphones/youtube/feature_youtube_comment_summary.csv
