# 📓 YouTube Comment Analysis Notebook
This notebook processes YouTube comment CSVs, classifies topics with zero-shot learning, analyzes sentiment, reranks comments, and outputs summaries.

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import json
import pandas as pd
from pathlib import Path
from transformers import pipeline
from tqdm.auto import tqdm
import traceback

tqdm.pandas()

In [None]:
# Load the candidate topic labels that classify_topic passes to the zero-shot
# pipeline. JSON text is UTF-8, so decode it explicitly instead of relying on
# the platform's default encoding (which differs between operating systems).
with open("features.json", "r", encoding="utf-8") as f:
    features_dict = json.load(f)

# The file must contain a top-level "features" key
# (presumably a list of label strings — confirm against features.json).
features = features_dict["features"]

In [None]:
# Hugging Face pipeline used by classify_topic to pick a label per comment.
zero_shot_classifier = pipeline(
    task="zero-shot-classification", model="facebook/bart-large-mnli"
)

# Hugging Face pipeline used by classify_sentiment to label each comment.
sentiment_classifier = pipeline(
    task="sentiment-analysis", model="tabularisai/multilingual-sentiment-analysis"
)

In [None]:
def classify_topic(texts, batch_size=4):
    """Assign each text the candidate label with the highest zero-shot score.

    Args:
        texts: A single string or a list of strings to classify.
        batch_size: Batch size forwarded to the zero-shot pipeline.

    Returns:
        A list with one label per input text, in input order. An empty
        list or tuple yields an empty list without calling the model.
    """
    if isinstance(texts, str):
        texts = [texts]

    # The zero-shot pipeline rejects an empty sequence list, so answer the
    # trivial case here instead of letting the model call raise.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    results = zero_shot_classifier(
        texts,
        candidate_labels=features,
        truncation=True,
        batch_size=batch_size
    )

    # A single input may come back as one dict rather than a list of dicts.
    if isinstance(results, dict):
        results = [results]

    # Pair scores with labels and keep the label of the highest-scoring pair.
    return [max(zip(r["scores"], r["labels"]))[1] for r in results]

def classify_sentiment(texts, batch_size=16):
    """Return the sentiment label predicted for each text.

    Args:
        texts: A single string or an iterable of texts. Entries are converted
            with ``str()`` and stripped of surrounding whitespace before
            being sent to the model.
        batch_size: Batch size forwarded to the sentiment pipeline.

    Returns:
        A list with one label per input text, in input order. Empty input
        yields an empty list without calling the model.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    # str() keeps non-string cells (e.g. numeric-only comments) from
    # breaking on .strip().
    cleaned = [str(text).strip() for text in texts]
    if not cleaned:
        return []

    results = sentiment_classifier(cleaned, truncation=True, batch_size=batch_size)
    return [r["label"] for r in results]

def chunk_apply(arr, func, chunk_size=16):
    """Apply *func* to consecutive slices of *arr* and concatenate the results.

    Args:
        arr: A sliceable sequence supporting ``len()``.
        func: Callable that takes one slice of *arr* and returns an iterable
            of results for that slice.
        chunk_size: Maximum number of items handed to *func* per call.

    Returns:
        A single flat list holding the results of every call, in order.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # extend() grows one list in place; summing lists would copy the
    # accumulated results again for every chunk.
    combined = []
    for start in range(0, len(arr), chunk_size):
        combined.extend(func(arr[start:start + chunk_size]))
    return combined

def rerank_comments_df(df):
    """Score comments by likes, text length and recency, then sort best-first.

    The three components are each scaled to roughly [0, 1] and summed into
    ``quality_score``.

    Args:
        df: Frame with 'text', 'likes' and 'updatedAt' columns, and
            optionally a 'features' column. It is not modified.

    Returns:
        A new frame with the helper columns 'text_length', 'recency_score',
        'norm_likes', 'norm_length' and 'quality_score' added. Rows are
        sorted by 'features' ascending and then 'quality_score' descending
        when 'features' exists, otherwise by 'quality_score' descending.
    """
    # Work on a copy: the caller may pass a filtered slice, and writing new
    # columns into it would mutate their data and trigger pandas warnings.
    df = df.copy()

    df['text_length'] = df['text'].astype(str).str.len()
    df['likes'] = pd.to_numeric(df['likes'], errors='coerce').fillna(0)
    df['updatedAt'] = pd.to_datetime(df['updatedAt'], errors='coerce')

    def _min_max(series):
        # The small epsilon avoids division by zero when all values are equal.
        low = series.min()
        return (series - low) / (series.max() - low + 1e-5)

    oldest = df['updatedAt'].min()
    newest = df['updatedAt'].max()
    elapsed = (df['updatedAt'] - oldest).dt.total_seconds()
    span = (newest - oldest).total_seconds()
    # Rows whose timestamp could not be parsed get the lowest recency rather
    # than NaN, so their quality_score stays finite and comparable.
    df['recency_score'] = (elapsed / (span + 1e-5)).fillna(0.0)

    df['norm_likes'] = _min_max(df['likes'])
    df['norm_length'] = _min_max(df['text_length'])
    df['quality_score'] = df['norm_likes'] + df['norm_length'] + df['recency_score']

    if 'features' in df.columns:
        return df.sort_values(by=['features', 'quality_score'], ascending=[True, False])
    return df.sort_values(by='quality_score', ascending=False)

# Numeric score for each sentiment label, from -2 (most negative) to +2
# (most positive). senti_score applies it with Series.map, so any label
# missing from this mapping becomes NaN.
# NOTE(review): the keys presumably match the labels emitted by
# sentiment_classifier — confirm against the model's label set.
sentiment_to_score = {
    "Very Negative": -2,
    "Negative": -1,
    "Neutral": 0,
    "Positive": 1,
    "Very Positive": 2
}

def senti_score(df, path, processed_data):
    """Attach numeric sentiment scores, write the frame to CSV, and record it.

    Args:
        df: Comment frame with a 'sentiment' column; a 'senti_score' column
            is added to it in place.
        path: Path of the source CSV. The output is written to the same
            directory as "<stem>_processed.csv".
        processed_data: List that receives the (df, path) pair after saving.
    """
    numeric_scores = df['sentiment'].map(sentiment_to_score)
    df['senti_score'] = numeric_scores

    destination = path.parent / f"{path.stem}_processed.csv"
    print("Saving to:", destination)
    print("DataFrame shape before saving:", df.shape)
    df.to_csv(destination, index=False)
    print(f"Saved: {destination.name}")

    processed_data.append((df, path))

def summarize_feature_scores(processed_data):
    """Build a product-by-feature table of mean sentiment scores.

    Args:
        processed_data: Iterable of (df, path) pairs. Each df needs
            'features' and 'senti_score' columns; ``path.stem`` is used as
            the product name.

    Returns:
        A frame with a 'product' column followed by one column per feature,
        holding the mean 'senti_score' for that product and feature. When
        there is nothing to summarize, an empty frame with only the
        'product' column is returned.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when two inputs share the
            same file stem and feature, since the pivot needs unique pairs.
    """
    feature_scores = []
    for df, path in processed_data:
        avg_scores = df.groupby('features')['senti_score'].mean()
        for feature, score in avg_scores.items():
            feature_scores.append({"product": path.stem, "feature": feature, "avg_senti_score": score})

    # With no records the frame has no columns and pivot would fail with a
    # KeyError; return an empty summary instead.
    if not feature_scores:
        return pd.DataFrame(columns=["product"])

    summary_df = pd.DataFrame(feature_scores)
    return summary_df.pivot(index='product', columns='feature', values='avg_senti_score').reset_index()

In [None]:
# Set data directory (change path as needed)
data_dir = Path("Data").resolve()
print("Using data directory:", data_dir)
csv_paths = list(data_dir.glob("**/*.csv"))
print("CSV paths found:", csv_paths)

In [None]:
# Run the full pipeline on every CSV: topic classification, reranking,
# top-50 selection per feature, sentiment analysis, and saving.
# Successfully processed frames are collected as (df, path) pairs.
processed_data = []
required_columns = {'text', 'likes', 'updatedAt'}
for path in csv_paths:
    print(f"\nProcessing {path.name}...")
    try:
        df = pd.read_csv(path)
        print("Loaded CSV. Columns:", df.columns)

        if not required_columns.issubset(df.columns):
            print(f"Missing columns in {path.name}: {required_columns - set(df.columns)}")
            continue

        # Copy the filtered rows so the column assignments below write to an
        # independent frame instead of a slice of the original.
        df = df[df['text'].notnull()].copy()
        # Comments that pandas parsed as numbers must be strings before they
        # reach the text classifiers.
        df['text'] = df['text'].astype(str)
        df['updatedAt'] = pd.to_datetime(df['updatedAt'], errors='coerce')

        print("Classifying topics...")
        df["features"] = chunk_apply(df["text"].tolist(), classify_topic)

        print("Reranking...")
        df = rerank_comments_df(df)

        print("Selecting top 50 comments per feature...")
        # Rows are already sorted best-first within each feature, so head(50)
        # keeps the highest-quality comments.
        df = df.groupby('features').head(50).reset_index(drop=True)

        print("Performing sentiment analysis...")
        df['sentiment'] = chunk_apply(df["text"].tolist(), classify_sentiment)

        senti_score(df, path, processed_data)

    except Exception as e:
        # Keep going with the remaining files; report the failure in full.
        print(f"Error processing {path.name}: {e}")
        traceback.print_exc()

In [None]:
# Aggregate the per-file results into one product-by-feature table and
# write it next to the input data.
print("\nGenerating pivoted feature sentiment summary...")
summary_df = summarize_feature_scores(processed_data)

summary_out_path = data_dir.joinpath("feature_youtube_comment_summary.csv")
summary_df.to_csv(path_or_buf=summary_out_path, index=False)
print("Feature sentiment summary saved to:", summary_out_path)