In [None]:

# Comment Analysis for Reddit (Jupyter Notebook Version)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import json
import pandas as pd
from pathlib import Path
from transformers import pipeline
from tqdm.auto import tqdm
import traceback

# Enable tqdm for pandas.
# NOTE(review): presumably this exists to make tqdm's pandas progress helpers
# available; none of the visible cells call them — confirm it is still needed.
tqdm.pandas()

In [None]:

# Locate the stage-1 metadata file for the current product.
# NOTE(review): PRODUCT is not defined in any cell visible here — it has to be
# set (e.g. in a config/parameters cell) before this cell runs, otherwise a
# fresh-kernel "Run All" stops with a NameError. Confirm where it comes from.
DATA_DIR = Path("session") / PRODUCT
META_PATH = DATA_DIR / "stage_1.json"
assert META_PATH.exists(), "Run 1_describe_product.ipynb first!"

# Load feature labels from JSON.
# The file is read explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open(META_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# One candidate label per entry of metadata["metrics"]; classify_topic passes
# this list to the zero-shot classifier.
features = [o["name"] for o in metadata["metrics"]]
display(features)


In [None]:

# Initialize compatible transformers pipelines.
# Both pipelines are built once here and reused by classify_topic and
# classify_sentiment in the cells below.

# Zero-shot classifier: classify_topic calls it with `features` as the
# candidate labels.
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)
# Sentiment classifier: classify_sentiment keeps only the "label" of each
# prediction, which is later looked up in `sentiment_to_score`.
# NOTE(review): presumably this model emits exactly the five label strings
# listed in `sentiment_to_score` — confirm against the model card.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="tabularisai/multilingual-sentiment-analysis"
)


In [None]:

# Classification function
def classify_topic(texts, batch_size=4):
    """Assign each text to its best-scoring feature label.

    Calls the module-level ``zero_shot_classifier`` with the module-level
    ``features`` list as candidate labels and keeps, for every text, the
    label with the highest score.

    Args:
        texts: A single string or an iterable of strings to classify.
        batch_size: Batch size forwarded to the classifier call.

    Returns:
        list: One label per input text, in input order. Empty input returns
        an empty list without invoking the classifier.
    """
    # Normalise the input to a plain list: a bare string is one text, and any
    # other iterable (tuple, Series, generator) is materialised.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: skip the model call entirely.
    if not texts:
        return []

    results = zero_shot_classifier(
        texts,
        candidate_labels=features,
        truncation=True,
        batch_size=batch_size
    )
    # A single prediction may come back as a bare dict rather than a list.
    if isinstance(results, dict):
        results = [results]

    # Tuple comparison picks the highest score; on an exact score tie the
    # label that sorts last as a string wins.
    return [max(zip(r["scores"], r["labels"]))[1] for r in results]

# Sentiment scoring function
def classify_sentiment(texts, batch_size=16):
    """Predict a sentiment label for each text.

    Strips surrounding whitespace from every text, runs the module-level
    ``sentiment_classifier`` on the batch, and returns only the predicted
    label strings.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Batch size forwarded to the classifier call
            (defaults to 16).

    Returns:
        list: One label string per input text, in input order. Empty input
        returns an empty list without invoking the classifier.
    """
    # A bare string is one text; without this it would be iterated character
    # by character. This matches how classify_topic treats a single string.
    if isinstance(texts, str):
        texts = [texts]

    cleaned = [text.strip() for text in texts]
    if not cleaned:
        return []

    results = sentiment_classifier(cleaned, truncation=True, batch_size=batch_size)
    # Defensive: accept a bare dict for a single prediction, as classify_topic does.
    if isinstance(results, dict):
        results = [results]
    return [r["label"] for r in results]

# Utility: Chunked application of functions
def chunk_apply(arr, func, chunk_size=16):
    """Apply ``func`` to consecutive slices of ``arr`` and flatten the results.

    Args:
        arr: A sliceable sequence supporting ``len()``.
        func: Callable that takes one slice of ``arr`` and returns an
            iterable of results for that slice.
        chunk_size: Maximum number of items per slice.

    Returns:
        list: The results of every call concatenated in order. An empty
        ``arr`` yields an empty list without calling ``func``.
    """
    collected = []
    for start in range(0, len(arr), chunk_size):
        # extend() appends in place; building the result with sum(..., [])
        # would copy the growing list once per chunk.
        collected.extend(func(arr[start:start + chunk_size]))
    return collected

# Reranking function
def _min_max_scale(values, eps=1e-5):
    """Min-max scale a numeric Series; ``eps`` keeps the denominator non-zero when all values are equal."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest + eps)


def rerank_comments_df(df):
    """Score comments and return them ordered best-first.

    Works on a copy, so the caller's frame is left untouched. The returned
    frame has five extra columns: ``comment_length`` (character count of
    ``comment_body``), ``recency_score``, ``norm_score`` and ``norm_length``
    (min-max scaled ``comment_created_utc``, ``comment_score`` and
    ``comment_length``), and ``quality_score`` (the sum of the three scaled
    values).

    Args:
        df: DataFrame with ``comment_body``, ``comment_created_utc`` and
            ``comment_score`` columns, and optionally ``features``.

    Returns:
        pandas.DataFrame: A new frame sorted by ``features`` ascending then
        ``quality_score`` descending when ``features`` is present, otherwise
        by ``quality_score`` descending.
    """
    # Copy first so the added columns do not leak into the caller's frame.
    ranked = df.copy()

    ranked['comment_length'] = ranked['comment_body'].astype(str).str.len()
    ranked['recency_score'] = _min_max_scale(ranked['comment_created_utc'])
    ranked['norm_score'] = _min_max_scale(ranked['comment_score'])
    ranked['norm_length'] = _min_max_scale(ranked['comment_length'])
    ranked['quality_score'] = ranked['norm_score'] + ranked['norm_length'] + ranked['recency_score']

    if 'features' in ranked.columns:
        return ranked.sort_values(by=['features', 'quality_score'], ascending=[True, False])
    return ranked.sort_values(by='quality_score', ascending=False)

# Map sentiment labels to scores
# Sentiment labels listed from most negative to most positive; enumerate
# numbers them from -2 up to +2 in that order.
sentiment_to_score = {
    label: score
    for score, label in enumerate(
        ("Very Negative", "Negative", "Neutral", "Positive", "Very Positive"),
        start=-2,
    )
}

# Function to save processed data and sentiment mapping
def senti_score(df, path, processed_data):
    """Attach numeric sentiment scores, save the frame, and record it.

    Adds a ``senti_score`` column to ``df`` in place by looking up each
    ``sentiment`` label in the module-level ``sentiment_to_score`` mapping,
    writes the frame next to ``path`` as ``<stem>_processed.csv``, and
    appends ``(df, path)`` to ``processed_data``.

    Args:
        df: DataFrame with a ``sentiment`` column of label strings.
        path: ``pathlib.Path`` of the source CSV; determines the output
            location and name.
        processed_data: List that collects ``(df, path)`` tuples; mutated
            in place.

    Returns:
        None.
    """
    # Map sentiment to score
    df['senti_score'] = df['sentiment'].map(sentiment_to_score)

    # A label that is missing from the mapping ends up as NaN in the score
    # column; report it instead of letting it pass unnoticed.
    unmapped = df.loc[df['senti_score'].isna() & df['sentiment'].notna(), 'sentiment'].unique()
    if len(unmapped):
        print(f"Warning: {len(unmapped)} sentiment label(s) have no score mapping: {sorted(map(str, unmapped))}")

    # Save the processed data
    out_path = path.parent / f"{path.stem}_processed.csv"
    print("Saving to:", out_path)
    print("DataFrame shape before saving:", df.shape)
    df.to_csv(out_path, index=False)
    print(f"Saved: {out_path.name}")

    # Append the processed data
    processed_data.append((df, path))

# Summarize feature scores for all processed data
def summarize_feature_scores(processed_data):
    """Build a product-by-feature table of mean sentiment scores.

    Args:
        processed_data: Iterable of ``(df, path)`` tuples, where ``df`` has
            ``features`` and ``senti_score`` columns and ``path`` is a
            ``pathlib.Path`` whose stem is used as the product name.

    Returns:
        pandas.DataFrame: One row per product with a ``product`` column and
        one column per feature holding the mean ``senti_score``. When there
        is nothing to summarise, an empty frame with only the ``product``
        column is returned.
    """
    feature_scores = []
    for df, path in processed_data:
        avg_scores = df.groupby('features')['senti_score'].mean()
        for feature, score in avg_scores.items():
            feature_scores.append({"product": path.stem, "feature": feature, "avg_senti_score": score})

    # With no records the frame has no columns, so pivoting on 'product'
    # would raise a KeyError; return an empty summary instead.
    if not feature_scores:
        return pd.DataFrame(columns=["product"])

    summary_df = pd.DataFrame(feature_scores)
    summary_df = summary_df.pivot(index='product', columns='feature', values='avg_senti_score').reset_index()
    return summary_df

In [None]:

# Set data directory path here for notebook use (instead of argparse)
data_dir = Path("data").resolve()
print("Using data directory:", data_dir)

# This notebook writes "<stem>_processed.csv" files and "feature_summary.csv"
# into the same directory tree it reads from. Exclude them here, otherwise a
# re-run would pick its own outputs up as inputs. Sorting gives a
# reproducible processing order.
csv_paths = sorted(
    p for p in data_dir.glob("**/*.csv")
    if not p.stem.endswith("_processed") and p.name != "feature_summary.csv"
)
print("CSV paths found:", csv_paths)

In [None]:

# Loop through each CSV file.
# For every input: classify each comment into a feature, rank comments,
# keep the best ones per feature, score their sentiment, then save the result.
# A failure in one file is reported and the loop moves on to the next file.
TOP_N_PER_FEATURE = 50  # number of top-ranked comments kept for each feature

processed_data = []
for path in csv_paths:
    print(f"\nProcessing {path.name}...")
    try:
        df = pd.read_csv(path)
        print("Loaded CSV. Columns:", df.columns)

        required_columns = {'comment_body', 'comment_created_utc', 'comment_score'}
        if not required_columns.issubset(df.columns):
            print(f"Missing columns in {path.name}: {required_columns - set(df.columns)}")
            continue

        # Rows with a missing comment body are not strings, which makes the
        # classifiers (and the .strip() in classify_sentiment) fail and the
        # whole file be skipped. Drop those rows and make the column text.
        df = df.dropna(subset=['comment_body']).reset_index(drop=True)
        df['comment_body'] = df['comment_body'].astype(str)
        if df.empty:
            print(f"No comments to analyse in {path.name}; skipping.")
            continue

        print("Classifying topics...")
        df["features"] = chunk_apply(df["comment_body"].tolist(), classify_topic)

        print("Reranking...")
        df = rerank_comments_df(df)

        print(f"Selecting top {TOP_N_PER_FEATURE} comments per feature...")
        # The frame is already sorted best-first within each feature, so
        # head() keeps the highest quality_score rows of every group.
        df = df.groupby('features').head(TOP_N_PER_FEATURE).reset_index(drop=True)

        print("Performing sentiment analysis...")
        df['sentiment'] = chunk_apply(df["comment_body"].tolist(), classify_sentiment)

        # Save and append processed data
        senti_score(df, path, processed_data)

    except Exception as e:
        # Keep going with the remaining files, but show the full traceback
        # so the failure is not hidden.
        print(f"Error processing {path.name}: {e}")
        traceback.print_exc()

In [None]:

# After processing all files, summarize results
summary_out_path = data_dir / "feature_summary.csv"
if processed_data:
    print("\nGenerating pivoted feature sentiment summary...")
    summary_df = summarize_feature_scores(processed_data)
    summary_df.to_csv(summary_out_path, index=False)
    print("Feature sentiment summary saved to:", summary_out_path)
else:
    # Nothing was processed successfully (no inputs, or every file failed or
    # was skipped), so there is nothing to pivot; say so instead of crashing.
    print("\nNo processed data available; skipping feature sentiment summary.")
