# Notebook 2: Sentiment Analysis

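This notebook demonstrates:
1. Running the full annotation pipeline (VADER + optional transformer)
2. Exploring sentiment distributions
3. Brand detection and context extraction examples
4. Purchase intent signals
5. Comparing sentiment between posts and comments, and across subreddits
6. Shoe model detection
7. Saving the annotated dataset to `data/processed/annotated.parquet`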

> **Prerequisites:** `data/raw/posts_*.parquet` must exist (run `make collect` first).  
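> For transformer scoring: install the ML extras with `uv sync --extra ml`, set `use_transformer=True` when constructing the pipeline in section 2, then re-run the notebook.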

In [1]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

## 1. Load Raw Data

In [2]:
from reddit_sentiment.collection.collector import SubredditCollector

# Load the collected data from data/raw; when no file is found, fall back to a
# small synthetic frame so the rest of the notebook can still be executed.
raw_dir = Path('../data/raw')
try:
    raw_df = SubredditCollector.load_latest(raw_dir)
    print(f'Loaded {len(raw_df):,} raw records')
except FileNotFoundError:
    # Demo synthetic data: 200 rows built by repeating five example texts.
    raw_df = pd.DataFrame({
        'id': [f'r{i}' for i in range(200)],
        'subreddit': ['Sneakers'] * 100 + ['Nike'] * 60 + ['Adidas'] * 40,
        'record_type': ['post'] * 120 + ['comment'] * 80,
        'score': [50] * 200,
        # A Timedelta step is used instead of the string alias '3H': the
        # uppercase hour alias is deprecated in recent pandas and emits a
        # FutureWarning, while a Timedelta yields the same 3-hourly timestamps.
        'created_utc': pd.date_range(
            '2024-01-01', periods=200, freq=pd.Timedelta(hours=3), tz='UTC'
        ),
        'full_text': [
            'I just copped the Nike Air Max and they are amazing quality!',
            'Adidas Yeezy resale prices are insane, way too expensive',
            'Where to cop the New Balance 990v4? W2C?',
            'Hoka Clifton runs are so comfortable for marathon training',
            'The Three Stripes collab with Pharrell is fire',
        ] * 40,
        # First half of the rows carry one StockX link, second half carry none.
        'extracted_urls': [['https://stockx.com/buy/nike-air-max']] * 100 + [[]] * 100,
    })
    print('Using synthetic demo data.')

Using synthetic demo data.


  'created_utc': pd.date_range('2024-01-01', periods=200, freq='3H', tz='UTC'),


## 2. Run Annotation Pipeline

In [3]:
from reddit_sentiment.sentiment.pipeline import SentimentPipeline

# Score with VADER only, so no transformer download is needed.
# Pass use_transformer=True instead once the ML extras are installed.
pipeline = SentimentPipeline(use_transformer=False)
annotated = pipeline.annotate(raw_df)

# Columns present after annotation that the raw frame did not have.
added_columns = [col for col in annotated.columns if col not in raw_df.columns]
print(f'Annotated {len(annotated):,} records')
print(f'New columns: {added_columns}')

# Preview the first ten rows of the main annotation columns.
preview_columns = ['id', 'vader_score', 'hybrid_score', 'brands', 'channels', 'primary_intent']
annotated[preview_columns].head(10)

Annotated 200 records
New columns: ['vader_score', 'transformer_score', 'hybrid_score', 'brands', 'models', 'channels', 'primary_intent', 'all_intents']


Unnamed: 0,id,vader_score,hybrid_score,brands,channels,primary_intent
0,r0,0.6239,0.6239,[Nike],[StockX],completed_purchase
1,r1,-0.4019,-0.4019,[Adidas],[StockX],
2,r2,0.0,0.0,[New Balance],[StockX],seeking_purchase
3,r3,0.6418,0.6418,[Hoka],[StockX],
4,r4,-0.34,-0.34,[Adidas],[StockX],
5,r5,0.6239,0.6239,[Nike],[StockX],completed_purchase
6,r6,-0.4019,-0.4019,[Adidas],[StockX],
7,r7,0.0,0.0,[New Balance],[StockX],seeking_purchase
8,r8,0.6418,0.6418,[Hoka],[StockX],
9,r9,-0.34,-0.34,[Adidas],[StockX],


## 3. Sentiment Distribution

In [4]:
# Histogram of the hybrid score, with a dashed marker at the neutral point (0).
hist_labels = {'hybrid_score': 'Sentiment Score (-1 to +1)', 'count': 'Records'}
fig = px.histogram(
    annotated,
    x='hybrid_score',
    nbins=40,
    title='Hybrid Sentiment Score Distribution',
    labels=hist_labels,
    color_discrete_sequence=['#4f46e5'],
)
fig.add_vline(x=0, line_dash='dash', line_color='gray')
fig.show()

# Headline numbers: mean score and the share of records beyond +/-0.05.
scores = annotated['hybrid_score']
positive_pct = (scores > 0.05).mean() * 100
negative_pct = (scores < -0.05).mean() * 100
print(f"Mean sentiment: {scores.mean():.4f}")
print(f"Positive (>0.05): {positive_pct:.1f}%")
print(f"Negative (<-0.05): {negative_pct:.1f}%")

Mean sentiment: 0.1048
Positive (>0.05): 40.0%
Negative (<-0.05): 40.0%


## 4. Brand Detection Examples

In [5]:
from reddit_sentiment.detection.brands import BrandDetector

detector = BrandDetector(context_window=10)

# Sample sentences that refer to brands by nickname, product line or abbreviation.
examples = [
    'The Three Stripes collab with Beyoncé is unmatched',
    'Way of Wade 10 just dropped and the colorway is fire',
    'UA Curry shoes are underrated for basketball courts',
    'NB 990v4 vs Nike Dunk — which is the better retro?',
]

# Print each sentence followed by the brands detected in it.
for text in examples:
    brands = detector.detect_brands(text)
    print(f'Text: "{text}"\n  → Brands: {brands}\n')

Text: "The Three Stripes collab with Beyoncé is unmatched"
  → Brands: ['Adidas']

Text: "Way of Wade 10 just dropped and the colorway is fire"
  → Brands: ['Li-Ning']

Text: "UA Curry shoes are underrated for basketball courts"
  → Brands: ['Under Armour']

Text: "NB 990v4 vs Nike Dunk — which is the better retro?"
  → Brands: ['New Balance', 'Nike']



In [6]:
# Count how often each brand is mentioned: one row per (record, brand) pair
# after exploding the list-valued 'brands' column.
brand_counts = (
    annotated['brands']
    .explode()
    .value_counts()
    .rename_axis('brand')
    .reset_index(name='mentions')
)
# Drop missing and empty-string brand labels.
is_real_brand = brand_counts['brand'].notna() & brand_counts['brand'].ne('')
brand_counts = brand_counts[is_real_brand]

top_brands = brand_counts.head(10)
fig = px.bar(
    top_brands,
    x='brand',
    y='mentions',
    title='Brand Mention Frequency',
    color='mentions',
    color_continuous_scale='Viridis',
)
fig.show()

## 5. Purchase Intent Analysis

In [7]:
# Frequency of each primary intent label, most common first.
intent_counts = (
    annotated['primary_intent']
    .value_counts()
    .rename_axis('intent')
    .reset_index(name='count')
)
intent_counts = intent_counts[intent_counts['intent'].notna()]

# Funnel chart of the seven most frequent intent labels.
fig = px.funnel(
    intent_counts.head(7),
    x='count',
    y='intent',
    title='Purchase Intent Signal Distribution',
)
fig.show()

# How many records carry an intent label at all.
has_intent = annotated['primary_intent'].notna()
print(f"Records with any intent signal: {has_intent.sum():,}")
print(f"Records without intent: {(~has_intent).sum():,}")

Records with any intent signal: 80
Records without intent: 120


In [8]:
# Mean hybrid score per intent label (records without a label are excluded),
# ordered from most negative to most positive.
intent_sentiment = (
    annotated.dropna(subset=['primary_intent'])
    .groupby('primary_intent')['hybrid_score']
    .mean()
    .sort_values()
    .rename_axis('intent')
    .reset_index(name='avg_sentiment')
)

# Red bars for a negative average, green otherwise.
colours = (
    intent_sentiment['avg_sentiment']
    .lt(0)
    .map({True: '#ef4444', False: '#22c55e'})
    .tolist()
)

intent_bars = go.Bar(
    x=intent_sentiment['avg_sentiment'],
    y=intent_sentiment['intent'],
    orientation='h',
    marker_color=colours,
)
fig = go.Figure(intent_bars)
fig.update_layout(title='Average Sentiment by Intent Type', xaxis={'range': [-1, 1]})
fig.show()

## 6. Posts vs Comments: Sentiment Comparison

Reddit comments tend to be more emotionally reactive than posts. Posts are often structured questions or reviews; comments are rapid, context-driven responses.

In [9]:
# The comparison only makes sense when both posts and comments are present.
has_both_types = (
    'record_type' in annotated.columns
    and annotated['record_type'].nunique() > 1
)

if not has_both_types:
    print("Only one record type present — run 'reddit-sentiment collect --public' to include comments.")
else:
    # Overlaid score histograms, one colour per record type.
    type_colours = {'post': '#4f46e5', 'comment': '#06b6d4'}
    fig = px.histogram(
        annotated,
        x='hybrid_score',
        color='record_type',
        nbins=40,
        barmode='overlay',
        opacity=0.75,
        title='Sentiment Distribution: Posts vs Comments',
        labels={'hybrid_score': 'Hybrid Sentiment Score', 'record_type': 'Type'},
        color_discrete_map=type_colours,
    )
    fig.add_vline(x=0, line_dash='dash', line_color='gray')
    fig.show()

    # Per-type record count, mean and standard deviation, rounded to 4 decimals.
    comparison = (
        annotated.groupby('record_type')['hybrid_score']
        .agg(records='count', avg_sentiment='mean', std_dev='std')
        .round(4)
    )
    print("Sentiment summary by record type:")
    display(comparison)

Sentiment summary by record type:


Unnamed: 0_level_0,records,avg_sentiment,std_dev
record_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
comment,80,0.1048,0.4553
post,120,0.1048,0.4543


## 7. Sentiment by Subreddit

Different subreddits have different emotional baselines. r/Jordans skews positive (brand fans); r/SneakerMarket is more neutral (transactional). Comparing subreddit sentiment helps separate platform bias from genuine brand signal.

In [10]:
if 'subreddit' in annotated.columns:
    # Mean score and record count per subreddit, most positive first.
    sub_sent = (
        annotated.groupby('subreddit')['hybrid_score']
        .agg(avg_sentiment='mean', records='count')
        .sort_values('avg_sentiment', ascending=False)
        .reset_index()
        .round(4)
    )

    # Green above +0.05, red below -0.05, grey for the neutral band in between.
    colours = []
    for score in sub_sent['avg_sentiment']:
        if score > 0.05:
            colours.append('#22c55e')
        elif score < -0.05:
            colours.append('#ef4444')
        else:
            colours.append('#94a3b8')

    # Bar annotations: signed average plus the number of records behind it.
    bar_labels = [
        f"{row.avg_sentiment:+.3f} ({row.records:,} records)"
        for row in sub_sent.itertuples(index=False)
    ]

    subreddit_bars = go.Bar(
        x=sub_sent['avg_sentiment'],
        y=sub_sent['subreddit'],
        orientation='h',
        marker_color=colours,
        text=bar_labels,
        textposition='outside',
    )
    fig = go.Figure(subreddit_bars)
    fig.add_vline(x=0, line_dash='dash', line_color='gray')
    fig.update_layout(
        title='Average Sentiment by Subreddit',
        xaxis={'range': [-0.5, 0.7]},
        height=400,
        plot_bgcolor='white',
    )
    fig.show()

## 8. Shoe Model Detection

Beyond brands, the pipeline identifies specific shoe models (Air Jordan 1, Dunk Low, Yeezy 350, etc.) using alias-based pattern matching. This powers the price correlation analysis in Notebook 3.

In [11]:
from collections import Counter

from reddit_sentiment.detection.models import ModelDetector

detector = ModelDetector()

# Sample sentences that mention shoe models by alias or shorthand.
examples = [
    'Just picked up the AJ1 Bred Toe — worth every penny at retail',
    'Dunk Low pandas are finally restocking on Nike SNKRS',
    'My 990v6 is the comfiest shoe I have ever run in, beats UB23',
    'Yeezy 350 V2 Zebra at retail? No way, minimum 2x resale',
]

print("Model detection examples:\n")
for text in examples:
    # Only detect() is needed here: each mention it returns exposes the model
    # name, the matched alias and the retail price that are printed below.
    # (An extra detect_models() call whose result was never read was removed.)
    mentions = detector.detect(text)
    print(f'  Text: "{text}"')
    for m in mentions:
        print(f'    → {m.model} (matched alias: "{m.alias}", retail: ${m.retail_price:.0f})')
    print()

# Models found in the annotated dataset
if 'models' in annotated.columns:
    # Flatten the per-record model lists; entries that are not lists are skipped.
    all_models = [
        model
        for record_models in annotated['models'].dropna()
        for model in (record_models if isinstance(record_models, list) else [])
    ]
    # Keep the 15 most frequent models, then drop blank model names.
    model_counts = pd.DataFrame(Counter(all_models).most_common(15), columns=['model', 'mentions'])
    model_counts = model_counts[model_counts['model'].astype(str).str.strip().ne('')]
    if not model_counts.empty:
        fig = px.bar(
            model_counts, x='mentions', y='model', orientation='h',
            title='Top Shoe Models by Reddit Mentions',
            color='mentions', color_continuous_scale='Purples',
        )
        fig.show()
    else:
        print("No shoe model detections found in annotated data — ensure full_text column is populated.")

Model detection examples:

  Text: "Just picked up the AJ1 Bred Toe — worth every penny at retail"
    → Air Jordan 1 (matched alias: "AJ1", retail: $180)

  Text: "Dunk Low pandas are finally restocking on Nike SNKRS"
    → Dunk Low (matched alias: "Dunk Low", retail: $110)

  Text: "My 990v6 is the comfiest shoe I have ever run in, beats UB23"
    → NB 990 (matched alias: "990v6", retail: $185)
    → Ultraboost (matched alias: "UB23", retail: $190)

  Text: "Yeezy 350 V2 Zebra at retail? No way, minimum 2x resale"
    → Yeezy 350 (matched alias: "Yeezy 350", retail: $230)



## 9. Save Annotated Data

In [12]:
# Write the annotated frame to data/processed, creating the folder if needed.
# NOTE: if the load step fell back to the synthetic demo frame, this writes
# that demo data to the processed path.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

out_path = processed_dir / 'annotated.parquet'
annotated.to_parquet(out_path, index=False)
print(f'Saved: {out_path}')

Saved: ../data/processed/annotated.parquet
