# Notebook 3: Brand Insights & Report Generation

This notebook covers:
1. Brand sentiment comparison
2. Narrative theme extraction
3. Retail channel attribution
4. Sentiment trends over time
5. Generating the full HTML report
6. Shoe model intelligence
7. Brand mention distribution by subreddit
8. Purchase intent funnel
9. Brand sentiment trend over time
10. Posts vs comments per brand

> **Prerequisites:** `data/processed/annotated.parquet` must exist (run Notebook 2 or `make analyze`).


In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

In [None]:
# Load the annotated corpus produced by the analysis step. When the parquet
# file is missing, build a small synthetic frame with the same columns so the
# remaining cells can still be executed.
annotated_path = Path('../data/processed/annotated.parquet')

if not annotated_path.exists():
    # Generate synthetic demo data
    import random
    random.seed(42)  # fixed seed: the demo frame is identical on every run

    # Candidate values for the list-valued and categorical columns. The empty
    # lists and None entries stand for records with no brand/channel/intent.
    brands_pool = [['Nike'], ['Adidas'], ['New Balance'], ['Hoka'], ['Under Armour'],
                   ['Nike', 'Adidas'], ['Li-Ning'], ['Puma'], ['Asics'], []]
    channels_pool = [['StockX'], ['GOAT'], ['Nike Direct'], ['Foot Locker'],
                     ['StockX', 'GOAT'], [], ['Amazon'], ['Grailed']]
    intents = ['completed_purchase', 'seeking_purchase', 'price_discussion',
               'availability_info', 'purchase_consideration', None, None, None]
    subreddit_pool = ['Sneakers', 'Nike', 'Adidas', 'Running']
    first_timestamp = pd.Timestamp('2024-01-01', tz='UTC')

    def _demo_record(i):
        """Return the i-th synthetic record as a plain dict."""
        # The random draws happen in a fixed sequence; reordering them would
        # change the data generated from the seed.
        brands = random.choice(brands_pool)
        sentiment = random.gauss(0.15, 0.4)
        subreddit = random.choice(subreddit_pool)
        score = random.randint(1, 500)
        channels = random.choice(channels_pool)
        primary_intent = random.choice(intents)
        return {
            'id': f'r{i}',
            'subreddit': subreddit,
            # First half of the records are posts, second half comments
            'record_type': 'post' if i < 250 else 'comment',
            'score': score,
            # Records are spaced three hours apart starting 2024-01-01 UTC
            'created_utc': first_timestamp + pd.Timedelta(hours=i*3),
            'full_text': f'Sample text about sneakers and comfort quality hype #{i}',
            # Both score columns carry the same simulated value
            'vader_score': sentiment,
            'hybrid_score': sentiment,
            'transformer_score': None,
            'brands': brands,
            'channels': channels,
            'primary_intent': primary_intent,
            'all_intents': [],
        }

    df = pd.DataFrame([_demo_record(i) for i in range(500)])
    print(f'Using synthetic demo data: {len(df):,} records')
else:
    df = pd.read_parquet(annotated_path)
    print(f'Loaded {len(df):,} annotated records')

## 1. Brand Sentiment Comparison

In [None]:
from reddit_sentiment.analysis.brand_comparison import BrandComparisonAnalyzer

# A single analyzer instance yields both views of the data: `metrics`
# (a mapping whose keys are the detected brands) and `table` (the comparison
# frame that later cells sort and plot via its 'brand' / 'avg_sentiment'
# columns).
analyzer = BrandComparisonAnalyzer()
metrics, table = analyzer.compute(df), analyzer.comparison_table(df)

detected_brands = list(metrics.keys())
print(f'Brands detected: {detected_brands}')
display(table)

In [None]:
# Horizontal bar chart of average sentiment per brand, lowest score first
def _sentiment_colour(score):
    """Return green above +0.05, red below -0.05, grey otherwise."""
    if score > 0.05:
        return '#22c55e'
    if score < -0.05:
        return '#ef4444'
    return '#94a3b8'


brands_sorted = table.sort_values('avg_sentiment', ascending=True)
avg_scores = brands_sorted['avg_sentiment']
colours = [_sentiment_colour(score) for score in avg_scores]

bars = go.Bar(
    x=avg_scores,
    y=brands_sorted['brand'],
    orientation='h',
    marker_color=colours,
    # Signed, three-decimal label drawn next to each bar
    text=[format(score, '+.3f') for score in avg_scores],
    textposition='outside',
)
fig = go.Figure(bars)
fig.update_layout(
    title='Brand Sentiment Comparison',
    # Fixed symmetric axis so the charts share the same -1..1 scale
    xaxis={'title': 'Avg. Hybrid Sentiment Score', 'range': [-1, 1]},
    height=450,
    plot_bgcolor='white',
)
# Dashed reference line at neutral sentiment
fig.add_vline(x=0, line_dash='dash', line_color='gray')
fig.show()

## 2. Narrative Theme Analysis

In [None]:
from reddit_sentiment.analysis.narrative import NarrativeThemeExtractor

extractor = NarrativeThemeExtractor()
narrative = extractor.extract(df)

# One row per theme, most frequent first. A theme missing from
# `theme_percentages` is shown with a percentage of 0.
theme_rows = [
    {'theme': theme, 'count': count, 'pct': narrative.theme_percentages.get(theme, 0)}
    for theme, count in sorted(
        narrative.theme_counts.items(), key=lambda item: item[1], reverse=True
    )
]
# Naming the columns explicitly keeps the frame well-formed when no theme was
# found; a frame built from an empty list would otherwise have no columns and
# the chart below would fail on the missing 'pct' / 'theme' fields.
theme_df = pd.DataFrame(theme_rows, columns=['theme', 'count', 'pct'])
display(theme_df)

if theme_df.empty:
    print('No narrative themes detected.')
else:
    fig = px.bar(
        theme_df, x='pct', y='theme', orientation='h',
        title='Narrative Theme Frequency (% of corpus)',
        labels={'pct': '% of Posts', 'theme': 'Theme'}
    )
    fig.show()

In [None]:
# Print up to 25 of the extracted TF-IDF terms; nothing is printed when the
# extractor returned none.
top_terms = narrative.top_tfidf_terms
if top_terms:
    print('Top TF-IDF terms:', ', '.join(top_terms[:25]), sep='\n')

## 3. Retail Channel Attribution

In [None]:
from reddit_sentiment.analysis.channel_attribution import ChannelAttributionAnalyzer

ch_analyzer = ChannelAttributionAnalyzer()
attribution = ch_analyzer.analyze(df)

# Text summary: channels listed from largest to smallest share
print('Channel Share:')
for ch, share in sorted(attribution.channel_share.items(), key=lambda item: item[1], reverse=True):
    # .get avoids a KeyError should a channel appear in the share mapping only
    print(f'  {ch}: {share:.1f}% ({attribution.channel_counts.get(ch, 0)} mentions)')

# Pie chart
if attribution.channel_counts:
    ch_df = (
        pd.DataFrame(list(attribution.channel_counts.items()), columns=['channel', 'count'])
        # Sort before truncating so head(8) really is the eight most-mentioned
        # channels rather than the first eight in mapping order.
        .sort_values('count', ascending=False, kind='stable')
    )
    fig = px.pie(ch_df.head(8), values='count', names='channel',
                 title='Retail Channel Share', hole=0.3)
    fig.show()

## 4. Sentiment Trends

In [None]:
from reddit_sentiment.analysis.trends import SentimentTrendAnalyzer

trend_analyzer = SentimentTrendAnalyzer()
trends = trend_analyzer.analyze(df)

# Plot and show the monthly aggregate only when the analyzer produced rows
monthly = trends.monthly
if not monthly.empty:
    fig = px.line(
        monthly,
        x='period',
        y='avg_sentiment',
        title='Monthly Average Sentiment',
        markers=True,
    )
    # Dashed reference line at neutral sentiment
    fig.add_hline(y=0, line_dash='dash', line_color='gray')
    # Pin the axis to -1..1 so the scale does not depend on the data
    fig.update_layout(yaxis={'range': [-1, 1]})
    fig.show()
    display(monthly)

## 5. Generate Full HTML Report

In [None]:
from reddit_sentiment.reporting.generator import ReportGenerator

# Reports are written under ../data/reports; generate() hands back the paths
# of the HTML and Markdown outputs, in that order.
reports_dir = Path('../data/reports')
generator = ReportGenerator(reports_dir=reports_dir)
html_path, md_path = generator.generate(df)

print(
    f'HTML Report: {html_path}',
    f'Markdown:    {md_path}',
    '\nOpen the HTML report in your browser to see the full interactive dashboard!',
    sep='\n',
)

## 6. Shoe Model Intelligence

In [None]:
from reddit_sentiment.analysis.price_correlation import PriceCorrelationAnalyzer
import pandas as pd

corr = PriceCorrelationAnalyzer()
# The second argument is an empty frame because no eBay data is available yet
result = corr.analyze(df, pd.DataFrame())

# Keep only the models mentioned at least five times
summary = result.summary_df
frequently_mentioned = summary['mentions'] >= 5
top_models = summary.loc[frequently_mentioned]
print(f'Shoe models with 5+ mentions: {len(top_models)}')
display(top_models)

In [None]:
from reddit_sentiment.reporting.charts import model_mentions_bar
import plotly.io as pio

# model_mentions_bar() returns the chart as JSON, which plotly turns back into
# a figure object here. Without any signals there is nothing to draw.
if not result.signals:
    print('No shoe model signals detected — run reddit-sentiment analyze first.')
else:
    chart_json = model_mentions_bar(result.signals)
    fig = pio.from_json(chart_json)
    fig.show()

### eBay Price Integration (Premium Column)

The `price_premium_%` column above is currently empty because no eBay data has been collected yet.

To populate it:
1. Set `EBAY_APP_ID` in `.env` (register free at [developer.ebay.com](https://developer.ebay.com))
2. Run `reddit-sentiment collect --ebay` to fetch sold listings for detected models
3. Re-run the two code cells above — `avg_sold_price` and `price_premium_%` will be populated

Once collected, `PriceCorrelationAnalyzer` computes the Pearson correlation between Reddit sentiment scores and eBay resale premiums, revealing which hype signals actually predict resale value.

## 7. Brand Mention Distribution by Subreddit

Which brands dominate which communities?

In [None]:
# Reshape to one row per (record, brand) pair, keeping only records that carry
# at least one brand and only brands present in `metrics`.
mentions_brand = df['brands'].map(lambda tagged: len(tagged) > 0)
brand_sub = df[mentions_brand].copy().explode('brands')
in_metrics = brand_sub['brands'].isin(metrics.keys())
brand_sub = brand_sub[in_metrics]

# Brand x subreddit mention counts; combinations that never occur become 0
pair_counts = brand_sub.groupby(['brands', 'subreddit']).size()
pivot = pair_counts.unstack(fill_value=0)

fig = px.imshow(
    pivot,
    text_auto=True,  # print the count inside each cell
    aspect='auto',
    color_continuous_scale='Blues',
    labels={'color': 'Mentions'},
    title='Brand Mention Heatmap by Subreddit',
)
fig.update_layout(height=400)
fig.show()

## 8. Purchase Intent Funnel

How far along the purchase journey are Reddit users?

In [None]:
# One row per intent with a non-zero count, largest first. Intent keys such as
# 'seeking_purchase' are turned into display labels ('Seeking Purchase').
intent_rows = [
    {'intent': name.replace('_', ' ').title(), 'count': count}
    for name, count in sorted(
        attribution.intent_funnel.items(), key=lambda item: item[1], reverse=True
    )
    if count > 0
]
# Explicit columns: without them an empty row list gives a frame with no
# columns, and the 'intent' / 'count' lookups below would raise KeyError.
intent_df = pd.DataFrame(intent_rows, columns=['intent', 'count'])

colours = ['#6366f1', '#8b5cf6', '#a855f7', '#c084fc', '#e879f9', '#f0abfc', '#f5d0fe']

if intent_df.empty:
    print('No purchase intent signals detected.')
else:
    # Cycle through the palette so every stage gets a colour even when there
    # are more stages than palette entries.
    stage_colours = [colours[i % len(colours)] for i in range(len(intent_df))]
    fig = go.Figure(go.Funnel(
        y=intent_df['intent'],
        x=intent_df['count'],
        textinfo='value+percent initial',
        marker=dict(color=stage_colours)
    ))
    fig.update_layout(title='Purchase Intent Funnel', height=420)
    fig.show()

## 9. Brand Sentiment Trend Over Time

Weekly sentiment trajectory for the top brands — which brands are gaining or losing momentum?

In [None]:
# One row per (record, brand) pair for records that mention at least one brand
brand_time = df[df['brands'].map(lambda x: len(x) > 0)].copy()
brand_time = brand_time.explode('brands')

# Keep top 5 brands by mention count. The counts are taken from the exploded
# rows themselves (restricted to brands listed in `table`), so the selection
# does not depend on how `table` happens to be ordered.
brand_time = brand_time[brand_time['brands'].isin(table['brand'])]
top5 = brand_time['brands'].value_counts().head(5).index.tolist()
# .copy() gives an independent frame, so adding the 'week' column below does
# not trigger pandas' SettingWithCopyWarning.
brand_time = brand_time[brand_time['brands'].isin(top5)].copy()

if 'created_utc' in brand_time.columns and not brand_time.empty:
    timestamps = pd.to_datetime(brand_time['created_utc'])
    # Periods carry no timezone; dropping it explicitly (wall-clock time is
    # preserved) avoids the warning to_period() emits for tz-aware input.
    if timestamps.dt.tz is not None:
        timestamps = timestamps.dt.tz_localize(None)
    # Label every record with the start of its week
    brand_time['week'] = timestamps.dt.to_period('W').dt.start_time

    # Mean hybrid sentiment per brand and week
    weekly_brand = (
        brand_time.groupby(['week', 'brands'])['hybrid_score']
        .mean()
        .reset_index()
    )
    fig = px.line(
        weekly_brand, x='week', y='hybrid_score', color='brands',
        title='Weekly Sentiment Trend by Brand (Top 5)',
        markers=True,
        labels={'hybrid_score': 'Avg. Sentiment', 'week': 'Week', 'brands': 'Brand'}
    )
    # Dashed reference line at neutral sentiment
    fig.add_hline(y=0, line_dash='dash', line_color='gray')
    fig.update_layout(yaxis=dict(range=[-1, 1]), height=450)
    fig.show()
else:
    print('No timestamp data available for trend analysis.')

## 10. Posts vs Comments per Brand

Are brands discussed more in posts or in comment threads?

In [None]:
if 'record_type' not in df.columns:
    print('record_type column not found.')
else:
    # One row per (record, brand) pair, limited to brands present in `metrics`
    has_brand = df['brands'].map(lambda tagged: len(tagged) > 0)
    brand_type = df[has_brand].copy().explode('brands')
    in_metrics = brand_type['brands'].isin(metrics.keys())
    brand_type = brand_type[in_metrics]

    # Number of records per brand and record type
    pair_sizes = brand_type.groupby(['brands', 'record_type']).size()
    breakdown = pair_sizes.reset_index(name='count')

    # Brands are laid out left to right by total mentions, largest first
    totals = breakdown.groupby('brands')['count'].sum()
    order = totals.sort_values(ascending=False).index.tolist()

    fig = px.bar(
        breakdown,
        x='brands',
        y='count',
        color='record_type',
        barmode='stack',
        category_orders={'brands': order},
        color_discrete_map={'post': '#6366f1', 'comment': '#22c55e'},
        labels={'count': 'Records', 'brands': 'Brand', 'record_type': 'Type'},
        title='Posts vs Comments per Brand',
    )
    fig.update_layout(height=420)
    fig.show()