# Hivedex - AI-Powered Prediction Validation Platform

**Proving that Reddit can predict real-world events before mainstream news.**

This notebook is designed to run in Hex. Upload it and the data files to create the interactive dashboard.

## Setup & Imports

In [None]:
# Core imports
import os
from datetime import datetime, timedelta

import altair as alt
import numpy as np
import pandas as pd
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Configure Altair: turn off its max-rows data transformer limit
alt.data_transformers.disable_max_rows()

# Palette for signal sources and outcome polarity
COLORS = dict(
    reddit="#FF4500",
    gdelt="#1E88E5",
    hivemind="#7C3AED",
    positive="#22C55E",
    negative="#EF4444",
    neutral="#6B7280",
)

# Palette keyed by event category
CATEGORY_COLORS = dict(
    stock="#3B82F6",
    movie="#EC4899",
    tech="#8B5CF6",
    gaming="#10B981",
    other="#F59E0B",
)

print("Hivedex initialized!")

## Load Data

In Hex, upload these files:
- `events_catalog.csv`
- `validation_results.csv`
- `manual_outcomes.csv`

In [None]:
# Load the three input files, falling back to demo-friendly defaults.
# In Hex, this would be a file upload or SQL query.

# Prefer a sibling ../data directory; otherwise use a local data/ folder
DATA_DIR = '../data' if os.path.exists('../data') else 'data'

# Events catalog ('#'-prefixed lines in the CSV are treated as comments)
try:
    events_df = pd.read_csv(f'{DATA_DIR}/events_catalog.csv', comment='#')
    print(f"Loaded {len(events_df)} events from catalog")
except FileNotFoundError:
    print("Events catalog not found - using sample data")
    # Ten placeholder events cycling through the five categories
    events_df = pd.DataFrame({
        'event_id': [f'sample_{i}' for i in range(10)],
        'event_name': [f'Sample Event {i}' for i in range(10)],
        'category': ['stock', 'movie', 'tech', 'gaming', 'other'] * 2,
        'event_date': pd.date_range('2024-01-01', periods=10, freq='W'),
        'subreddits': ['wallstreetbets,stocks'] * 10,
        'keywords': ['sample,test'] * 10,
        'ticker': [None] * 10,
        'expected_outcome': ['positive'] * 10
    })

# Validation results
try:
    validations_df = pd.read_csv(f'{DATA_DIR}/validation_results.csv')
    print(f"Loaded {len(validations_df)} validation results")
except FileNotFoundError:
    print("No validation results found - generating sample data...")
    # Use an empty frame rather than None so later `.empty` / `len()` checks
    # work without special-casing (same fallback style as manual_outcomes).
    validations_df = pd.DataFrame()

# Manual outcomes
try:
    manual_outcomes = pd.read_csv(f'{DATA_DIR}/manual_outcomes.csv', comment='#')
    print(f"Loaded {len(manual_outcomes)} manual outcomes")
except FileNotFoundError:
    print("Manual outcomes not found - using empty DataFrame")
    manual_outcomes = pd.DataFrame()

# Announce that sample validations are needed when nothing was loaded
if len(validations_df) == 0:
    print("Creating sample validation data for demo...")

In [None]:
def create_sample_validations(events_df):
    """Create sample validation results for demo purposes.

    Produces one synthetic validation row per event. Whether a prediction is
    marked correct is drawn from a per-category hit rate, and the predicted
    direction is kept consistent with that flag: it equals the actual outcome
    for correct rows and is the opposite for incorrect rows (when the actual
    outcome is 'positive' or 'negative').

    Args:
        events_df: DataFrame whose rows provide 'event_id', 'event_name',
            'category', 'event_date' and 'subreddits'. 'expected_outcome' is
            optional and defaults to 'positive'.

    Returns:
        pd.DataFrame with one row per event and the simulated metric columns.
    """
    # Private fixed-seed generator: output is reproducible without reseeding
    # NumPy's global RNG as a side effect.
    rng = np.random.RandomState(42)

    # Simulated hit rate per category (loop-invariant, so defined once)
    category_accuracy = {
        'stock': 0.68,
        'movie': 0.82,
        'tech': 0.71,
        'gaming': 0.75,
        'other': 0.65
    }
    opposite = {'positive': 'negative', 'negative': 'positive'}

    def consistent_direction(actual, is_correct, fallback):
        """Return a predicted direction that agrees with the correctness flag."""
        if actual in opposite:
            return actual if is_correct else opposite[actual]
        # Unknown or missing outcome: keep the random draw
        return fallback

    results = []
    for _, event in events_df.iterrows():
        base_accuracy = category_accuracy.get(event['category'], 0.70)
        is_correct = rng.random_sample() < base_accuracy
        actual_outcome = event.get('expected_outcome', 'positive')

        # Draws happen in field order, so the seeded values stay stable
        results.append({
            'event_id': event['event_id'],
            'event_name': event['event_name'],
            'category': event['category'],
            'event_date': event['event_date'],
            'subreddits': event['subreddits'],
            'reddit_posts_count': rng.randint(50, 500),
            'news_articles_count': rng.randint(20, 300),
            'reddit_peak_signal': rng.uniform(55, 95),
            'gdelt_peak_signal': rng.uniform(45, 85),
            'reddit_lead_days': rng.randint(3, 20) if is_correct else rng.randint(0, 5),
            'gdelt_lead_days': rng.randint(1, 10),
            'reddit_beats_news_by': rng.randint(2, 12) if is_correct else rng.randint(-3, 5),
            'predicted_direction': consistent_direction(
                actual_outcome,
                is_correct,
                fallback='positive' if rng.random_sample() > 0.3 else 'negative',
            ),
            'actual_outcome': actual_outcome,
            'prediction_correct': is_correct,
            'signal_strength': rng.uniform(50, 90),
            'avg_signal': rng.uniform(40, 75),
            'avg_sentiment': rng.uniform(-0.3, 0.5),
            'confidence': rng.uniform(55, 92)
        })

    return pd.DataFrame(results)

# Create sample if needed.
# validations_df can be undefined (load cell skipped), None (file missing),
# or an empty frame; check None before touching `.empty`, which would
# otherwise raise AttributeError.
if 'validations_df' not in dir() or validations_df is None or validations_df.empty:
    validations_df = create_sample_validations(events_df)
    print(f"Created {len(validations_df)} sample validation results")

---
# Tab 1: Validation Dashboard

Overview of prediction accuracy and recent results.

In [None]:
# Headline metrics for the dashboard banner.
# Accuracy and the prediction count use only rows with a recorded outcome;
# lead time and confidence are averaged over every row.
valid_results = validations_df.loc[validations_df['prediction_correct'].notna()]

total_predictions = len(valid_results)
overall_accuracy = 100 * valid_results['prediction_correct'].mean()
avg_confidence = validations_df['confidence'].mean()
avg_lead_time = validations_df['reddit_lead_days'].mean()

print(f"""
╔══════════════════════════════════════════════════════════════╗
║                    HIVEDEX SUMMARY                           ║
╠══════════════════════════════════════════════════════════════╣
║  Overall Accuracy:     {overall_accuracy:>6.1f}%                            ║
║  Total Predictions:    {total_predictions:>6}                              ║
║  Avg Lead Time:        {avg_lead_time:>6.1f} days                          ║
║  Avg Confidence:       {avg_confidence:>6.1f}%                            ║
╚══════════════════════════════════════════════════════════════╝
""")

In [None]:
# Accuracy by Category Chart
# Per-category hit rate (percent) plus the number of scored predictions
category_stats = valid_results.groupby('category', as_index=False).agg(
    accuracy=('prediction_correct', 'mean'),
    count=('prediction_correct', 'count'),
)
category_stats['accuracy'] = category_stats['accuracy'] * 100

# Fixed category -> colour mapping taken from CATEGORY_COLORS
category_palette = alt.Scale(
    domain=list(CATEGORY_COLORS),
    range=list(CATEGORY_COLORS.values()),
)

# Bars: one per category, ordered by descending accuracy
bars = (
    alt.Chart(category_stats)
    .mark_bar()
    .encode(
        x=alt.X('category:N', title='Category', sort='-y'),
        y=alt.Y('accuracy:Q', title='Accuracy %', scale=alt.Scale(domain=[0, 100])),
        color=alt.Color('category:N', scale=category_palette, legend=None),
        tooltip=['category', 'accuracy', 'count'],
    )
)

# Labels: prediction count drawn just above each bar
text = (
    alt.Chart(category_stats)
    .mark_text(align='center', baseline='bottom', dy=-5)
    .encode(
        x=alt.X('category:N', sort='-y'),
        y='accuracy:Q',
        text=alt.Text('count:Q', format='d'),
    )
)

# Target line at 73%
target_frame = pd.DataFrame({'y': [73]})
rule = (
    alt.Chart(target_frame)
    .mark_rule(color=COLORS['hivemind'], strokeDash=[5, 5], strokeWidth=2)
    .encode(y='y:Q')
)

category_chart = alt.layer(bars, text, rule).properties(
    width=500,
    height=350,
    title='Accuracy by Category (target: 73%)',
)

category_chart

In [None]:
# Lead Time Distribution
# Keep only rows that actually have a Reddit lead-time value
lead_data = validations_df.dropna(subset=['reddit_lead_days'])

# Colour each category's box using the shared category palette
lead_colors = alt.Color(
    'category:N',
    scale=alt.Scale(
        domain=list(CATEGORY_COLORS),
        range=list(CATEGORY_COLORS.values()),
    ),
    legend=None,
)

# Whiskers span the full min-max range of each category
lead_chart = (
    alt.Chart(lead_data)
    .mark_boxplot(extent='min-max')
    .encode(
        x=alt.X('category:N', title='Category'),
        y=alt.Y('reddit_lead_days:Q', title='Days Reddit Led'),
        color=lead_colors,
    )
    .properties(width=500, height=350, title='How Early Did Reddit Know?')
)

lead_chart

In [None]:
# Recent Predictions Table
# event_date is plain text when loaded from CSV, and nlargest() rejects
# object-dtype columns, so parse to datetimes and sort instead.
# Unparseable dates become NaT and sort last.
recent = (
    validations_df
    .assign(event_date=pd.to_datetime(validations_df['event_date'], errors='coerce'))
    .sort_values('event_date', ascending=False, kind='stable')
    .head(10)
)

# Anything that is neither True nor False (e.g. a missing outcome) is Pending
recent['Result'] = (
    recent['prediction_correct']
    .map({True: 'Correct', False: 'Wrong'})
    .fillna('Pending')
)

recent['Lead'] = recent['reddit_lead_days'].apply(
    lambda x: f"{x:.0f}d" if pd.notna(x) else "N/A"
)

display_cols = ['event_name', 'category', 'predicted_direction', 'actual_outcome', 'Result', 'Lead', 'confidence']
recent[display_cols].rename(columns={
    'event_name': 'Event',
    'category': 'Category',
    'predicted_direction': 'Prediction',
    'actual_outcome': 'Actual',
    'confidence': 'Confidence %'
})

---
# Tab 2: Event Deep Dive

Detailed analysis of individual events.

In [None]:
# Event Selector - In Hex, this would be a Dropdown input.
# Here we only list what is available: the total count and the first five names.
event_names = events_df['event_name'].to_list()

print(f"Available events: {len(event_names)}")
print("Sample events:")
for name in event_names[0:5]:
    print(f"  - {name}")

In [None]:
# In Hex, this would be linked to a dropdown input
selected_event = "NVIDIA Q3 2024 Earnings Beat"  # Default selection

# The default may be absent (e.g. when the sample catalog is in use); fall
# back to the first catalogued event instead of failing on .iloc[0].
if selected_event not in events_df['event_name'].values:
    fallback_event = events_df['event_name'].iloc[0]
    print(f"Event '{selected_event}' not found - showing '{fallback_event}' instead")
    selected_event = fallback_event

event_info = events_df[events_df['event_name'] == selected_event].iloc[0]

# An event can exist in the catalog without a validation row; show N/A then
validation_matches = validations_df[validations_df['event_name'] == selected_event]
if validation_matches.empty:
    event_validation = pd.Series({
        'predicted_direction': 'N/A',
        'actual_outcome': 'N/A',
        'prediction_correct': 'N/A',
        'reddit_lead_days': 'N/A',
    })
else:
    event_validation = validation_matches.iloc[0]

# Every field goes through str() before padding: a Timestamp formatted with
# ':<42' would be passed to strftime (printing "<42"), and slicing a missing
# (NaN) subreddits value would raise.
print(f"""
╔══════════════════════════════════════════════════════════════╗
║  EVENT: {selected_event[:50]:<50} ║
╠══════════════════════════════════════════════════════════════╣
║  Category:      {str(event_info['category']):<42} ║
║  Event Date:    {str(event_info['event_date']):<42} ║
║  Subreddits:    {str(event_info['subreddits'])[:40]:<42} ║
╠══════════════════════════════════════════════════════════════╣
║  PREDICTION RESULTS                                          ║
║  Predicted:     {str(event_validation['predicted_direction']):<42} ║
║  Actual:        {str(event_validation['actual_outcome']):<42} ║
║  Correct:       {str(event_validation['prediction_correct']):<42} ║
║  Lead Time:     {str(event_validation['reddit_lead_days']):<42} ║
╚══════════════════════════════════════════════════════════════╝
""")

In [None]:
# Generate sample signal timeline for the event
def generate_sample_timeline(event_date, days_before=30, days_after=10):
    """Generate sample signal data for visualization.

    Builds a daily series running from `days_before` days before the event to
    `days_after` days after it. Each signal is Gaussian noise around a
    baseline plus a bell-shaped bump positioned relative to the event date:
    the Reddit bump is centred 4 days before the event and the GDELT bump
    3 days after it. With the default window these are the same positions as
    the previous window-end-relative offsets (n - 15 and n - 8).

    Args:
        event_date: Anything `pd.to_datetime` accepts.
        days_before: Whole days of history to include before the event.
        days_after: Whole days to include after the event.

    Returns:
        pd.DataFrame with columns 'date', 'reddit_signal', 'gdelt_signal' and
        'hivemind_signal' (0.6 * reddit + 0.4 * gdelt).
    """
    event_dt = pd.to_datetime(event_date)
    dates = pd.date_range(
        start=event_dt - timedelta(days=days_before),
        end=event_dt + timedelta(days=days_after),
        freq='D'
    )

    # Private fixed-seed generator: reproducible output without reseeding
    # NumPy's global RNG as a side effect.
    rng = np.random.RandomState(42)
    n = len(dates)
    day_index = np.arange(n)
    # Position of the event within `dates` (the window starts days_before earlier)
    event_index = days_before

    # Reddit signal peaks before event
    reddit_base = 40 + rng.randn(n) * 5
    reddit_peak = np.exp(-((day_index - (event_index - 4))**2) / 100) * 40
    reddit_signal = np.clip(reddit_base + reddit_peak, 0, 100)

    # GDELT signal peaks closer to event
    gdelt_base = 35 + rng.randn(n) * 5
    gdelt_peak = np.exp(-((day_index - (event_index + 3))**2) / 80) * 45
    gdelt_signal = np.clip(gdelt_base + gdelt_peak, 0, 100)

    return pd.DataFrame({
        'date': dates,
        'reddit_signal': reddit_signal,
        'gdelt_signal': gdelt_signal,
        'hivemind_signal': reddit_signal * 0.6 + gdelt_signal * 0.4
    })

# Build the demo timeline around the selected event's date
timeline_df = generate_sample_timeline(event_date=event_info['event_date'])
print(f"Generated timeline with {len(timeline_df)} days")

In [None]:
# Signal Timeline Chart
# Reshape to long form (one row per date and source) with display labels
signal_labels = {
    'reddit_signal': 'Reddit',
    'gdelt_signal': 'News (GDELT)',
}
plot_data = timeline_df.melt(
    id_vars=['date'],
    value_vars=['reddit_signal', 'gdelt_signal'],
    var_name='signal_type',
    value_name='signal_value',
)
plot_data['signal_type'] = plot_data['signal_type'].map(signal_labels)

# One coloured line per signal source
source_colors = alt.Scale(
    domain=['Reddit', 'News (GDELT)'],
    range=[COLORS['reddit'], COLORS['gdelt']],
)
lines = (
    alt.Chart(plot_data)
    .mark_line(point=True, strokeWidth=2)
    .encode(
        x=alt.X('date:T', title='Date'),
        y=alt.Y('signal_value:Q', title='Signal Strength', scale=alt.Scale(domain=[0, 100])),
        color=alt.Color('signal_type:N', scale=source_colors, legend=alt.Legend(title='Signal Source')),
        tooltip=['date:T', 'signal_type:N', 'signal_value:Q'],
    )
)

# Event date marker: dashed vertical rule on the event day
event_marker_frame = pd.DataFrame({
    'date': [pd.to_datetime(event_info['event_date'])]
})
event_rule = (
    alt.Chart(event_marker_frame)
    .mark_rule(color=COLORS['negative'], strokeWidth=2, strokeDash=[5, 5])
    .encode(x='date:T')
)

timeline_chart = (
    alt.layer(lines, event_rule)
    .properties(width=700, height=400, title=f"Signal Timeline: {selected_event}")
    .interactive()
)

timeline_chart

---
# Tab 3: Live Signal Detector

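Monitor current hivemind signals. The demo cells below chart a randomly generated signal; the call to `fetch_live_reddit` is left commented out.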

In [None]:
# Arctic Shift API Functions
def fetch_live_reddit(subreddits, keywords, days=7):
    """Fetch recent Reddit data from Arctic Shift.

    Issues one search request per (subreddit, keyword) pair for posts whose
    title matches the keyword within the last `days` days. The fetch is
    best-effort: a failed request, non-200 response or unparseable body is
    reported with a printed message and skipped. A post returned by more than
    one search is kept only once (matched on its 'id' field, when present).

    Args:
        subreddits: Iterable of subreddit names.
        keywords: Iterable of title keywords.
        days: Size of the look-back window in days.

    Returns:
        pd.DataFrame with one row per unique post, or an empty DataFrame when
        nothing was retrieved.
    """
    base_url = "https://arctic-shift.photon-reddit.com/api/posts/search"
    # Take "now" once so both ends of the window share the same reference
    now = datetime.now()
    end_ts = int(now.timestamp())
    start_ts = int((now - timedelta(days=days)).timestamp())

    all_posts = []
    seen_ids = set()
    for subreddit in subreddits:
        for keyword in keywords:
            params = {
                'subreddit': subreddit,
                'title': keyword,
                'after': start_ts,
                'before': end_ts,
                'limit': 100,
                'sort': 'desc'
            }
            try:
                response = requests.get(base_url, params=params, timeout=30)
                if response.status_code != 200:
                    # Surface the failure instead of silently dropping it
                    print(f"Error: HTTP {response.status_code} for r/{subreddit} '{keyword}'")
                    continue
                payload = response.json()
            except (requests.RequestException, ValueError) as e:
                # Network failure or a body that is not valid JSON
                print(f"Error: {e}")
                continue

            posts = payload.get('data') if isinstance(payload, dict) else None
            for post in posts or []:
                post_id = post.get('id') if isinstance(post, dict) else None
                if post_id is not None:
                    # The same post can match several keywords; keep the first copy
                    if post_id in seen_ids:
                        continue
                    seen_ids.add(post_id)
                all_posts.append(post)

    return pd.DataFrame(all_posts) if all_posts else pd.DataFrame()

In [None]:
# Live Signal Configuration
# In Hex, these would be input widgets

# Subreddits and title keywords to watch
watchlist_subreddits = ['wallstreetbets', 'stocks', 'technology']
watchlist_keywords = ['NVDA', 'TSLA', 'AI']

# Echo the active watchlist, one line per setting
print(
    "Monitoring:",
    f"  Subreddits: {', '.join(watchlist_subreddits)}",
    f"  Keywords: {', '.join(watchlist_keywords)}",
    sep="\n",
)

In [None]:
# Fetch live data (comment out for demo without API calls)
# live_posts = fetch_live_reddit(watchlist_subreddits, watchlist_keywords)

# For demo, generate sample live signal.
# The seed comes from the clock, so the values differ from run to run.
np.random.seed(int(datetime.now().timestamp()) % 1000)
current_signal = np.random.uniform(45, 75)

# Signal trend (last 7 days): a random walk shifted so that its final point
# (today) equals current_signal. Otherwise the chart's last value would
# disagree with the "current" value printed below.
trend_dates = pd.date_range(end=datetime.now(), periods=7, freq='D')
walk_steps = np.cumsum(np.random.randn(7) * 3)
trend_signals = current_signal + (walk_steps - walk_steps[-1])
trend_signals = np.clip(trend_signals, 0, 100)

trend_df = pd.DataFrame({
    'date': trend_dates,
    'signal': trend_signals
})

print(f"\nCurrent Hivemind Signal: {current_signal:.1f}")
# Status bands: >= 70 is STRONG, >= 50 is MODERATE, anything lower is WEAK
signal_status = "STRONG" if current_signal >= 70 else ("MODERATE" if current_signal >= 50 else "WEAK")
print(f"Signal Status: {signal_status}")

In [None]:
# Live Signal Trend Chart
# Daily signal as a single line with point markers
trend_chart = (
    alt.Chart(trend_df)
    .mark_line(point=True, strokeWidth=3, color=COLORS['reddit'])
    .encode(
        x=alt.X('date:T', title='Date'),
        y=alt.Y('signal:Q', title='Signal', scale=alt.Scale(domain=[0, 100])),
        tooltip=['date:T', 'signal:Q'],
    )
)

# Threshold line: dashed horizontal rule at 70
threshold_frame = pd.DataFrame({'y': [70]})
threshold = (
    alt.Chart(threshold_frame)
    .mark_rule(color=COLORS['positive'], strokeDash=[5, 5], strokeWidth=2)
    .encode(y='y:Q')
)

live_chart = (
    alt.layer(trend_chart, threshold)
    .properties(width=600, height=300, title='7-Day Signal Trend')
    .interactive()
)

live_chart

---
# Tab 4: AI Query Interface

Ask questions about the data. Queries are answered by keyword matching in `process_query`, not by a language model.

In [None]:
def process_query(query, validations_df, events_df):
    """Process natural language query about the data.

    Matches keywords in `query` (case-insensitively) against a fixed list of
    topics and returns a text answer. Topics are checked in this order and the
    first match wins: accuracy, lead time, best predictions, wrong
    predictions, subreddit performance. Anything else gets a help message.

    Args:
        query: The user's question.
        validations_df: Validation results. The branches read the columns
            'category', 'prediction_correct', 'reddit_lead_days',
            'reddit_beats_news_by', 'confidence', 'event_name',
            'predicted_direction' and 'actual_outcome'.
        events_df: Accepted for interface compatibility; not used.

    Returns:
        str: The answer text.
    """
    query_lower = query.lower()

    # Accuracy queries: one category if named, otherwise overall
    if 'accuracy' in query_lower:
        for category in ('stock', 'movie', 'tech', 'gaming'):
            if category in query_lower:
                label = f"{category.capitalize()} prediction accuracy"
                outcomes = validations_df.loc[
                    validations_df['category'] == category, 'prediction_correct'
                ]
                break
        else:
            label = "Overall prediction accuracy"
            outcomes = validations_df['prediction_correct']

        outcomes = outcomes.dropna()
        if outcomes.empty:
            # Avoid reporting "nan%" when there is nothing to average
            return f"{label}: no validated predictions available"
        return f"{label}: {outcomes.mean() * 100:.1f}%"

    # Lead time queries
    if 'lead time' in query_lower or 'early' in query_lower or 'before' in query_lower:
        avg_lead = validations_df['reddit_lead_days'].mean()
        avg_beats = validations_df['reddit_beats_news_by'].mean()
        # NOTE: the "Best performers" line is static text, not computed from the data
        return f"""Lead Time Analysis:
- Average Reddit lead: {avg_lead:.1f} days before event
- Reddit beats news by: {avg_beats:.1f} days on average
- Best performers: Movies and Gaming categories"""

    # Best predictions
    if 'best' in query_lower or 'top' in query_lower:
        best = validations_df.nlargest(5, 'confidence')[['event_name', 'confidence', 'reddit_lead_days']]
        return "Top 5 Most Confident Predictions:\n" + "".join(
            f"- {row['event_name']}: {row['confidence']:.0f}% confidence, {row['reddit_lead_days']:.0f} day lead\n"
            for _, row in best.iterrows()
        )

    # Wrong predictions (first five rows explicitly marked incorrect)
    if 'wrong' in query_lower or 'incorrect' in query_lower or 'failed' in query_lower:
        wrong = validations_df[validations_df['prediction_correct'] == False].head(5)
        return "Notable Incorrect Predictions:\n" + "".join(
            f"- {row['event_name']}: Predicted {row['predicted_direction']}, was {row['actual_outcome']}\n"
            for _, row in wrong.iterrows()
        )

    # Subreddit performance
    if 'subreddit' in query_lower or 'reddit' in query_lower:
        # NOTE: static text; these figures are not computed from validations_df
        return """Subreddit Performance (by category accuracy):
- r/movies: 82% accuracy on box office predictions
- r/wallstreetbets: 68% accuracy on stock movements
- r/technology: 71% accuracy on product launches
- r/gaming: 75% accuracy on game reception"""

    # Help text. Built from adjacent literals because the final line ends with
    # a double quote, which cannot directly precede a closing triple quote.
    return (
        "I can answer questions about:\n"
        "- Prediction accuracy (overall or by category)\n"
        "- Lead times (how early Reddit predicted)\n"
        "- Best/worst predictions\n"
        "- Subreddit performance\n"
        "\n"
        'Try asking: "What is the accuracy for movies?" or "How early does Reddit predict?"'
    )

# Example queries: heading followed by a 50-character rule
print("Sample Queries:", "=" * 50, sep="\n")

In [None]:
# Example query 1: overall accuracy
query = "What is the overall accuracy?"
print(f"Q: {query}")
answer = process_query(query, validations_df, events_df)
print(f"A: {answer}", end="\n\n")

In [None]:
# Example query 2: lead time ("early" keyword)
query = "How early does Reddit predict events?"
print(f"Q: {query}")
answer = process_query(query, validations_df, events_df)
print(f"A: {answer}", end="\n\n")

In [None]:
# Example query 3: most confident predictions ("best" keyword)
query = "Show me the best predictions"
print(f"Q: {query}")
answer = process_query(query, validations_df, events_df)
print(f"A: {answer}")

In [None]:
# Example query 4: subreddit performance ("subreddit" keyword)
query = "Which subreddits are most accurate?"
print(f"Q: {query}")
answer = process_query(query, validations_df, events_df)
print(f"A: {answer}")

---
# Summary Statistics

In [None]:
# Final summary: headline numbers, then a per-category breakdown
banner = "=" * 60
print(banner)
print("HIVEDEX - FINAL SUMMARY")
print(banner)

print(f"\nTotal Events Analyzed: {len(validations_df)}")
print(f"Overall Accuracy: {validations_df['prediction_correct'].mean()*100:.1f}%")
print(f"Average Lead Time: {validations_df['reddit_lead_days'].mean():.1f} days")
print(f"Average Confidence: {validations_df['confidence'].mean():.1f}%")

print("\nAccuracy by Category:")
# Categories are listed in order of first appearance
for cat in validations_df['category'].unique():
    cat_rows = validations_df[validations_df['category'] == cat]
    cat_acc = cat_rows['prediction_correct'].mean() * 100
    cat_count = len(cat_rows)
    print(f"  {cat}: {cat_acc:.1f}% ({cat_count} events)")

print("\n" + banner)
print("The hivemind is real. Reddit knows before the news.")
print(banner)