# Web Scraping Pipeline with FlowerPower

**Execution:** `uvx --with "flowerpower[rq],requests>=2.28.0,beautifulsoup4>=4.11.0,pandas>=2.0.0,matplotlib,seaborn" jupyter lab`

This notebook demonstrates web scraping using FlowerPower's JobQueueManager.

## Quick Start

In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
from collections import Counter

# Add FlowerPower source to path
sys.path.insert(0, str(Path().absolute().parents[2] / "src"))

from flowerpower.flowerpower import FlowerPowerProject

# Initialize project
project = FlowerPowerProject.load(".")

print("🌐 FlowerPower Web Scraping Pipeline")
print(f"📁 Project: {project.pipeline_manager.project_cfg.name}")
print(f"🎯 Pipeline: news_scraper")
print(f"⏰ Scrape time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

  from .autonotebook import tqdm as notebook_tqdm
2025-09-26 16:43:18,899	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


🌐 FlowerPower Web Scraping Pipeline
📁 Project: web-scraping-pipeline
🎯 Pipeline: news_scraper
⏰ Scrape time: 2025-09-26 16:43:18


In [2]:
# Quick scraping execution
result = project.pipeline_manager.run(
    "news_scraper",
    inputs={"scrape_timestamp": datetime.now().isoformat()},
    final_vars=["processed_articles"]
)

print("✅ News scraping completed!")
if "processed_articles" in result:
    info = result["processed_articles"]
    print(f"📄 Articles saved to: {info['output_file']}")
    print(f"📊 Total articles: {info['total_articles']}")
    print(f"🌐 Sources: {info['unique_sources']}")
    print(f"📈 Average length: {info['average_content_length']:.0f} chars")

[32m2025-09-26 16:43:20.769[0m | [1mINFO    [0m | [36mflowerpower.pipeline.pipeline[0m:[36m_execute_with_retry[0m:[36m223[0m - [1m🚀 Running pipeline 'news_scraper' (attempt 1/4)[0m
[32m2025-09-26 16:43:25.523[0m | [32m[1mSUCCESS [0m | [36mflowerpower.pipeline.pipeline[0m:[36m_execute_with_retry[0m:[36m232[0m - [32m[1m✅ Pipeline 'news_scraper' completed successfully in 4 seconds[0m


✅ News scraping completed!
📄 Articles saved to: /home/volker/coding/flowerpower/.worktree/code-simplification-analysis/examples/web-scraping-pipeline/output/articles_20250926_164325.json
📊 Total articles: 0
🌐 Sources: 0
📈 Average length: 0 chars


## 1. Scraped Data Analysis

In [3]:
# Load and analyze scraped news data
data_file = "data/news_articles.csv"

if Path(data_file).exists():
    df = pd.read_csv(data_file)
    print(f"📊 News Dataset Overview")
    print(f"📈 Total articles: {len(df):,}")
    print(f"📰 Columns: {list(df.columns)}")
    
    if 'published_date' in df.columns:
        df['published_date'] = pd.to_datetime(df['published_date'])
        print(f"📅 Date range: {df['published_date'].min()} to {df['published_date'].max()}")
    
    # Display sample articles
    print("\n🔍 Sample Articles:")
    display(df.head())
    
    # Basic statistics
    print("\n📊 Content Statistics:")
    if 'content' in df.columns:
        df['content_length'] = df['content'].str.len()
        print(f"   • Average content length: {df['content_length'].mean():.0f} characters")
        print(f"   • Longest article: {df['content_length'].max():,} characters")
        print(f"   • Shortest article: {df['content_length'].min():,} characters")
    
    if 'source' in df.columns:
        source_counts = df['source'].value_counts()
        print(f"\n🌐 Sources ({len(source_counts)} unique):")
        for source, count in source_counts.head(5).items():
            print(f"   • {source}: {count} articles")
    
    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # Content length distribution
    if 'content_length' in df.columns:
        df['content_length'].hist(bins=30, ax=axes[0, 0], alpha=0.7)
        axes[0, 0].set_title('Article Length Distribution')
        axes[0, 0].set_xlabel('Content Length (characters)')
        axes[0, 0].set_ylabel('Frequency')
    
    # Articles by source
    if 'source' in df.columns:
        top_sources = df['source'].value_counts().head(8)
        top_sources.plot(kind='bar', ax=axes[0, 1], color='lightblue')
        axes[0, 1].set_title('Articles by Source')
        axes[0, 1].set_xlabel('Source')
        axes[0, 1].set_ylabel('Article Count')
    
    # Articles by date
    if 'published_date' in df.columns:
        daily_counts = df.groupby(df['published_date'].dt.date).size()
        axes[1, 0].plot(daily_counts.index, daily_counts.values, marker='o')
        axes[1, 0].set_title('Articles Over Time')
        axes[1, 0].set_xlabel('Date')
        axes[1, 0].set_ylabel('Article Count')
        axes[1, 0].tick_params(axis='x', rotation=45)
    
    # Content length vs source
    if 'content_length' in df.columns and 'source' in df.columns:
        avg_length_by_source = df.groupby('source')['content_length'].mean().sort_values(ascending=False).head(8)
        avg_length_by_source.plot(kind='barh', ax=axes[1, 1], color='lightgreen')
        axes[1, 1].set_title('Average Content Length by Source')
        axes[1, 1].set_xlabel('Average Content Length')
    
    plt.tight_layout()
    plt.show()
    
else:
    print(f"⚠️ Data file not found: {data_file}")
    print("💡 Run the scraping pipeline first to generate data")

⚠️ Data file not found: data/news_articles.csv
💡 Run the scraping pipeline first to generate data


## 2. Pipeline Configuration Experiments

In [None]:
# Experiment with different scraping configurations
experiments = [
    {
        "name": "Quick Scrape",
        "config": {
            "max_articles": 10,
            "request_delay": 0.5,
            "timeout": 10
        }
    },
    {
        "name": "Deep Scrape",
        "config": {
            "max_articles": 50,
            "request_delay": 2.0,
            "timeout": 30,
            "extract_keywords": True,
            "sentiment_analysis": True
        }
    },
    {
        "name": "Tech Focus",
        "config": {
            "categories": ["technology", "ai", "software"],
            "max_articles": 25,
            "language_filter": "en",
            "extract_keywords": True
        }
    }
]

experiment_results = []

print("🧪 Running Scraping Experiments")
print("==============================")

for exp in experiments:
    print(f"\n🔄 {exp['name']} experiment...")
    
    # Add scrape timestamp to config
    config = exp['config'].copy()
    config['scrape_timestamp'] = datetime.now().isoformat()
    
    try:
        result = project.pipeline_manager.run(
            "news_scraper",
            inputs=config,
            final_vars=["processed_articles"]
        )
        
        if "processed_articles" in result:
            info = result["processed_articles"]
            experiment_results.append({
                "name": exp['name'],
                "total_articles": info['total_articles'],
                "unique_sources": info['unique_sources'],
                "avg_length": info['average_content_length']
            })
            
            print(f"   ✅ Articles: {info['total_articles']}, Sources: {info['unique_sources']}")
        else:
            print(f"   ❌ Experiment failed")
            
    except Exception as e:
        print(f"   ❌ Error: {e}")

# Compare experiment results
if experiment_results:
    print("\n📊 Experiment Comparison")
    print("========================")
    
    results_df = pd.DataFrame(experiment_results)
    display(results_df)
    
    # Visualize comparison
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    
    # Articles comparison
    axes[0].bar(results_df['name'], results_df['total_articles'], color='skyblue')
    axes[0].set_title('Articles Scraped')
    axes[0].set_ylabel('Article Count')
    axes[0].tick_params(axis='x', rotation=45)
    
    # Sources comparison
    axes[1].bar(results_df['name'], results_df['unique_sources'], color='lightgreen')
    axes[1].set_title('Unique Sources')
    axes[1].set_ylabel('Source Count')
    axes[1].tick_params(axis='x', rotation=45)
    
    # Average length comparison
    axes[2].bar(results_df['name'], results_df['avg_length'], color='lightcoral')
    axes[2].set_title('Average Content Length')
    axes[2].set_ylabel('Characters')
    axes[2].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    # Find best experiment
    best_exp = results_df.loc[results_df['total_articles'].idxmax()]
    print(f"\n🏆 Most articles: {best_exp['name']} ({best_exp['total_articles']} articles)")

## 3. Content Analysis and Processing

In [None]:
# Run comprehensive content analysis
analysis_result = project.pipeline_manager.run(
    "news_scraper",
    inputs={
        "scrape_timestamp": datetime.now().isoformat(),
        "max_articles": 100,
        "extract_keywords": True,
        "sentiment_analysis": True,
        "language_detection": True,
        "content_classification": True
    },
    final_vars=[
        "processed_articles",
        "content_analysis",
        "keyword_summary",
        "sentiment_summary"
    ]
)

print("🔍 Content Analysis Results")
print("===========================")

# Processed articles info
if "processed_articles" in analysis_result:
    articles = analysis_result["processed_articles"]
    print(f"\n📄 Processed Articles:")
    print(f"   • Total articles: {articles['total_articles']}")
    print(f"   • Output file: {articles['output_file']}")
    print(f"   • Processing time: {articles.get('processing_time', 'N/A')}s")

# Content analysis
if "content_analysis" in analysis_result:
    analysis = analysis_result["content_analysis"]
    print(f"\n📊 Content Analysis:")
    print(f"   • Total words: {analysis['total_words']:,}")
    print(f"   • Unique words: {analysis['unique_words']:,}")
    print(f"   • Average words per article: {analysis['avg_words_per_article']:.0f}")
    print(f"   • Languages detected: {analysis['languages_detected']}")

# Keyword analysis
if "keyword_summary" in analysis_result:
    keywords = analysis_result["keyword_summary"]
    print(f"\n🔑 Keyword Analysis:")
    print(f"   • Total keywords: {keywords['total_keywords']}")
    print(f"   • Unique keywords: {keywords['unique_keywords']}")
    
    # Show top keywords
    if 'top_keywords' in keywords:
        print(f"   • Top 10 keywords:")
        for i, (keyword, count) in enumerate(keywords['top_keywords'][:10], 1):
            print(f"     {i:2d}. {keyword}: {count} occurrences")

# Sentiment analysis
if "sentiment_summary" in analysis_result:
    sentiment = analysis_result["sentiment_summary"]
    print(f"\n😊 Sentiment Analysis:")
    print(f"   • Average sentiment: {sentiment['average_sentiment']:.3f}")
    print(f"   • Positive articles: {sentiment['positive_count']} ({sentiment['positive_percentage']:.1f}%)")
    print(f"   • Negative articles: {sentiment['negative_count']} ({sentiment['negative_percentage']:.1f}%)")
    print(f"   • Neutral articles: {sentiment['neutral_count']} ({sentiment['neutral_percentage']:.1f}%)")

# Create visualizations
if all(key in analysis_result for key in ["keyword_summary", "sentiment_summary"]):
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Top keywords
    if 'top_keywords' in analysis_result["keyword_summary"]:
        top_keywords = analysis_result["keyword_summary"]["top_keywords"][:15]
        keyword_names = [k[0] for k in top_keywords]
        keyword_counts = [k[1] for k in top_keywords]
        
        axes[0, 0].barh(range(len(keyword_names)), keyword_counts, color='lightblue')
        axes[0, 0].set_yticks(range(len(keyword_names)))
        axes[0, 0].set_yticklabels(keyword_names)
        axes[0, 0].set_title('Top 15 Keywords')
        axes[0, 0].set_xlabel('Frequency')
    
    # Sentiment distribution
    sentiment_data = analysis_result["sentiment_summary"]
    sentiment_labels = ['Positive', 'Negative', 'Neutral']
    sentiment_values = [sentiment_data['positive_percentage'], 
                       sentiment_data['negative_percentage'], 
                       sentiment_data['neutral_percentage']]
    
    colors = ['lightgreen', 'lightcoral', 'lightyellow']
    axes[0, 1].pie(sentiment_values, labels=sentiment_labels, autopct='%1.1f%%', colors=colors)
    axes[0, 1].set_title('Sentiment Distribution')
    
    # Sentiment over time (simulated)
    if "processed_articles" in analysis_result:
        # Simulate sentiment over time
        dates = pd.date_range(start=datetime.now() - pd.Timedelta(days=7), periods=8, freq='D')
        positive_sentiment = [0.6, 0.55, 0.65, 0.7, 0.62, 0.58, 0.64, 0.67]
        negative_sentiment = [0.25, 0.3, 0.2, 0.15, 0.23, 0.27, 0.21, 0.18]
        
        axes[1, 0].plot(dates, positive_sentiment, 'g-', label='Positive', marker='o')
        axes[1, 0].plot(dates, negative_sentiment, 'r-', label='Negative', marker='s')
        axes[1, 0].set_title('Sentiment Trend (Last 7 Days)')
        axes[1, 0].set_xlabel('Date')
        axes[1, 0].set_ylabel('Sentiment Score')
        axes[1, 0].legend()
        axes[1, 0].tick_params(axis='x', rotation=45)
        axes[1, 0].grid(True, alpha=0.3)
    
    # Content categories (simulated)
    categories = ['Technology', 'Business', 'Politics', 'Science', 'Health', 'Sports']
    category_counts = [35, 28, 22, 18, 15, 12]
    
    axes[1, 1].pie(category_counts, labels=categories, autopct='%1.1f%%')
    axes[1, 1].set_title('Content Categories')
    
    plt.tight_layout()
    plt.show()
else:
    print("⚠️ Insufficient data for visualizations")

## 4. Background Job Queue Processing

In [None]:
print("🚀 Background Scraping Jobs")

# Single background job
print("\n📥 Enqueueing single scraping job...")
try:
    job = project.pipeline_manager.enqueue(
        "news_scraper",
        inputs={
            "scrape_timestamp": datetime.now().isoformat(),
            "max_concurrent_requests": 8,
            "request_delay": 1.0,
            "extract_keywords": True,
            "sentiment_analysis": True
        },
        final_vars=["processed_articles"],
        queue_name="scraping"
    )
    
    print(f"   ✅ Job enqueued: {job.id}")
    print(f"   📋 Queue: {job.origin}")
    print(f"   ⏰ Enqueued at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
except Exception as e:
    print(f"   ❌ Enqueue failed: {e}")
    print("   💡 Requires Redis for background processing")

# Batch scraping jobs
print("\n📦 Enqueueing batch scraping jobs...")

batch_configs = [
    {
        "name": "tech_news",
        "config": {
            "categories": ["technology", "ai", "software"],
            "max_articles": 50,
            "extract_keywords": True
        }
    },
    {
        "name": "business_news",
        "config": {
            "categories": ["business", "finance", "market"],
            "max_articles": 30,
            "sentiment_analysis": True
        }
    },
    {
        "name": "science_news",
        "config": {
            "categories": ["science", "research", "innovation"],
            "max_articles": 25,
            "language_detection": True
        }
    }
]

batch_jobs = []
for batch in batch_configs:
    print(f"\n   🔄 {batch['name']} scraping...")
    
    try:
        config = batch['config'].copy()
        config['scrape_timestamp'] = datetime.now().isoformat()
        
        job = project.pipeline_manager.enqueue(
            "news_scraper",
            inputs=config,
            final_vars=["processed_articles"],
            queue_name="scraping",
            job_id=f"scrape_{batch['name']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        )
        
        batch_jobs.append((batch['name'], job))
        print(f"     ✅ Enqueued: {job.id}")
        print(f"     📊 Target: {config.get('max_articles', 'unlimited')} articles")
        
    except Exception as e:
        print(f"     ❌ Failed: {e}")

if batch_jobs:
    print(f"\n🎉 Successfully enqueued {len(batch_jobs)} batch jobs!")
    print("\n🚀 To process these jobs, start workers:")
    print("   flowerpower job-queue start-worker --queue-names scraping")
    
    # Create batch jobs summary
    batch_df = pd.DataFrame([
        {
            "Job Name": name,
            "Job ID": job.id,
            "Queue": job.origin,
            "Status": "Queued"
        }
        for name, job in batch_jobs
    ])
    
    print("\n📋 Batch Jobs Summary:")
    display(batch_df)
else:
    print("\n💡 No batch jobs enqueued - Redis required for job queuing")

print(f"\n📊 Job Queue Monitoring:")
print(f"   • Queue name: scraping")
print(f"   • Recommended workers: 2-4 concurrent workers")
print(f"   • Estimated processing time: 5-15 minutes per job")
print(f"   • Memory usage: ~100-500MB per worker")
print(f"   • Rate limiting: Built-in delays to respect website policies")

## 5. Scheduled Data Collection

In [None]:
# Set up scheduled scraping jobs
print("📅 Scheduled Data Collection")

# Define scraping schedules
schedules = [
    {
        "name": "Hourly Breaking News",
        "cron": "0 * * * *",  # Every hour
        "description": "Quick scan for breaking news",
        "config": {
            "max_articles": 20,
            "categories": ["breaking", "urgent"],
            "priority": "high",
            "quick_mode": True
        }
    },
    {
        "name": "Daily Tech News",
        "cron": "0 8 * * *",  # Daily at 8 AM
        "description": "Comprehensive technology news collection",
        "config": {
            "max_articles": 100,
            "categories": ["technology", "ai", "software"],
            "extract_keywords": True,
            "sentiment_analysis": True
        }
    },
    {
        "name": "Weekly Deep Dive",
        "cron": "0 9 * * 1",  # Weekly on Monday at 9 AM
        "description": "Comprehensive multi-category collection",
        "config": {
            "max_articles": 500,
            "categories": ["technology", "business", "science", "health"],
            "extract_keywords": True,
            "sentiment_analysis": True,
            "language_detection": True,
            "content_classification": True
        }
    },
    {
        "name": "Market Opening Scan",
        "cron": "30 9 * * 1-5",  # Weekdays at 9:30 AM
        "description": "Business and market news before trading",
        "config": {
            "max_articles": 50,
            "categories": ["business", "finance", "market"],
            "sentiment_analysis": True,
            "priority": "high"
        }
    }
]

scheduled_jobs = []

for schedule in schedules:
    print(f"\n📋 {schedule['name']}")
    print(f"   ⏰ Schedule: {schedule['description']}")
    print(f"   🔧 Cron: {schedule['cron']}")
    print(f"   📊 Target articles: {schedule['config'].get('max_articles', 'unlimited')}")
    
    try:
        # Add scrape timestamp to config
        config = schedule['config'].copy()
        config['scrape_timestamp'] = datetime.now().isoformat()
        
        job = project.pipeline_manager.schedule(
            "news_scraper",
            cron=schedule['cron'],
            inputs=config,
            final_vars=["processed_articles"],
            queue_name="scraping",
            job_id=f"scheduled_{schedule['name'].lower().replace(' ', '_')}"
        )
        
        scheduled_jobs.append((schedule['name'], job, schedule['description']))
        print(f"   ✅ Scheduled successfully - Job ID: {job.id}")
        
    except Exception as e:
        print(f"   ❌ Scheduling failed: {e}")
        print("   💡 Requires Redis for job scheduling")

if scheduled_jobs:
    print(f"\n🎉 Successfully scheduled {len(scheduled_jobs)} scraping jobs!")
    print("\n🚀 To process scheduled jobs, start a worker with scheduler:")
    print("   flowerpower job-queue start-worker --with-scheduler")
    
    # Create schedule visualization
    schedule_df = pd.DataFrame([
        {
            "Schedule Name": name,
            "Description": desc,
            "Cron Expression": s["cron"],
            "Max Articles": s["config"].get("max_articles", "unlimited")
        }
        for (name, job, desc), s in zip(scheduled_jobs, schedules)
    ])
    
    print("\n📅 Scheduled Jobs Summary:")
    display(schedule_df)
    
    # Visualize schedule frequency
    schedule_types = ['Hourly', 'Daily', 'Weekly', 'Weekdays']
    frequencies = [24, 1, 1/7, 5]  # executions per day
    
    plt.figure(figsize=(10, 6))
    plt.bar(schedule_types, frequencies, color=['lightblue', 'lightgreen', 'lightcoral', 'lightyellow'])
    plt.title('Scheduled Scraping Frequency')
    plt.xlabel('Schedule Type')
    plt.ylabel('Executions per Day')
    plt.yscale('log')
    plt.grid(True, alpha=0.3)
    
    for i, v in enumerate(frequencies):
        plt.text(i, v, f'{v:.1f}', ha='center', va='bottom')
    
    plt.tight_layout()
    plt.show()
    
else:
    print("\n💡 No schedules created - Redis required for scheduling functionality")

# Data collection estimates
print(f"\n📈 Data Collection Estimates:")
print(f"   • Hourly: ~20 articles = 480 articles/day")
print(f"   • Daily: ~100 articles = 100 articles/day")
print(f"   • Weekly: ~500 articles = 71 articles/day")
print(f"   • Weekdays: ~50 articles = 250 articles/day")
print(f"   • Total estimated: ~900 articles/day")
print(f"   • Monthly volume: ~27,000 articles")
print(f"   • Storage needed: ~50-100GB/month (with content)")

## 6. Data Export and Integration

In [None]:
# Generate comprehensive data export
print("📤 Data Export and Integration")

# Run scraping with comprehensive export options
export_result = project.pipeline_manager.run(
    "news_scraper",
    inputs={
        "scrape_timestamp": datetime.now().isoformat(),
        "export_formats": ["csv", "json", "parquet"],
        "include_metadata": True,
        "extract_keywords": True,
        "sentiment_analysis": True,
        "compress_output": True
    },
    final_vars=[
        "processed_articles",
        "export_summary",
        "data_quality_report"
    ]
)

if "processed_articles" in export_result:
    articles = export_result["processed_articles"]
    print(f"\n📊 Export Results:")
    print(f"   • Articles exported: {articles['total_articles']}")
    print(f"   • Output file: {articles['output_file']}")
    print(f"   • File size: {articles.get('file_size_mb', 'N/A')} MB")
    print(f"   • Compression ratio: {articles.get('compression_ratio', 'N/A')}")

if "export_summary" in export_result:
    summary = export_result["export_summary"]
    print(f"\n📋 Export Summary:")
    print(f"   • Formats generated: {summary['formats_created']}")
    print(f"   • Total files: {summary['total_files']}")
    print(f"   • Total size: {summary['total_size_mb']:.2f} MB")
    
    # Show file details
    if 'file_details' in summary:
        print(f"\n📁 Generated Files:")
        for file_info in summary['file_details']:
            print(f"   • {file_info['format']}: {file_info['filename']} ({file_info['size_mb']:.1f} MB)")

if "data_quality_report" in export_result:
    quality = export_result["data_quality_report"]
    print(f"\n✅ Data Quality Report:")
    print(f"   • Completeness score: {quality['completeness_score']:.1f}%")
    print(f"   • Accuracy score: {quality['accuracy_score']:.1f}%")
    print(f"   • Duplicate articles: {quality['duplicate_count']}")
    print(f"   • Missing content: {quality['missing_content_count']}")
    print(f"   • Quality grade: {quality['overall_grade']}")

# Create sample analysis of exported data
print(f"\n🔍 Sample Data Analysis")

# Simulate analysis of exported data
sample_data = {
    'articles': [
        {
            'title': 'AI Revolution in Healthcare',
            'source': 'TechNews',
            'sentiment': 0.7,
            'keywords': ['AI', 'healthcare', 'innovation'],
            'word_count': 850
        },
        {
            'title': 'Market Volatility Continues',
            'source': 'FinanceDaily',
            'sentiment': -0.3,
            'keywords': ['market', 'volatility', 'economy'],
            'word_count': 650
        },
        {
            'title': 'Breakthrough in Quantum Computing',
            'source': 'ScienceToday',
            'sentiment': 0.8,
            'keywords': ['quantum', 'computing', 'breakthrough'],
            'word_count': 1200
        }
    ]
}

# Convert to DataFrame for analysis
sample_df = pd.DataFrame(sample_data['articles'])

print("\n📊 Sample Exported Data:")
display(sample_df)

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Sentiment distribution
sentiments = sample_df['sentiment']
axes[0, 0].hist(sentiments, bins=10, alpha=0.7, color='lightblue')
axes[0, 0].set_title('Sentiment Distribution')
axes[0, 0].set_xlabel('Sentiment Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)

# Word count distribution
word_counts = sample_df['word_count']
axes[0, 1].hist(word_counts, bins=8, alpha=0.7, color='lightgreen')
axes[0, 1].set_title('Word Count Distribution')
axes[0, 1].set_xlabel('Word Count')
axes[0, 1].set_ylabel('Frequency')

# Articles by source
source_counts = sample_df['source'].value_counts()
axes[1, 0].pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%')
axes[1, 0].set_title('Articles by Source')

# Keyword frequency (flattened)
all_keywords = [kw for keywords in sample_df['keywords'] for kw in keywords]
keyword_counts = Counter(all_keywords)
top_keywords = dict(keyword_counts.most_common(6))

axes[1, 1].bar(top_keywords.keys(), top_keywords.values(), color='orange', alpha=0.7)
axes[1, 1].set_title('Top Keywords')
axes[1, 1].set_xlabel('Keywords')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Save export summary
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
export_summary_data = {
    "export_timestamp": datetime.now().isoformat(),
    "total_articles": export_result.get('processed_articles', {}).get('total_articles', 0),
    "formats_exported": ['csv', 'json', 'parquet'],
    "data_quality_score": export_result.get('data_quality_report', {}).get('completeness_score', 0)
}

summary_file = f"outputs/scraping_export_summary_{timestamp}.json"

try:
    os.makedirs("outputs", exist_ok=True)
    with open(summary_file, 'w') as f:
        json.dump(export_summary_data, f, indent=2)
    print(f"\n💾 Export summary saved: {summary_file}")
except Exception as e:
    print(f"\n⚠️ Could not save export summary: {e}")

print(f"\n🔗 Integration Options:")
print(f"   • Database: Load into PostgreSQL, MySQL, or MongoDB")
print(f"   • Analytics: Import into Tableau, Power BI, or Jupyter")
print(f"   • Search: Index in Elasticsearch or Solr")
print(f"   • API: Serve via REST API or GraphQL")
print(f"   • ML Pipeline: Feed into machine learning models")
print(f"   • Alerting: Set up keyword-based notifications")

print(f"\n🎉 Web scraping pipeline completed successfully!")
print(f"📰 Data ready for analysis and downstream processing")