# Agentic Local SEO Content Factory - Exploratory Data Analysis

This notebook provides initial exploration and analysis of business data for the Local SEO Content Factory.

## Objectives
- Understand the structure and quality of business data
- Identify patterns for content generation
- Validate data completeness for SEO page creation
- Generate insights for content strategy

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Plot styling: apply matplotlib's default style first, then seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Notebook banner, one line per message
for banner_line in (
    "📊 Agentic Local SEO Content Factory - EDA",
    "🚀 Analyzing business data for content generation opportunities...",
):
    print(banner_line)

# Load sample business data
SYNTHETIC_SEED = 42  # fixed seed so the fallback data is identical on every run


def make_synthetic_businesses(n_rows=25, seed=SYNTHETIC_SEED):
    """Build a small, reproducible demo dataset of businesses.

    Categories and cities cycle through five fixed values each; ratings are
    drawn uniformly from [3.0, 5.0) and review counts from the integers
    10..299 using a generator seeded with ``seed``.

    Args:
        n_rows: Number of business records to create.
        seed: Seed for the random generator; the same seed yields the same frame.

    Returns:
        DataFrame with columns business_id, name, category, city, state,
        rating and review_count.
    """
    rng = np.random.default_rng(seed)
    categories = ['Restaurant', 'Automotive', 'Healthcare', 'Retail', 'Professional Services']
    cities = ['Los Angeles', 'San Francisco', 'San Diego', 'Sacramento', 'Oakland']
    return pd.DataFrame({
        'business_id': [f'biz_{i:03d}' for i in range(1, n_rows + 1)],
        'name': ['Sample Business ' + str(i) for i in range(1, n_rows + 1)],
        'category': [categories[i % len(categories)] for i in range(n_rows)],
        'city': [cities[i % len(cities)] for i in range(n_rows)],
        'state': ['CA'] * n_rows,
        'rating': rng.uniform(3.0, 5.0, size=n_rows),
        'review_count': rng.integers(10, 300, size=n_rows),
    })


try:
    df = pd.read_csv('../data/sample_businesses.csv')
    print(f"✅ Loaded {len(df)} business records")
except FileNotFoundError:
    # Fall back to generated data so the rest of the notebook can still run
    print("⚠️  Sample data file not found. Creating synthetic data for demonstration...")
    df = make_synthetic_businesses()
    print(f"✅ Created {len(df)} synthetic business records")

# Display basic info
print("\n📈 Dataset Overview:")
print(f"   • Total businesses: {len(df)}")
print(f"   • Columns: {len(df.columns)}")
# deep=True counts the actual string payloads of object columns, not just pointers
print(f"   • Memory usage: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

df.head()

## Data Quality Assessment

Let's analyze the completeness and quality of our business data to understand what's available for content generation.

In [None]:
# Data completeness analysis
print("🔍 Data Completeness Analysis")
print("=" * 40)

# Percentage of non-null values per column. max(..., 1) keeps an empty
# DataFrame from producing NaN percentages (0/0).
total_rows = len(df)
completeness = (df.notna().sum() / max(total_rows, 1) * 100).round(2)
completeness_df = pd.DataFrame({
    'Field': completeness.index,
    'Completeness %': completeness.values,
    'Missing Count': df.isna().sum().values
})

# Categorize fields by completeness
completeness_pct = completeness_df['Completeness %']
high_complete = completeness_df[completeness_pct >= 90]
medium_complete = completeness_df[(completeness_pct >= 70) & (completeness_pct < 90)]
low_complete = completeness_df[completeness_pct < 70]


def _print_field_group(header, fields):
    """Print ``header`` with the number of fields, then one bullet per field."""
    print(f"{header}: {len(fields)}")
    for field_name, field_pct in zip(fields['Field'], fields['Completeness %']):
        print(f"   • {field_name}: {field_pct}%")


# The "highly complete" group is always reported; the others only when non-empty
_print_field_group("✅ Highly Complete Fields (≥90%)", high_complete)

if len(medium_complete) > 0:
    _print_field_group("\n⚠️  Medium Complete Fields (70-89%)", medium_complete)

if len(low_complete) > 0:
    _print_field_group("\n❌ Low Complete Fields (<70%)", low_complete)

# Content generation readiness score
essential_fields = ['business_id', 'name', 'category', 'city', 'state']
missing_essential = [field for field in essential_fields if field not in df.columns]
if missing_essential:
    # A loaded CSV may lack a column entirely; no row can be ready in that case
    print(f"\n⚠️  Essential columns missing from dataset: {', '.join(missing_essential)}")
    content_ready = 0
else:
    content_ready = int(df[essential_fields].notna().all(axis=1).sum())

ready_pct = content_ready / total_rows * 100 if total_rows else 0.0
print(f"\n🎯 Content Generation Ready: {content_ready}/{total_rows} businesses ({ready_pct:.1f}%)")

completeness_df

## Geographic and Category Distribution

Understanding the distribution of businesses helps us plan content generation strategies and identify market opportunities.

In [None]:
# Create visualizations for business distribution (2x2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('🗺️ Business Distribution Analysis', fontsize=16, fontweight='bold')

# A panel is drawn only when its column exists and holds at least one value;
# otherwise the axes are hidden instead of being left as an empty frame.
has_category = 'category' in df.columns and df['category'].notna().any()
has_city = 'city' in df.columns and df['city'].notna().any()
has_rating = 'rating' in df.columns and df['rating'].notna().any()
has_reviews = 'review_count' in df.columns and df['review_count'].notna().any()

# 1. Businesses by Category
ax_category = axes[0, 0]
if has_category:
    category_counts = df['category'].value_counts()
    category_counts.plot(kind='bar', ax=ax_category, color='skyblue', edgecolor='black')
    ax_category.set_title('📊 Businesses by Category')
    ax_category.set_xlabel('Category')
    ax_category.set_ylabel('Count')
    ax_category.tick_params(axis='x', rotation=45)

    # Add value labels just above each bar
    for bar_position, bar_count in enumerate(category_counts.values):
        ax_category.text(bar_position, bar_count + 0.1, str(bar_count), ha='center', va='bottom')
else:
    ax_category.set_visible(False)

# 2. Businesses by City
ax_city = axes[0, 1]
if has_city:
    city_counts = df['city'].value_counts().head(10)  # Top 10 cities
    city_counts.plot(kind='barh', ax=ax_city, color='lightcoral')
    ax_city.set_title('🏙️ Top 10 Cities by Business Count')
    ax_city.set_xlabel('Count')
    ax_city.set_ylabel('City')
else:
    ax_city.set_visible(False)

# 3. Rating Distribution
ax_rating = axes[1, 0]
if has_rating:
    rating_mean = df['rating'].mean()
    df['rating'].hist(bins=20, ax=ax_rating, color='lightgreen', edgecolor='black', alpha=0.7)
    ax_rating.set_title('⭐ Rating Distribution')
    ax_rating.set_xlabel('Rating')
    ax_rating.set_ylabel('Frequency')
    ax_rating.axvline(rating_mean, color='red', linestyle='--',
                      label=f'Mean: {rating_mean:.2f}')
    ax_rating.legend()
else:
    ax_rating.set_visible(False)

# 4. Review Count Distribution (linear scale)
ax_reviews = axes[1, 1]
if has_reviews:
    review_data = df['review_count'].dropna()
    review_mean = review_data.mean()
    ax_reviews.hist(review_data, bins=15, color='gold', edgecolor='black', alpha=0.7)
    ax_reviews.set_title('📝 Review Count Distribution')
    ax_reviews.set_xlabel('Review Count')
    ax_reviews.set_ylabel('Frequency')
    ax_reviews.axvline(review_mean, color='red', linestyle='--',
                       label=f'Mean: {review_mean:.0f}')
    ax_reviews.legend()
else:
    ax_reviews.set_visible(False)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\n📊 Distribution Summary:")
print(f"   • Unique Categories: {df['category'].nunique() if 'category' in df.columns else 'N/A'}")
print(f"   • Unique Cities: {df['city'].nunique() if 'city' in df.columns else 'N/A'}")
if has_rating:
    print(f"   • Average Rating: {df['rating'].mean():.2f}")
    print(f"   • Rating Range: {df['rating'].min():.1f} - {df['rating'].max():.1f}")
if has_reviews:
    print(f"   • Average Reviews: {df['review_count'].mean():.0f}")
    print(f"   • Review Range: {df['review_count'].min():.0f} - {df['review_count'].max():.0f}")

## Content Generation Strategy

Based on the data analysis, let's develop insights for our content generation strategy.

In [None]:
# Section banner: title line followed by a 50-character rule
print("🎯 Content Generation Strategy Analysis", "=" * 50, sep="\n")

# Priority scoring for content generation
def calculate_priority_score(row):
    """Score one business record for content-generation priority.

    Points are accumulated from three groups:

    * Completeness (up to 40): 20 when name, category, city and state are all
      present, plus 4 (5 * 0.8) for every present optional contact field.
    * Quality (up to 30): 15/10/5 for rating >= 4.5/4.0/3.5 and 15/10/5 for
      review_count >= 100/50/20.
    * SEO potential (up to 30): 15 for a website, 15 for a description longer
      than 50 characters.

    Args:
        row: Record supporting ``.get(key)`` (for example a DataFrame row or a
            dict). Missing keys and null values are treated as absent.

    Returns:
        The accumulated score, capped at 100.
    """
    def has_value(key):
        return pd.notna(row.get(key))

    def tier_points(value, tiers):
        # Tiers run from the highest threshold down; the first match wins.
        for minimum, points in tiers:
            if value >= minimum:
                return points
        return 0

    total = 0

    # --- Completeness ---
    required_keys = ('name', 'category', 'city', 'state')
    if all(has_value(key) for key in required_keys):
        total += 20

    contact_keys = ('address', 'zip_code', 'phone', 'website', 'email')
    present_contacts = sum(1 for key in contact_keys if has_value(key))
    total += present_contacts * 5 * 0.8

    # --- Quality indicators ---
    rating = row.get('rating')
    if pd.notna(rating):
        total += tier_points(rating, ((4.5, 15), (4.0, 10), (3.5, 5)))

    reviews = row.get('review_count')
    if pd.notna(reviews):
        total += tier_points(reviews, ((100, 15), (50, 10), (20, 5)))

    # --- SEO potential ---
    if has_value('website'):
        total += 15

    description = row.get('description')
    if pd.notna(description) and len(str(description)) > 50:
        total += 15

    return min(total, 100)  # Cap at 100

# Thresholds shared by the categorisation and the batch recommendation below
HIGH_PRIORITY_MIN_SCORE = 70   # scores at or above this are "High" (or better)
MAX_INITIAL_BATCH = 25         # upper limit for the first deployment batch

# Calculate priority scores
df['priority_score'] = df.apply(calculate_priority_score, axis=1)

# Categorize by priority. Bins are left-closed: [0, 40) Low, [40, 70) Medium,
# [70, 85) High, [85, inf) Premium. This way a score of 0 still receives a
# label and "High" begins at exactly HIGH_PRIORITY_MIN_SCORE, matching the
# ">= 70" filter used for the batch-size recommendation.
df['priority_category'] = pd.cut(df['priority_score'],
                                 bins=[0, 40, HIGH_PRIORITY_MIN_SCORE, 85, np.inf],
                                 right=False,
                                 labels=['Low', 'Medium', 'High', 'Premium'])

priority_dist = df['priority_category'].value_counts()
print("\n📈 Content Generation Priority Distribution:")
for category, count in priority_dist.items():
    percentage = (count / len(df)) * 100 if len(df) else 0.0
    print(f"   • {category}: {count} businesses ({percentage:.1f}%)")

# Top candidates for content generation. Only columns that actually exist are
# selected so a dataset without e.g. 'city' does not raise a KeyError.
candidate_columns = [col for col in ('name', 'category', 'city') if col in df.columns]
top_candidates = df.nlargest(10, 'priority_score')[candidate_columns + ['priority_score']]
print("\n🏆 Top 10 Content Generation Candidates:")
for rank, (_, row) in enumerate(top_candidates.iterrows(), 1):
    print(f"   {rank:2d}. {row.get('name', 'N/A')} ({row.get('category', 'N/A')}) - "
          f"{row.get('city', 'N/A')} [Score: {row['priority_score']:.1f}]")

# SEO keyword opportunities
print("\n🔑 SEO Keyword Opportunities:")
if 'category' in df.columns and 'city' in df.columns:
    # Generate city + category combinations
    combos = df.groupby(['city', 'category']).size().reset_index(name='count')
    combos['keyword'] = combos['category'] + ' in ' + combos['city']

    # Find unique opportunities (only 1 business in category/city)
    unique_opportunities = combos[combos['count'] == 1]['keyword'].tolist()
    print(f"   • Unique Market Opportunities: {len(unique_opportunities)}")
    if len(unique_opportunities) > 0:
        print(f"     Examples: {', '.join(unique_opportunities[:5])}")

    # High competition areas
    competitive = combos[combos['count'] >= 3].sort_values('count', ascending=False)
    print(f"   • Competitive Markets: {len(competitive)}")
    if len(competitive) > 0:
        print(f"     Examples: {', '.join(competitive['keyword'].head(3).tolist())}")

# Content complexity estimation
print("\n📝 Content Generation Complexity:")
if 'description' in df.columns:
    # Missing descriptions count as length 0; astype(str) keeps .str usable
    # even when the column is entirely null.
    description_lengths = df['description'].fillna('').astype(str).str.len()
    simple_count = int((description_lengths > 100).sum())
else:
    # No description column at all (e.g. the synthetic demo data)
    simple_count = 0
moderate_count = len(df) - simple_count

print(f"   • Simple Generation (has description): {simple_count} businesses")
print(f"   • Moderate Generation (needs research): {moderate_count} businesses")
print(f"   • Estimated total content: {len(df) * 1000:,} words (avg 1000 words/page)")

# Display priority score visualization
score_mean = df['priority_score'].mean()
score_fig, score_ax = plt.subplots(figsize=(10, 6))
score_ax.hist(df['priority_score'], bins=20, color='lightblue', edgecolor='black', alpha=0.7)
score_ax.set_title('📊 Content Generation Priority Scores Distribution')
score_ax.set_xlabel('Priority Score')
score_ax.set_ylabel('Number of Businesses')
score_ax.axvline(score_mean, color='red', linestyle='--',
                 label=f'Mean: {score_mean:.1f}')
score_ax.legend()
score_ax.grid(True, alpha=0.3)
plt.show()

high_priority_count = int((df['priority_score'] >= HIGH_PRIORITY_MIN_SCORE).sum())
print(f"\n✅ Analysis Complete! Ready to generate content for {len(df)} businesses.")
print(f"🚀 Recommended batch size for initial deployment: {min(high_priority_count, MAX_INITIAL_BATCH)} businesses")

## Next Steps

Based on this analysis, here are the recommended next steps:

1. **High Priority**: Focus on businesses with priority scores ≥ 70
2. **Content Strategy**: Start with unique market opportunities for easier ranking
3. **Quality Control**: Implement validation for businesses with incomplete data
4. **Batch Processing**: Start with an initial batch of at most 25 high-priority businesses, then process 25-50 businesses per batch for quality management
5. **SEO Focus**: Target long-tail keywords like "[Category] in [City]"

The data is now ready for the Agentic Local SEO Content Factory pipeline!