In [None]:
# Project Configuration & Paths
# This cell imports our centralized configuration

import sys
from pathlib import Path

# Make the project root importable so the centralized `config` module can be loaded.
# NOTE: '..' is resolved against the kernel's current working directory, so this
# assumes the notebook is executed from a direct subdirectory of the project root.
project_root = Path('..').resolve()
if str(project_root) not in sys.path:
    # Prepend instead of append: entries earlier in sys.path win, so the project's
    # own config.py takes precedence over any installed package that is also
    # named `config`.
    sys.path.insert(0, str(project_root))

# Import our centralized configuration
import config

# All paths and settings are now available via config module
print("📁 Project paths loaded from config.py:")
print(f"  Raw data: {config.RAW_DATA_DIR}")
print(f"  Processed data: {config.PROCESSED_DATA_DIR}")
print(f"  Figures: {config.FIGURES_DIR}")
print()
print("⚙️ Settings loaded from environment (.env file):")
print(f"  Random seed: {config.RANDOM_SEED}")
print(f"  Figure DPI: {config.DPI}")
print()
print("💡 To customize these settings:")
print("  1. Copy .env.example to .env")
print("  2. Edit .env with your preferred values")
print("  3. Restart the kernel to reload")

# Exploratory Data Analysis

## Project Overview
[Brief description of the project and its objectives]

## Data Description
- **Source**: [Data source]
- **Size**: [Number of rows and columns]
- **Date Range**: [If applicable]
- **Key Variables**: [List main variables of interest]

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Import our utility functions (both data analysis and journalism)
from utils import (
    quick_info, plot_distributions, correlation_analysis,
    quick_export_for_web, create_story_charts, data_fact_check,
    quick_summary_table, compare_periods
)

# Confirm the imports above succeeded, then list the journalism helpers
# that were imported from utils.
print("📚 All libraries and utilities loaded!")
print(
    "\n".join(
        [
            "💡 New journalism utilities available:",
            "   • quick_export_for_web() - Export data for publishing",
            "   • create_story_charts() - Publication-ready charts",
            "   • data_fact_check() - Quick data validation",
            "   • quick_summary_table() - Summary tables by category",
            "   • compare_periods() - Before/after analysis",
        ]
    )
)

In [None]:
# Load your data
# Replace this with your actual data loading

# Example: Load from CSV
# df = pd.read_csv(config.RAW_DATA_DIR / "your_data.csv")

# Example: Load from multiple sources (common in journalism)
# df1 = pd.read_csv(config.RAW_DATA_DIR / "source1.csv")
# df2 = pd.read_excel(config.RAW_DATA_DIR / "source2.xlsx")
# df = pd.merge(df1, df2, on='common_column')

# For demonstration, let's create sample data.
# Seed from the project-wide setting reported by the configuration cell so that
# changing RANDOM_SEED actually changes this notebook; fall back to 42 when the
# setting is missing or unset.
sample_seed = getattr(config, 'RANDOM_SEED', None)
sample_seed = 42 if sample_seed is None else int(sample_seed)
np.random.seed(sample_seed)

# Single source of truth for the number of synthetic rows (every column must
# have the same length for the DataFrame constructor).
N_SAMPLE_ROWS = 1000

# The dict entries are evaluated top to bottom; keep this order so a given seed
# always produces the same columns.
sample_data = {
    'state': np.random.choice(['California', 'Texas', 'New York', 'Florida', 'Illinois'], N_SAMPLE_ROWS),
    'population': np.random.randint(10000, 50000000, N_SAMPLE_ROWS),
    'incidents': np.random.randint(0, 500, N_SAMPLE_ROWS),
    'date': pd.date_range('2024-01-01', periods=N_SAMPLE_ROWS, freq='D'),
    'category': np.random.choice(['Type A', 'Type B', 'Type C'], N_SAMPLE_ROWS)
}

df = pd.DataFrame(sample_data)
print(f"✅ Data loaded: {len(df):,} rows, {len(df.columns)} columns")

# Quick journalism fact-check
data_fact_check(df)

In [None]:
# Data overview: shape, per-column dtypes and per-column memory footprint
print("=== DATA OVERVIEW ===")
print(f"Shape: {df.shape}")
# Each heading and its value go out on separate lines via sep="\n".
print("\nData types:", df.dtypes, sep="\n")
print("\nMemory usage:", df.memory_usage(deep=True), sep="\n")

print("\n=== BASIC STATISTICS ===")
# Left as the cell's final bare expression so the notebook renders it richly.
df.describe()

In [None]:
# Check for missing values: per-column null count and share of all rows
print("=== MISSING VALUES ===")
missing_data = df.isna().sum()
# Multiply before dividing, i.e. (100 * count) / n_rows.
missing_percent = missing_data.mul(100).div(len(df))
missing_table = pd.DataFrame(
    {
        'Missing Count': missing_data,
        'Missing Percentage': missing_percent,
    }
)
# Show only columns that actually have gaps, worst first.
has_missing = missing_table['Missing Count'] > 0
missing_table.loc[has_missing].sort_values('Missing Count', ascending=False)

In [None]:
# Quick publication-ready visualizations
print("📊 Creating story charts...")

# Chart 1: chart of the 'state' column using the helper's default chart type.
# NOTE(review): presumably the helper also writes the figure to disk under
# `save_filename` — confirm against create_story_charts in utils.
create_story_charts(
    df,
    'state',
    title='Incidents by State',
    save_filename='incidents_by_state',
)

# Chart 2: histogram of the per-row incident counts (a distribution, not a
# time series).
create_story_charts(
    df,
    'incidents',
    chart_type='histogram',
    title='Distribution of Incident Counts',
    save_filename='incident_distribution',
)

In [None]:
# Journalism workflow examples

# 1. Quick summary table for publication ('sum' over incidents and population,
#    grouped by state)
print("📋 Summary table by state:")
state_summary = quick_summary_table(df, 'state', ['incidents', 'population'], 'sum')

# 2. Export the same summary for web publishing, once per output format
print("\n💾 Exporting data for publication:")
for export_name, export_format in (
    ('state_summary', 'csv'),
    ('state_summary_web', 'html'),
):
    quick_export_for_web(state_summary, export_name, export_format)

# 3. Quick fact-check restricted to a single column
print("\n🔍 Fact-checking incidents data:")
incident_check = data_fact_check(df, 'incidents')

## Key Insights

1. **Data Quality**: [Observations about missing values, outliers, etc.]
2. **Distributions**: [Key findings about variable distributions]
3. **Relationships**: [Notable correlations or patterns]
4. **Next Steps**: [Recommendations for further analysis or modeling]

In [None]:
# Period comparison analysis (useful for before/after stories)
print("📈 Comparing periods (before vs after analysis):")

# Compare incidents before and after mid-year.
# NOTE(review): the meaning of the two date strings (period starts vs. a single
# split point) is defined by compare_periods in utils — confirm it matches the
# intended "before/after 2024-07-01" comparison.
comparison = compare_periods(df, 'date', 'incidents', '2024-01-01', '2024-07-01')

# Summary statistics using traditional utilities
print("\n📊 Traditional analysis:")
quick_info(df)

# Closing banner, emitted as one write with one line per entry.
print(
    "\n".join(
        [
            "\n🎯 Analysis complete! Key files generated:",
            "   • Charts saved in notebooks/ directory",
            "   • Data exports in data/processed/ directory",
            "   • Ready for publication workflow",
        ]
    )
)