# 01 - Data Collection

This notebook covers data collection for Kickstarter campaigns using either:
1. Web scraping (for real data)
2. Synthetic data generation (for development)

## Objectives
- Collect 500-1000 completed campaigns
- Extract key features: funding goal, pledged amount, backers, reward tiers
- Handle missing data gracefully

In [None]:
import sys
from pathlib import Path

sys.path.insert(0, '../src')  # make the project's src/ modules importable

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scraping import generate_synthetic_kickstarter_data, KickstarterScraper

# Notebook-wide plotting defaults: seaborn dark grid theme and a 12x6 inch
# default figure size (individual figures below override the size explicitly).
sns.set_theme(style='darkgrid')
plt.rc('figure', figsize=(12, 6))

## Option 1: Generate Synthetic Data (Development)

In [None]:
# Build the development dataset with the project's synthetic generator.
# 600 campaigns sits inside the 500-1000 range targeted in the objectives.
df = generate_synthetic_kickstarter_data(num_campaigns=600)

# Report the row count, then preview the first rows as the cell output.
print(f"Generated {df.shape[0]} campaigns")
df.head()

## Option 2: Scrape Real Data (Production)

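**Note:** Real scraping requires careful rate limiting and may break when the website's structure changes. The cell below is disabled by default; uncommenting it replaces the synthetic `df` from Option 1 with scraped data.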

In [None]:
# Uncomment to use real scraper
# NOTE: this rebinds `df`, so every later cell then runs on the scraped data
# instead of the synthetic data generated under Option 1.
# NOTE(review): `rate_limit=2.0` is presumably a delay/limit between requests —
# confirm its unit against KickstarterScraper in src/scraping.
# scraper = KickstarterScraper(rate_limit=2.0)
# df = scraper.scrape_category('technology', num_projects=500)

## Data Overview

In [None]:
# Quick structural overview of the dataset: dimensions first, then one
# headed section each for per-column dtypes and per-column missing-value counts.
print(f"Dataset Shape: {df.shape}")

overview_sections = (
    ("\nColumn Types:", df.dtypes),
    ("\nMissing Values:", df.isna().sum()),
)
for heading, report in overview_sections:
    print(heading)
    print(report)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# dataset; the table is rendered as this cell's output.
summary_stats = df.describe()
summary_stats

## Category Distribution

In [None]:
# Two side-by-side panels: campaign volume per category (left) and
# success rate per category (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
counts_ax, rates_ax = axes

# Left panel: number of campaigns in each category.
category_counts = df['category'].value_counts()
category_counts.plot(kind='bar', color='steelblue', ax=counts_ax)
counts_ax.set_title('Campaigns by Category')
counts_ax.set_xlabel('Category')
counts_ax.set_ylabel('Count')
counts_ax.tick_params(axis='x', rotation=45)

# Right panel: mean of `is_successful` per category, sorted ascending, with
# the dataset-wide mean drawn as a dashed reference line.
overall_success_rate = df['is_successful'].mean()
success_rates = df.groupby('category')['is_successful'].mean().sort_values()
success_rates.plot(kind='barh', color='forestgreen', ax=rates_ax)
rates_ax.set_title('Success Rate by Category')
rates_ax.set_xlabel('Success Rate')
rates_ax.axvline(x=overall_success_rate, color='red', linestyle='--', label='Overall')
rates_ax.legend()

plt.tight_layout()
plt.show()

## Funding Distribution

In [None]:
# Two side-by-side histograms: funding goals on a log10 scale (left) and the
# funding ratio capped at 5x (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Goal distribution (log scale).
# log10 is undefined for zero/negative goals: it yields -inf/NaN, and a -inf
# value makes the histogram's automatic bin range non-finite so the plot fails.
# Keep only strictly positive goals (missing goals compare False and are
# excluded as well), then apply the ufunc to the whole column at once.
positive_goals = df.loc[df['goal'] > 0, 'goal']
np.log10(positive_goals).hist(bins=30, ax=axes[0], color='coral', edgecolor='black')
axes[0].set_title('Funding Goal Distribution (log10)')
axes[0].set_xlabel('Log10(Goal)')
axes[0].set_ylabel('Count')

# Funding ratio distribution. Ratios above 5 are clipped to 5 so a few heavily
# over-funded campaigns do not stretch the x-axis; they pile up in the last bin.
df['funding_ratio'].clip(upper=5).hist(bins=30, ax=axes[1], color='teal', edgecolor='black')
axes[1].axvline(x=1, color='red', linestyle='--', label='100% Funded')
axes[1].set_title('Funding Ratio Distribution')
axes[1].set_xlabel('Funding Ratio (capped at 5x)')
axes[1].set_ylabel('Count')
axes[1].legend()

plt.tight_layout()
plt.show()

## Save Data

In [None]:
# Save to raw data folder.
# `to_csv` does not create missing directories, so make sure the target folder
# exists first (a fresh checkout may not have data/raw yet). `exist_ok=True`
# keeps the cell safe to re-run.
output_path = '../data/raw/kickstarter_raw_data.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: write only the data columns, not the DataFrame's row index.
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")