# 02 - Exploratory Data Analysis (EDA)

This notebook provides a comprehensive exploratory analysis of the Kickstarter dataset:
- Data overview and summary statistics
- Distribution analysis
- Relationships between variables
- Identifying patterns for causal inference

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configure plots
sns.set_theme(style='darkgrid', palette='husl')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

%matplotlib inline

## 1. Load Data

In [None]:
# Load the first dataset file that exists, preferring processed over raw data.
data_path = Path('../data')

# (file path, message printed on success), in order of preference
candidates = [
    (data_path / 'processed' / 'kickstarter_processed.csv', "Loaded processed data"),
    (data_path / 'raw' / 'kickstarter_raw.csv', "Loaded raw data"),
    (data_path / 'raw' / 'kickstarter_raw_data.csv', "Loaded kickstarter_raw_data.csv"),
]

for csv_path, loaded_msg in candidates:
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        print(loaded_msg)
        break
else:
    # None of the candidates exist: report every location that was tried
    # rather than failing on the last fallback path only.
    tried = "\n  ".join(str(path) for path, _ in candidates)
    raise FileNotFoundError(f"No Kickstarter dataset found. Looked for:\n  {tried}")

print(f"Dataset shape: {df.shape}")

## 2. Data Overview

In [None]:
# Preview the first few rows of the loaded frame as a quick sanity check
df.head()

In [None]:
# Column dtypes, followed by the frame's total in-memory footprint
print("Column Types:")
print(df.dtypes)

# deep=True asks pandas to inspect object columns; convert bytes to MB
total_mb = df.memory_usage(deep=True).sum() / 1e6
print(f"\nMemory Usage: {total_mb:.2f} MB")

In [None]:
# Null count per column and its share of all rows (percent, 2 decimals)
missing = df.isnull().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})

# Display only the columns that actually have gaps, worst first
has_gaps = missing_df['Missing'].gt(0)
missing_df.loc[has_gaps].sort_values('Missing', ascending=False)

In [None]:
# Summary statistics via describe() with its default arguments
df.describe()

## 3. Campaign Status Analysis

In [None]:
# Campaign outcomes: overall status split (left panel) and success rate per
# category (right panel).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Status distribution
status_col = 'status' if 'status' in df.columns else 'is_successful'
status_counts = df[status_col].value_counts()
n_statuses = len(status_counts)

success_color, failure_color = '#00D4AA', '#EF4444'
if status_col == 'is_successful':
    # value_counts() orders by frequency, so positional colours would paint
    # whichever outcome is most common green. Key the colour on the flag
    # instead (truthy = success, matching its use as a success rate below).
    colors = [success_color if flag else failure_color for flag in status_counts.index]
elif n_statuses <= 2:
    colors = [success_color, failure_color][:n_statuses]
else:
    # More than two statuses: two colours would repeat across wedges, so let
    # matplotlib use the active colour cycle.
    colors = None

# One explode offset per wedge; pie() rejects a list of a different length,
# which a fixed two-element list hits with three or more statuses.
axes[0].pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%',
            colors=colors, explode=[0.02] * n_statuses)
axes[0].set_title('Campaign Status Distribution', fontsize=14)

# Success rate by category
if 'category' in df.columns and 'is_successful' in df.columns:
    overall_rate = df['is_successful'].mean()
    success_by_cat = df.groupby('category')['is_successful'].mean().sort_values()
    success_by_cat.plot(kind='barh', ax=axes[1], color='steelblue')
    axes[1].set_xlabel('Success Rate')
    axes[1].set_title('Success Rate by Category')
    axes[1].axvline(x=overall_rate, color='red', linestyle='--', label='Overall')
    axes[1].legend()
else:
    # Nothing to draw on the right: hide the panel instead of leaving empty axes
    axes[1].set_visible(False)

plt.tight_layout()
plt.show()

## 4. Funding Analysis

In [None]:
# Funding overview: goal and pledged distributions (log10), funding ratio,
# and a goal-vs-pledged scatter on log-log axes.
goal_col = 'funding_goal' if 'funding_goal' in df.columns else 'goal'
pledged_col = 'pledged_amount' if 'pledged_amount' in df.columns else 'pledged'

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Goal distribution (log scale). Zeros are mapped to 1 first, the same way the
# pledged histogram does: log10(0) is -inf and hist() cannot bin an infinite range.
np.log10(df[goal_col].replace(0, 1)).hist(
    bins=40, ax=axes[0,0], color='coral', edgecolor='black', alpha=0.7)
axes[0,0].set_title('Funding Goal Distribution (log10)', fontsize=12)
axes[0,0].set_xlabel('Log10(Goal)')
axes[0,0].set_ylabel('Count')

# Pledged distribution (log scale); campaigns with nothing pledged land at 0
np.log10(df[pledged_col].replace(0, 1)).hist(
    bins=40, ax=axes[0,1], color='teal', edgecolor='black', alpha=0.7)
axes[0,1].set_title('Pledged Amount Distribution (log10)', fontsize=12)
axes[0,1].set_xlabel('Log10(Pledged)')
axes[0,1].set_ylabel('Count')

# Funding ratio distribution
if 'funding_ratio' in df.columns:
    df['funding_ratio'].clip(upper=5).hist(bins=40, ax=axes[1,0], color='purple', edgecolor='black', alpha=0.7)
    axes[1,0].axvline(x=1, color='red', linestyle='--', linewidth=2, label='100% Funded')
    axes[1,0].set_title('Funding Ratio Distribution (capped at 5x)', fontsize=12)
    axes[1,0].set_xlabel('Funding Ratio')
    axes[1,0].set_ylabel('Count')
    axes[1,0].legend()
else:
    # No ratio column: hide the panel instead of leaving empty axes
    axes[1,0].set_visible(False)

# Goal vs Pledged scatter. A fixed seed keeps the sampled points (and so the
# figure) identical across Restart & Run All.
sample = df.sample(min(500, len(df)), random_state=42)
scatter_kwargs = {'alpha': 0.5}
if 'is_successful' in df.columns:
    scatter_kwargs.update(c=sample['is_successful'], cmap='RdYlGn')
else:
    # A plain colour needs no colormap; passing cmap here would be ignored
    scatter_kwargs['c'] = 'steelblue'
axes[1,1].scatter(sample[goal_col], sample[pledged_col], **scatter_kwargs)

# Reference diagonal (pledged == goal). Zero has no position on a log axis, so
# the line starts at the smallest positive goal rather than at 0.
positive_goals = df.loc[df[goal_col] > 0, goal_col]
if not positive_goals.empty:
    diagonal = [positive_goals.min(), positive_goals.max()]
    axes[1,1].plot(diagonal, diagonal, 'r--', label='100% Funded')
    axes[1,1].legend()
axes[1,1].set_xlabel('Funding Goal ($)')
axes[1,1].set_ylabel('Pledged Amount ($)')
axes[1,1].set_title('Goal vs Pledged')
axes[1,1].set_xscale('log')
axes[1,1].set_yscale('log')

plt.tight_layout()
plt.show()

## 5. Category Analysis

In [None]:
if 'category' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    count_ax, funding_ax = axes

    # Left panel: number of campaigns per category
    cat_counts = df['category'].value_counts()
    cat_counts.plot(kind='bar', ax=count_ax, color='steelblue', edgecolor='black')
    count_ax.set(title='Campaigns by Category', xlabel='Category', ylabel='Count')
    count_ax.tick_params(axis='x', rotation=45)

    # Right panel: mean pledged amount per category, sorted ascending
    pledged_by_cat = df.groupby('category')[pledged_col]
    avg_funding = pledged_by_cat.mean().sort_values()
    avg_funding.plot(kind='barh', ax=funding_ax, color='coral')
    funding_ax.set(title='Average Pledged Amount by Category',
                   xlabel='Average Pledged ($)')

    plt.tight_layout()
    plt.show()

## 6. Time Analysis

In [None]:
# Launch-time patterns: monthly volume, monthly success rate, and success rate
# by day of week and by calendar month. Panels whose columns are missing are
# hidden rather than left as empty axes.
if 'launch_date' in df.columns:
    df['launch_date'] = pd.to_datetime(df['launch_date'])
    has_outcome = 'is_successful' in df.columns
    overall_rate = df['is_successful'].mean() if has_outcome else None

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Month-end bins, built once and shared by both time-series panels.
    # NOTE: pandas 2.2+ prefers the 'ME' alias and warns on 'M'; 'M' is kept
    # so the cell still runs on older pandas.
    monthly = df.set_index('launch_date').resample('M')

    # Campaigns over time
    monthly.size().plot(ax=axes[0,0], color='steelblue')
    axes[0,0].set_title('Campaigns Launched Over Time')
    axes[0,0].set_ylabel('Count')

    # Success rate over time
    if has_outcome:
        monthly['is_successful'].mean().plot(ax=axes[0,1], color='green')
        axes[0,1].set_title('Success Rate Over Time')
        axes[0,1].set_ylabel('Success Rate')
    else:
        axes[0,1].set_visible(False)

    # By day of week. Needs the outcome column too; without this guard the
    # groupby below raises KeyError.
    if has_outcome and 'day_of_week' in df.columns:
        # assumes day_of_week is coded 0=Mon .. 6=Sun -- TODO confirm against
        # the preprocessing step
        day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        success_by_dow = df.groupby('day_of_week')['is_successful'].mean()
        # int() so a float-typed column (e.g. 0.0, 1.0) still indexes the list
        success_by_dow.index = [day_names[int(i)] for i in success_by_dow.index]
        success_by_dow.plot(kind='bar', ax=axes[1,0], color='teal')
        axes[1,0].set_title('Success Rate by Day of Week')
        axes[1,0].set_ylabel('Success Rate')
        axes[1,0].axhline(y=overall_rate, color='red', linestyle='--')
    else:
        axes[1,0].set_visible(False)

    # By month (same outcome-column guard as above)
    if has_outcome and 'month' in df.columns:
        success_by_month = df.groupby('month')['is_successful'].mean()
        success_by_month.plot(kind='bar', ax=axes[1,1], color='purple')
        axes[1,1].set_title('Success Rate by Month')
        axes[1,1].set_ylabel('Success Rate')
        axes[1,1].axhline(y=overall_rate, color='red', linestyle='--')
    else:
        axes[1,1].set_visible(False)

    plt.tight_layout()
    plt.show()

## 7. Correlation Analysis

In [None]:
# Columns of interest for the correlation matrix; only those that are present
# in the frame AND numeric are kept, in the frame's own column order.
columns_of_interest = {
    'funding_goal', 'goal', 'pledged_amount', 'pledged', 'backers_count',
    'funding_ratio', 'avg_reward_price', 'num_updates', 'num_comments',
    'campaign_duration_days', 'duration_days', 'is_successful',
}
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
key_cols = [col for col in numeric_cols if col in columns_of_interest]

# Only draw the heatmap when more than two of the columns are available
if len(key_cols) > 2:
    corr_matrix = df[key_cols].corr()

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='RdBu_r',
        center=0,
        square=True,
        ax=ax,
    )
    ax.set_title('Correlation Matrix', fontsize=14)
    plt.tight_layout()
    plt.show()

## 8. Key Insights

In [None]:
# Text summary of the headline numbers from the sections above.
# Relies on `goal_col` as set in the funding analysis cell.
banner = "=" * 60
print(banner)
print("KEY INSIGHTS")
print(banner)

has_outcome = 'is_successful' in df.columns

# Success rate
if has_outcome:
    print(f"\nOverall Success Rate: {df['is_successful'].mean()*100:.1f}%")

# Goal statistics
goal_values = df[goal_col]
print(f"\nFunding Goal:")
print(f"  Median: ${goal_values.median():,.0f}")
print(f"  Mean: ${goal_values.mean():,.0f}")

# Funding ratio
if 'funding_ratio' in df.columns:
    ratio = df['funding_ratio']
    over_3x = ratio > 3  # campaigns that raised more than 300% of their goal
    print(f"\nFunding Ratio:")
    print(f"  Median: {ratio.median():.2f}")
    print(f"  Campaigns >300%: {over_3x.sum()} ({over_3x.mean()*100:.1f}%)")

# Best performing categories
if 'category' in df.columns and has_outcome:
    rate_by_cat = df.groupby('category')['is_successful'].mean()
    best_cat = rate_by_cat.idxmax()
    best_rate = rate_by_cat.max()
    print(f"\nBest Category: {best_cat} ({best_rate*100:.1f}% success rate)")