# Global Protest Tracker - Analysis

This notebook loads the dataset, performs small ETL steps, and produces summary plots.

In [3]:
from pathlib import Path
import sys
# Ensure repository root is on sys.path so `src` imports work
repo_root = Path('..').resolve()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

import src.etl as etl
import src.visualize as viz
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Input/output locations, relative to the notebook's working directory
DATA_DIR = Path('..') / 'data' / 'raw'
OUT_DIR = Path('..') / 'outputs'
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the primary CSV and let the ETL helper add its extra columns
df = etl.add_columns(etl.load_primary_csv(DATA_DIR))

# Parse a human-readable Duration column into days when possible
def parse_duration(x):
    """Convert a free-text duration into an approximate whole number of days.

    Every "<number> <unit>" component in the text is converted to days
    (day=1, week=7, month=30, year=365) and the components are summed, so
    '1 year, 2 months' yields 425. Quantities may be decimal ('1.5 months')
    and may be joined to the unit with a hyphen ('6-month').

    Parameters
    ----------
    x : Any
        Raw cell value; it is stringified before parsing.

    Returns
    -------
    int or None
        Day count rounded to the nearest integer. None when the value is
        missing, mentions 'Active' (case-insensitive), or contains no digits.
    """
    if pd.isna(x):
        return None
    s = str(x).strip()
    # Values mentioning 'Active' are returned as None rather than parsed
    if re.search(r'Active', s, flags=re.I):
        return None

    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    # Capture the full (optionally decimal) quantity so '1.5 months' is not
    # read as '5 months'; the optional hyphen covers forms like '6-month'.
    parts = re.findall(
        r"(\d+(?:\.\d+)?)\s*-?\s*(day|week|month|year)", s, flags=re.I
    )
    if parts:
        total = sum(float(qty) * days_per_unit[unit.lower()] for qty, unit in parts)
        return int(round(total))

    # No recognised unit word: treat the first bare integer as a day count
    m = re.search(r"(\d+)", s)
    if m:
        return int(m.group(1))
    return None

# Build 'Duration_days': parse the free-text 'Duration' column when it exists,
# otherwise reuse whatever 'duration_days' the frame already carries.
has_text_duration = 'Duration' in df.columns
df['Duration_days'] = (
    df['Duration'].apply(parse_duration)
    if has_text_duration
    else df.get('duration_days')
)

# The visualization calls below are given a frame with lowercase
# 'duration_days'; alias the parsed column only when that name is absent.
needs_alias = 'Duration_days' in df.columns and 'duration_days' not in df.columns
if needs_alias:
    df['duration_days'] = df['Duration_days']

# Duration histogram
viz.plot_duration_hist(df, out_path=OUT_DIR / 'duration_hist.png')

# Horizontal bar chart: number of events per 'Outcomes' label
# (missing labels are grouped under 'Unknown')
if 'Outcomes' in df.columns:
    counts = df['Outcomes'].fillna('Unknown').value_counts()
    # Grow the figure with the number of bars, but never below 4 inches tall
    fig_height = max(4, 0.25 * len(counts))
    fig, ax = plt.subplots(figsize=(10, fig_height))
    sns.barplot(x=counts.values, y=counts.index, ax=ax)
    ax.set_title('Counts by Outcomes')
    ax.set_xlabel('Number of events')
    out_path = OUT_DIR / 'outcomes_counts.png'
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight')
    plt.close(fig)

# Short run summary
print('Rows:', len(df))
print('Saved plots to', OUT_DIR)


Rows: 329
Saved plots to ../outputs


In [None]:
# Cell: Quick summary and sample rows
# Column names, dtypes, the 20 columns with the most missing values, and the
# first five records.
column_names = df.columns.tolist()
print('Columns:', column_names)
print('\nData types:\n', df.dtypes)
missing_counts = df.isna().sum().sort_values(ascending=False)
print('\nMissingness (per column):\n', missing_counts.head(20))
print('\nSample rows:')
print(df.head(5))

# Cell: Missingness heatmap
# Plots df.isna() as a heatmap (one row per record, one column per field) and
# saves it to OUT_DIR. `plt` and `sns` come from the notebook's import cell.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isna(), cbar=False, ax=ax)
ax.set_title('Missingness map')
fig.savefig(OUT_DIR / 'missingness_map.png', bbox_inches='tight')
plt.close(fig)

# Cell: Correlation heatmap for numeric columns (Spearman)
# NOTE(review): the output file is named for Spearman; confirm that
# viz.plot_correlation_heatmap actually uses that method.
# Candidate columns: 'duration_days' plus any column whose name mentions
# "size" or "peak" (case-insensitive).
size_or_peak_cols = [
    name for name in df.columns
    if 'size' in name.lower() or 'peak' in name.lower()
]
numeric_cols = ['duration_days'] + size_or_peak_cols
try:
    # Keep only candidates that are really present in the frame
    plot_cols = [name for name in numeric_cols if name in df.columns]
    if len(plot_cols) > 0:
        viz.plot_correlation_heatmap(df, numeric_cols=plot_cols, out_path=OUT_DIR / 'corr_spearman.png')
except Exception as err:
    # Best-effort plot: report the problem and continue with the notebook
    print('Corr heatmap skipped:', err)

# Cell: Top outcomes bar
# Best-effort: any failure in the helper is reported and the notebook moves on.
try:
    top_outcomes_path = OUT_DIR / 'top_outcomes.png'
    viz.top_outcomes_bar(
        df,
        outcomes_col='Outcomes',
        top_n=30,
        out_path=top_outcomes_path,
    )
except Exception as err:
    print('Top outcomes plot skipped:', err)

# Cell: Scatter/regression - Duration vs Peak Size (if Peak Size column exists)
# Any column whose name contains one of these tokens is a candidate.
peak_candidates = [
    c for c in df.columns
    if any(token in c for token in ('Peak', 'peak', 'Size', 'size'))
]
if peak_candidates:
    # The first candidate is used as the source of the peak-size value
    peak_col = peak_candidates[0]

    def extract_number(s):
        """Return the first run of digits in `s` as an int, or None.

        Commas are removed before searching, so '1,000' yields 1000. Missing
        values, values without digits, and any value that raises while being
        processed all yield None.
        """
        try:
            if pd.isna(s):
                return None
            digits = re.search(r"(\d+)", str(s).replace(',', ''))
            if digits is None:
                return None
            return int(digits.group(1))
        except Exception:
            return None

    df['_peak_num'] = df[peak_col].apply(extract_number)
    # Only plot when more than 10 rows produced a usable number
    n_parsed = df['_peak_num'].notna().sum()
    if n_parsed > 10:
        viz.plot_scatter_with_reg(df, x_col='_peak_num', y_col='duration_days', out_path=OUT_DIR / 'duration_vs_peak.png')

# Cell: Boxplot duration by outcome (top 10 outcomes)
# Best-effort: any failure in the helper is reported and the notebook moves on.
try:
    boxplot_path = OUT_DIR / 'boxplot_duration_outcome.png'
    viz.boxplot_duration_by_outcome(
        df,
        outcomes_col='Outcomes',
        top_n=10,
        out_path=boxplot_path,
    )
except Exception as err:
    print('Boxplot skipped:', err)
