In [1]:
import json
import os
from typing import List, Dict, Tuple
from dotenv import load_dotenv
from openai import OpenAI
import diff_match_patch as dmp_module

# Load variables from a local .env file into the process environment
# (expected to supply the OpenAI API key).
load_dotenv()

# Module-level objects used by the functions defined in later cells.
client = OpenAI()  # Automatically uses OPENAI_API_KEY from environment
dmp = dmp_module.diff_match_patch()  # Myers diff implementation

In [6]:
def get_myers_diff(text_t1: str, text_t2: str) -> List[Dict]:
    """
    Diff two text snapshots and return the edits as structured change records.

    The raw diff is produced by the module-level diff-match-patch instance
    (``dmp.diff_main``) and then passed through ``diff_cleanupSemantic``.
    A deletion immediately followed by an insertion is reported as one
    "modification"; any other deletion or insertion is reported on its own.

    Returns a list of dicts with the keys "type" ("modification", "deletion"
    or "addition"), "old" and "new". Text values are whitespace-stripped and
    the side that does not apply is None. Records whose "old" and "new" are
    both empty/None after stripping are dropped.
    """
    edit_script = dmp.diff_main(text_t1, text_t2)
    dmp.diff_cleanupSemantic(edit_script)  # return value unused; list is read afterwards

    records = []
    already_paired = False  # set when the previous DELETE consumed this entry
    for position, (opcode, fragment) in enumerate(edit_script):
        if already_paired:
            already_paired = False
            continue

        # opcode: -1 = DELETE, 0 = EQUAL, 1 = INSERT (EQUAL is ignored)
        if opcode == -1:
            has_next = position + 1 < len(edit_script)
            upcoming = edit_script[position + 1] if has_next else None
            if upcoming is not None and upcoming[0] == 1:
                # DELETE directly followed by INSERT -> one modification
                records.append({
                    "type": "modification",
                    "old": fragment.strip(),
                    "new": upcoming[1].strip(),
                })
                already_paired = True
            else:
                records.append({
                    "type": "deletion",
                    "old": fragment.strip(),
                    "new": None,
                })
        elif opcode == 1:
            # INSERT that was not paired with a preceding DELETE
            records.append({
                "type": "addition",
                "old": None,
                "new": fragment.strip(),
            })

    # Discard records that carry no text on either side
    return [record for record in records if record["old"] or record["new"]]

In [None]:
# Prompt template used by classify_change(). It is filled in with str.format
# and expects the placeholders {change_type}, {old_text} and {new_text}.
# The model is instructed to answer with exactly one of the five labels
# listed in the template.
CHANGE_CLASSIFICATION_PROMPT = """You are analyzing a change between two Wikipedia article snapshots.

Classify this change into ONE of the following categories:
- FACTUAL_UPDATE: A core fact changed (e.g., CEO changed, status changed, role changed)
- NUMERIC_UPDATE: A number changed (e.g., population, revenue, date, statistics)
- ADDITION: New substantive information was added
- DELETION: Substantive information was removed
- WORDING: Same fact expressed differently (paraphrase, grammar fix, formatting)

Change type: {change_type}
Old text: {old_text}
New text: {new_text}

Respond with ONLY one of: FACTUAL_UPDATE, NUMERIC_UPDATE, ADDITION, DELETION, WORDING

Classification:"""


def classify_change(change: Dict) -> str:
    """
    Use LLM to classify a single change.

    Args:
        change: A change record with a "type" key and optional "old" / "new"
            text values (either may be missing, None or empty).

    Returns:
        One of "FACTUAL_UPDATE", "NUMERIC_UPDATE", "ADDITION", "DELETION",
        "WORDING". If the model's reply is empty or contains none of these
        labels, "WORDING" is returned as the fallback.

    Raises:
        KeyError: if ``change`` has no "type" key.
    """
    # The "old"/"new" keys are usually present with a None value (additions
    # and deletions), so dict.get's default would never apply; use `or` so
    # None and "" both render as "N/A" instead of "None"/blank in the prompt.
    prompt = CHANGE_CLASSIFICATION_PROMPT.format(
        change_type=change["type"],
        old_text=change.get("old") or "N/A",
        new_text=change.get("new") or "N/A"
    )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=20,
        temperature=0
    )

    # message.content may be None; treat that like an empty reply rather
    # than crashing on .strip().
    classification = (response.choices[0].message.content or "").strip()

    # Validate response
    valid_classes = ["FACTUAL_UPDATE", "NUMERIC_UPDATE", "ADDITION", "DELETION", "WORDING"]
    if classification in valid_classes:
        return classification

    # Try to extract a valid class from a noisier reply (extra words, casing)
    normalized = classification.upper()
    for vc in valid_classes:
        if vc in normalized:
            return vc

    return "WORDING"  # Default fallback

In [None]:
def filter_factual_changes(changes: List[Dict], batch_size: int = 10) -> List[Dict]:
    """
    Keep only the changes that the LLM does not label as "WORDING".

    Changes whose "old" and "new" texts are both shorter than 5 characters
    are skipped without being classified. Every classified change gets a
    "classification" key written onto the dict that was passed in.
    ``batch_size`` is accepted but not used by this implementation.

    Returns the list of non-WORDING changes, in input order.
    """
    total = len(changes)
    kept = []

    for position, candidate in enumerate(changes, start=1):
        # Very short edits on both sides are treated as formatting noise
        longest_side = max(
            len(candidate.get("old") or ""),
            len(candidate.get("new") or ""),
        )
        if longest_side < 5:
            continue

        print(f"  Classifying change {position}/{total}...", end="\r")

        label = classify_change(candidate)
        candidate["classification"] = label

        if label == "WORDING":
            continue
        kept.append(candidate)

    print(f"\n  Found {len(kept)} factual changes out of {total} total changes")
    return kept

In [None]:
def detect_factual_changes(wiki_snapshot_t1: str, wiki_snapshot_t2: str) -> List[Dict]:
    """
    Entry point: find the factual changes between two Wikipedia snapshots.

    First runs get_myers_diff() on the two texts, then hands the resulting
    changes to filter_factual_changes() so that only non-wording changes
    remain. Progress is reported via print().

    Returns the classified factual changes, or an empty list when the diff
    yields no changes.
    """
    print("Step 1: Computing Myers diff...")
    raw_changes = get_myers_diff(wiki_snapshot_t1, wiki_snapshot_t2)
    print(f"  Found {len(raw_changes)} raw changes")

    if raw_changes:
        print("\nStep 2: Filtering for factual changes...")
        return filter_factual_changes(raw_changes)

    # Nothing to classify
    print("  No changes detected")
    return []


def summarize_changes(factual_changes: List[Dict]) -> Dict:
    """
    Group changes by their "classification" and print a per-category count.

    Only the categories FACTUAL_UPDATE, NUMERIC_UPDATE, ADDITION and DELETION
    are collected; a change with any other (or missing) classification is
    left out of the result.

    Returns a dict mapping each of the four categories to its list of changes.
    """
    categories = ("FACTUAL_UPDATE", "NUMERIC_UPDATE", "ADDITION", "DELETION")
    buckets = {label: [] for label in categories}

    for entry in factual_changes:
        # A missing classification is treated as "WORDING", which has no bucket
        bucket = buckets.get(entry.get("classification", "WORDING"))
        if bucket is not None:
            bucket.append(entry)

    print("\n=== Change Summary ===")
    for label, members in buckets.items():
        print(f"{label}: {len(members)} changes")

    return buckets

In [5]:
# Test Myers diff on 10 entities from Wikipedia data
import glob

data_dir = "data/wikipedia"

# Get 10 entities from people domain
entities = [
    "Elon_Musk", "Sam_Altman", "Taylor_Swift", "LeBron_James", "Joe_Biden",
    "Mark_Zuckerberg", "Sundar_Pichai", "Lionel_Messi", "Donald_Trump", "Jensen_Huang"
]

# Compare 2024-01-01 vs 2025-01-01 snapshots
for entity in entities:
    path_t1 = f"{data_dir}/people/{entity}/2024-01-01.json"
    path_t2 = f"{data_dir}/people/{entity}/2025-01-01.json"

    # Only the file reads can raise FileNotFoundError, so keep the try narrow.
    # The encoding is passed explicitly so reading does not depend on the
    # platform's default locale encoding (JSON text is expected to be UTF-8).
    try:
        with open(path_t1, encoding="utf-8") as f:
            snapshot_t1 = json.load(f).get("content", "")
        with open(path_t2, encoding="utf-8") as f:
            snapshot_t2 = json.load(f).get("content", "")
    except FileNotFoundError as e:
        print(f"\n{entity}: Files not found - {e}")
        continue

    # Run Myers diff
    changes = get_myers_diff(snapshot_t1, snapshot_t2)

    print(f"\n{'='*60}")
    print(f"Entity: {entity}")
    print(f"  Raw changes detected: {len(changes)}")

    # Show breakdown by type
    mods = sum(1 for c in changes if c["type"] == "modification")
    adds = sum(1 for c in changes if c["type"] == "addition")
    dels = sum(1 for c in changes if c["type"] == "deletion")
    print(f"  Modifications: {mods}, Additions: {adds}, Deletions: {dels}")

    # Show first 3 changes as examples (each side truncated to 50 characters)
    if changes:
        print("  Sample changes:")
        for c in changes[:3]:
            old = (c.get("old") or "")[:50]
            new = (c.get("new") or "")[:50]
            print(f"    [{c['type']}] '{old}...' -> '{new}...'")


Entity: Elon_Musk
  Raw changes detected: 35
  Modifications: 16, Additions: 12, Deletions: 7
  Sample changes:
    [modification] 'B...' -> 'South African-born b...'
    [modification] 'o...' -> 'O...'
    [modification] 'pp-vandalism|small=yes}}
{{pp-move}}
{{Good articl...' -> 'Good article}}
{{pp-move}}
{{pp-vandalism|small=ye...'

Entity: Sam_Altman
  Raw changes detected: 10
  Modifications: 4, Additions: 4, Deletions: 2
  Sample changes:
    [addition] '...' -> '{{Short description|American entrepreneur and inve...'
    [deletion] '{{Short description|American entrepreneur and inve...' -> '...'
    [deletion] 'File:...' -> '...'

Entity: Taylor_Swift
  Raw changes detected: 48
  Modifications: 22, Additions: 18, Deletions: 8
  Sample changes:
    [modification] 'For|the album|Taylor Swift (album){{!}}''Taylor Sw...' -> 'Other uses...'
    [modification] 'October...' -> 'March...'
    [modification] '0...' -> '4...'

Entity: LeBron_James
  Raw changes detected: 83
  Modification