# Extracteur Archivportal-D - Bürgerinitiativen

Ce notebook extrait automatiquement les initiatives citoyennes depuis archivportal-d.de

**Colonnes extraites:**
- Titre de l'initiative
- Période (année ou laps de temps)
- Lieu
- Institution (archive mentionnée dans la notice, si détectée)
- URL de la notice

---

## Mode d'emploi

1. Clique sur **Exécution > Tout exécuter** (ou Ctrl+F9)
2. Attends la fin (~2-3 minutes)
3. Le fichier CSV se télécharge automatiquement

In [None]:
#@title 1. Installation des dépendances (30 secondes)
# Notebook shell command: quietly install the third-party packages that the
# scraper cell imports (aiohttp, beautifulsoup4, tqdm).
!pip install -q aiohttp beautifulsoup4 tqdm
print("Dépendances installées !")

In [None]:
#@title 2. Code du scraper
import asyncio
import aiohttp
import csv
import hashlib
import re
from dataclasses import dataclass, asdict
from typing import Optional
from urllib.parse import urljoin, quote
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Configuration
# Root of the site; relative result links are resolved against it.
BASE_URL = "https://www.archivportal-d.de"
# Search endpoint that the paginated result URLs are built from.
SEARCH_URL = f"{BASE_URL}/objekte"
# Search term; it is URL-quoted before being placed in the query string.
QUERY = "Bürgerinitiativen"
# Value of the `rows` URL parameter and step between successive `offset`s.
ROWS_PER_PAGE = 100
# Size of the request semaphore and of the aiohttp connection pool.
MAX_CONCURRENT = 20
# Passed as `total` to aiohttp.ClientTimeout for each request.
TIMEOUT = 30

# Lower-case place names looked up as whole words by extract_location():
# cities and smaller localities, federal states (with a few English spellings)
# and regions. Entries must stay lower-case because the searched text is
# lower-cased before matching.
GERMAN_CITIES = {
    'berlin', 'hamburg', 'münchen', 'munich', 'köln', 'cologne', 'frankfurt',
    'stuttgart', 'düsseldorf', 'dortmund', 'essen', 'leipzig', 'bremen',
    'dresden', 'hannover', 'nürnberg', 'duisburg', 'bochum', 'wuppertal',
    'bielefeld', 'bonn', 'münster', 'karlsruhe', 'mannheim', 'augsburg',
    'wiesbaden', 'gelsenkirchen', 'mönchengladbach', 'braunschweig', 'chemnitz',
    'kiel', 'aachen', 'halle', 'magdeburg', 'freiburg', 'krefeld', 'lübeck',
    'oberhausen', 'erfurt', 'mainz', 'rostock', 'kassel', 'hagen', 'hamm',
    'saarbrücken', 'mülheim', 'potsdam', 'ludwigshafen', 'oldenburg', 'leverkusen',
    'osnabrück', 'solingen', 'heidelberg', 'herne', 'neuss', 'darmstadt',
    'paderborn', 'regensburg', 'ingolstadt', 'würzburg', 'wolfsburg', 'ulm',
    'heilbronn', 'pforzheim', 'göttingen', 'bottrop', 'trier', 'recklinghausen',
    'reutlingen', 'bremerhaven', 'koblenz', 'bergisch gladbach', 'jena',
    'remscheid', 'erlangen', 'moers', 'siegen', 'hildesheim', 'salzgitter',
    'dormagen', 'wertheim', 'aichelberg', 'wyhl', 'gorleben', 'brokdorf',
    'kalkar', 'wackersdorf', 'grohnde', 'biblis', 'neckarwestheim',
    'baden-württemberg', 'bayern', 'bavaria', 'brandenburg', 'hessen',
    'mecklenburg-vorpommern', 'niedersachsen', 'nordrhein-westfalen', 'nrw',
    'rheinland-pfalz', 'saarland', 'sachsen', 'sachsen-anhalt', 'schleswig-holstein',
    'thüringen', 'rhein-kreis', 'schwarzwald', 'eifel', 'hunsrück', 'taunus',
}

@dataclass
class Initiative:
    """One record extracted from a search-result entry."""

    # Title text of the result link.
    titre: str
    # Period string (single year, year range, date or a placeholder).
    periode: str
    # Location string (or a placeholder when none was recognised).
    lieu: str
    # Absolute link to the record; empty when unknown.
    url: str = ""
    # Holding institution; empty when none was recognised.
    institution: str = ""

    def to_dict(self):
        """Return all fields as a plain ``dict`` keyed by field name."""
        record = asdict(self)
        return record

    def hash_key(self) -> str:
        """Return an MD5 hex digest identifying this record for de-duplication.

        Only the title, period and location take part; title and location
        are lower-cased and stripped first. ``url`` and ``institution`` are
        ignored.
        """
        normalized_title = self.titre.lower().strip()
        normalized_place = self.lieu.lower().strip()
        fingerprint = f"{normalized_title}|{self.periode}|{normalized_place}"
        return hashlib.md5(fingerprint.encode()).hexdigest()


class ArchivportalScraper:
    """Asynchronous scraper for the Archivportal-D object search.

    Intended to be used as an async context manager, which opens and closes
    the HTTP session around the scrape::

        async with ArchivportalScraper() as scraper:
            records = await scraper.scrape_all()

    Attributes:
        results: unique records collected so far.
        duplicates: number of records skipped because an equal record
            (same ``hash_key()``) was already collected.
        errors: number of page requests that ultimately failed.
    """

    # Links to individual records on a result page contain "/item/".
    _ITEM_LINK = re.compile(r'/item/')
    # Digit look-arounds keep long numeric identifiers from being read as dates.
    _YEAR_RANGE = re.compile(r'(?<!\d)(\d{4})\s*[-–]\s*(\d{4})(?!\d)')
    _FULL_DATE = re.compile(r'(?<!\d)(\d{2}\.\d{2}\.\d{4})(?!\d)')
    _SINGLE_YEAR = re.compile(r'\b(19\d{2}|20[0-2]\d)\b')
    # Display forms for names that ``str.title()`` would render incorrectly.
    _LOCATION_LABELS = {'nrw': 'NRW'}
    # Number of result pages requested per ``asyncio.gather`` call.
    _BATCH_SIZE = 10

    def __init__(self):
        self.session: Optional[aiohttp.ClientSession] = None
        self.results: list[Initiative] = []
        self.seen_hashes: set[str] = set()
        self.duplicates = 0
        self.errors = 0
        self.semaphore = asyncio.Semaphore(MAX_CONCURRENT)

    async def __aenter__(self):
        """Open the shared HTTP session and return the scraper itself."""
        timeout = aiohttp.ClientTimeout(total=TIMEOUT)
        connector = aiohttp.TCPConnector(limit=MAX_CONCURRENT)
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ArchivScraper/1.0)',
            'Accept': 'text/html',
            'Accept-Encoding': 'gzip, deflate',
        }
        self.session = aiohttp.ClientSession(timeout=timeout, connector=connector, headers=headers)
        return self

    async def __aexit__(self, *args):
        """Close the HTTP session if one was opened."""
        if self.session:
            await self.session.close()
            self.session = None

    async def fetch(self, url: str, retries: int = 3) -> Optional[str]:
        """Download ``url`` and return the response body as text.

        Args:
            url: address to request.
            retries: maximum number of attempts.

        Returns:
            The body on HTTP 200, otherwise ``None``. A 429 response is
            retried with exponential back-off, a network error or timeout
            after one second; any other status gives up immediately. Every
            request that ends without a body increments ``errors``.

        Raises:
            RuntimeError: if no session is open (scraper used outside
                ``async with``).
        """
        if self.session is None:
            raise RuntimeError(
                "Session non initialisée : utilisez 'async with ArchivportalScraper()'"
            )
        async with self.semaphore:
            for attempt in range(retries):
                try:
                    async with self.session.get(url) as response:
                        if response.status == 200:
                            return await response.text()
                        if response.status != 429:
                            self.errors += 1
                            return None
                        delay = 2 ** attempt
                # Only transport/decoding failures are retried; cancellation
                # and programming errors must propagate.
                except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError):
                    delay = 1
                # No point waiting after the final attempt.
                if attempt < retries - 1:
                    await asyncio.sleep(delay)
            # Retries exhausted: record the failure so the summary shows it.
            self.errors += 1
            return None

    async def get_total_results(self) -> int:
        """Return the hit count announced by the search page, or 0 if unknown.

        The count is read from the first "of <number>" occurrence in the
        page text of a one-row query; commas are treated as thousands
        separators.
        """
        url = f"{SEARCH_URL}?lang=en&query={quote(QUERY)}&offset=0&rows=1"
        html = await self.fetch(url)
        if not html:
            return 0
        soup = BeautifulSoup(html, 'html.parser')
        # Require a leading digit so a stray comma can never reach int().
        match = re.search(r'of\s+(\d[\d,]*)', soup.get_text())
        if match:
            return int(match.group(1).replace(',', ''))
        return 0

    def extract_date(self, text: str) -> str:
        """Extract a period from free text.

        Tried in order: a year range ("1975-1980"), a full date
        ("12.03.1981"), a single year between 1900 and 2029, then an explicit
        "undated" marker. Returns "Non datée" for the marker and
        "Non spécifiée" when nothing is found or ``text`` is empty.
        """
        if not text:
            return "Non spécifiée"
        text = text.strip()
        match = self._YEAR_RANGE.search(text)
        if match:
            return f"{match.group(1)}-{match.group(2)}"
        match = self._FULL_DATE.search(text)
        if match:
            return match.group(1)
        match = self._SINGLE_YEAR.search(text)
        if match:
            return match.group(1)
        if any(nd in text.lower() for nd in ['ohne datum', 'undatiert', 's.d.']):
            return "Non datée"
        return "Non spécifiée"

    def extract_location(self, text: str, title: str = "") -> str:
        """Guess a location from a record's title and metadata text.

        Tried in order:
        1. a whole-word match of a ``GERMAN_CITIES`` entry in title + text;
           when several entries match, the one occurring first wins, and at
           the same position the longest one;
        2. a capitalised word following "in" or "Region";
        3. the name following "Stadtarchiv"/"Landesarchiv"/"Archiv" in
           ``text``, returned as "(Archive: <name>)".

        Returns "Non spécifié" when nothing matches.
        """
        combined = f"{title} {text}".lower()
        # Set iteration order is arbitrary, so rank the candidates explicitly
        # instead of returning whichever entry happens to be visited first.
        best = None
        for city in GERMAN_CITIES:
            if city not in combined:  # cheap substring pre-filter
                continue
            found = re.search(r'\b' + re.escape(city) + r'\b', combined)
            if not found:
                continue
            candidate = (found.start(), -len(city), city)
            if best is None or candidate < best:
                best = candidate
        if best is not None:
            city = best[2]
            return self._LOCATION_LABELS.get(city, city.title())
        full_text = f"{title} {text}"
        for pattern in [r'\bin\s+([A-ZÄÖÜ][a-zäöüß]+)', r'\bRegion\s+([A-ZÄÖÜ][a-zäöüß]+)']:
            match = re.search(pattern, full_text)
            # Skip articles/conjunctions that merely happen to be capitalised.
            if match and match.group(1).lower() not in ['der', 'die', 'das', 'und']:
                return match.group(1)
        archive_match = re.search(r'(?:Stadtarchiv|Landesarchiv|Archiv)\s+([A-ZÄÖÜ][a-zäöüß-]+)', text)
        if archive_match:
            return f"(Archive: {archive_match.group(1)})"
        return "Non spécifié"

    def extract_institution(self, text: str) -> str:
        """Return the first "...archiv ..." phrase in ``text``, or "" if none.

        The phrase runs from the (optionally prefixed) word "Archiv"/"archiv"
        up to the next comma or line break.
        """
        match = re.search(r'((?:Stadt|Landes|Bundes)?[Aa]rchiv[^,\n]+)', text)
        if match:
            return match.group(1).strip()
        return ""

    def _initiative_from_link(self, link, container_text: str) -> Optional[Initiative]:
        """Build a record from one result link and the text of its container.

        Returns ``None`` when the link has no text (for example a link that
        only wraps an image), since such a record would have no title.
        """
        titre = link.get_text(strip=True)
        if not titre:
            return None
        url = urljoin(BASE_URL, link.get('href', ''))
        # Whatever remains after removing the title is treated as metadata.
        meta_text = container_text.replace(titre, '', 1).strip()
        return Initiative(
            titre=titre,
            periode=self.extract_date(meta_text),
            lieu=self.extract_location(meta_text, titre),
            url=url,
            institution=self.extract_institution(meta_text)
        )

    def parse_list_item(self, item_html: str) -> Optional[Initiative]:
        """Parse the HTML of one result entry.

        Returns ``None`` when the fragment has no "/item/" link or the first
        such link has no text.
        """
        soup = BeautifulSoup(item_html, 'html.parser')
        link = soup.find('a', href=self._ITEM_LINK)
        if not link:
            return None
        return self._initiative_from_link(link, soup.get_text(' ', strip=True))

    async def parse_list_page(self, html: str) -> list[Initiative]:
        """Extract one record per distinct "/item/" link of a result page.

        Each link is parsed together with the text of its nearest
        ``li``/``div``/``article``/``tr`` ancestor. The record is built from
        the link being visited, so a container holding several links no
        longer yields the first link's data repeatedly.
        """
        soup = BeautifulSoup(html, 'html.parser')
        results = []
        seen_hrefs = set()
        for link in soup.find_all('a', href=self._ITEM_LINK):
            href = link.get('href', '')
            if href in seen_hrefs:
                continue
            container = link.find_parent(['li', 'div', 'article', 'tr']) or link.parent or link
            initiative = self._initiative_from_link(link, container.get_text(' ', strip=True))
            if initiative:
                # Marked only on success so that a text-less link does not
                # hide a later titled link to the same record.
                seen_hrefs.add(href)
                results.append(initiative)
        return results

    def add_result(self, initiative: Initiative) -> bool:
        """Store ``initiative`` unless an equal record was already stored.

        Returns ``True`` if it was added, ``False`` if it was counted as a
        duplicate.
        """
        key = initiative.hash_key()
        if key in self.seen_hashes:
            self.duplicates += 1
            return False
        self.seen_hashes.add(key)
        self.results.append(initiative)
        return True

    async def scrape_all(self) -> list[Initiative]:
        """Fetch every result page and return the unique records found.

        Prints progress and a final summary. Returns an empty list when the
        total number of results cannot be determined.
        """
        print("Récupération du nombre de résultats...")
        total = await self.get_total_results()
        if total == 0:
            print("Erreur: impossible de récupérer les résultats")
            return []
        print(f"{total} résultats trouvés")
        print("\nExtraction en cours...")

        pages = (total + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE
        urls = [f"{SEARCH_URL}?lang=en&query={quote(QUERY)}&offset={i * ROWS_PER_PAGE}&rows={ROWS_PER_PAGE}" for i in range(pages)]

        pbar = tqdm(total=total, desc="Progression")

        async def process_page(url: str):
            html = await self.fetch(url)
            if html:
                items = await self.parse_list_page(html)
                for item in items:
                    if self.add_result(item):
                        pbar.update(1)

        # Close the progress bar even if a page raises, so the notebook
        # output is not left with a dangling widget.
        try:
            for i in range(0, len(urls), self._BATCH_SIZE):
                batch = urls[i:i + self._BATCH_SIZE]
                await asyncio.gather(*[process_page(url) for url in batch])
        finally:
            pbar.close()

        print(f"\n{'='*50}")
        print(f"Terminé: {len(self.results)} initiatives extraites")
        if self.duplicates > 0:
            print(f"Doublons ignorés: {self.duplicates}")
        if self.errors > 0:
            print(f"Erreurs: {self.errors}")
        print(f"{'='*50}")
        return self.results

# Confirmation printed once this cell's definitions have been executed.
print("Code chargé !")

In [None]:
#@title 3. Lancer l'extraction (~2-3 minutes)
async def run_scraper():
    async with ArchivportalScraper() as scraper:
        await scraper.scrape_all()
        return scraper.results

results = await run_scraper()

In [None]:
#@title 4. Télécharger le fichier CSV
from google.colab import files

# Write the CSV: a header row, then one row per extracted record, with the
# columns in a fixed order.
filename = "burgerinitiativen.csv"
columns = ['titre', 'periode', 'lieu', 'institution', 'url']
with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=columns)
    writer.writeheader()
    writer.writerows(init.to_dict() for init in results)

print(f"Fichier créé: {filename}")
print(f"Nombre de lignes: {len(results)}")
print("\nTéléchargement automatique...")

# Hand the file to the browser for download.
files.download(filename)

In [None]:
#@title 5. (Optionnel) Aperçu des données
import pandas as pd
# Build a table from the extracted records; the last expression displays
# its first 20 rows in the notebook.
rows = [record.to_dict() for record in results]
df = pd.DataFrame(rows)
df.head(20)