In [1]:
import os
import re
import time
import requests
from urllib.parse import urlparse, unquote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager



In [2]:

def init_driver(download_dir="newspaper_downloads"):
    """
    Initializes the download directory and sets up the Selenium Chrome driver.
    
    Returns:
        driver: Selenium webdriver instance.
        wait: WebDriverWait instance for the driver.
    """
    os.makedirs(download_dir, exist_ok=True)
    
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless if you don't need to see the browser
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")
    
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )
    wait = WebDriverWait(driver, 20)
    return driver, wait



In [3]:

def extract_jp2_from_manifest_data(manifest_url):
    """
    Extract JP2 file URLs directly from the manifest data.
    
    Args:
        manifest_url (str): The URL for the manifest.
    
    Returns:
        list: List of JP2 file URLs.
    """
    try:
        print(f"Fetching manifest data from: {manifest_url}")
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36'),
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://tidningar.kb.se/',
        }
        
        response = requests.get(f"{manifest_url}/manifest", headers=headers)
        response.raise_for_status()
        
        manifest_data = response.json()
        jp2_urls = []
        
        # Extract JP2 URLs from the manifest items
        if 'items' in manifest_data:
            for canvas in manifest_data['items']:
                if 'items' in canvas:
                    for annotation_page in canvas['items']:
                        if 'items' in annotation_page:
                            for annotation in annotation_page['items']:
                                if 'body' in annotation and 'id' in annotation['body']:
                                    body_id = annotation['body']['id']
                                    if body_id.endswith('.jp2'):
                                        jp2_urls.append(body_id)
        
        return jp2_urls
    
    except Exception as e:
        print(f"Error extracting JP2 files from manifest: {e}")
        return []


def download_file(url, filepath):
    """
    Download a file from URL to the specified filepath.
    
    Args:
        url (str): URL of the file to download.
        filepath (str): Destination file path.
        
    Returns:
        bool: True if download is successful, otherwise False.
    """
    try:
        print(f"Downloading {url} to {filepath}")
        
        # Skip download if file already exists
        if os.path.exists(filepath):
            print(f"File already exists: {filepath}")
            return True
        
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36'),
            'Referer': 'https://tidningar.kb.se/',
            'Accept': 'image/jpeg, image/png, image/jp2, */*'
        }
        
        # Clean the URL in case of malformed characters
        clean_url = url.replace('\\\\', '/')
        
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = requests.get(clean_url, headers=headers, stream=True, timeout=30)
                response.raise_for_status()
                
                # Create the directory if it doesn't exist
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                
                print(f"Downloaded: {filepath}")
                return True
                
            except (requests.exceptions.RequestException, requests.exceptions.Timeout) as req_err:
                if attempt < max_retries - 1:
                    print(f"Retry {attempt+1}/{max_retries} downloading {clean_url}: {req_err}")
                    time.sleep(2)  # Wait before retrying
                else:
                    raise
                
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False



In [6]:

def scrape_by_date_range(driver, wait, download_dir, start_date, end_date, paper_id=None):
    """
    Scrape newspapers within a given date range.
    
    Args:
        driver: Selenium webdriver instance.
        wait: WebDriverWait instance.
        download_dir (str): Base directory to save downloads.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
        paper_id (str, optional): Optional paper ID filter.
    """
    base_url = "https://tidningar.kb.se/search?q=%2a"
    url = f"{base_url}&from={start_date}&to={end_date}"
    
    # Add paper filter if provided, otherwise use default
    if paper_id:
        url += f"&isPartOf.%40id={paper_id}"
    else:
        url += "&isPartOf.%40id=https%3A%2F%2Flibris.kb.se%2Fm5z2w4lz3m2zxpk%23it"
    
    print(f"Using search URL: {url}")
    
    driver.get(url)
    time.sleep(3)  # Allow page to load

    # Find and process each newspaper result
    results = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.search-result-item"))
    )
    
    print(f"Found {len(results)} newspaper issues")
    
    for i, result in enumerate(results):
        try:
            # Extract date and title before clicking
            newspaper_title = None
            newspaper_date = None
            
            date_elements = result.find_elements(By.CSS_SELECTOR, "div.date-text")
            if date_elements:
                newspaper_date = date_elements[0].text.strip()
            
            title_elements = result.find_elements(By.CSS_SELECTOR, "div.search-result-title")
            if title_elements:
                newspaper_title = title_elements[0].text.strip()
            
            print(f"Processing issue {i+1}/{len(results)}: {newspaper_title} - {newspaper_date}")
            
            # Click on the result to open the newspaper issue
            result.click()
            time.sleep(3)  # Wait for the issue page to load
            
            current_url = driver.current_url
            
            try:
                print("Extracting manifest data from page source")
                page_source = driver.page_source
                manifest_pattern = r'"id":"(https:\\\/\\\/data\.kb\.se\\\/([^\/\\]+)\\\/manifest)"'
                match = re.search(manifest_pattern, page_source)
                
                if match:
                    manifest_id = match.group(2)  # Extract the ID portion
                    print(f"Found manifest ID: {manifest_id}")
                    manifest_url = f"https://data.kb.se/{manifest_id}"
                else:
                    # Alternative extraction method from URL path
                    parsed_url = urlparse(current_url)
                    path_parts = parsed_url.path.split('/')
                    if len(path_parts) > 1:
                        manifest_id = path_parts[1]
                        print(f"Extracted manifest ID from URL: {manifest_id}")
                        manifest_url = f"https://data.kb.se/{manifest_id}"
                    else:
                        raise Exception("Could not extract manifest ID")
                    
                    # Navigate to the manifest page
                    driver.get(manifest_url)
                    time.sleep(2)
                    
                    # If newspaper info is missing, attempt to extract from the page title
                    if not newspaper_date or not newspaper_title:
                        page_title = driver.title
                        title_match = re.search(r'([^|]+)', page_title)
                        if title_match:
                            combined_title = title_match.group(1).strip()
                            parts = combined_title.split()
                            if len(parts) >= 2:
                                newspaper_title = ' '.join(parts[:-1])
                                newspaper_date = parts[-1]
                    
                    # Clean newspaper_title for folder naming
                    newspaper_title = re.sub(r'[^\w\s-]', '', newspaper_title).strip() if newspaper_title else "Unknown"
                    newspaper_date = newspaper_date.replace('/', '-') if newspaper_date else "Unknown_Date"
                    
                    # Create folder for this newspaper issue
                    folder_path = os.path.join(download_dir, newspaper_title, newspaper_date)
                    os.makedirs(folder_path, exist_ok=True)
                    
                    # Try different methods to extract JP2 file URLs
                    
                    # Method 1: Extract from page source JSON data
                    page_source = driver.page_source
                    jp2_pattern = r'"id":"(https:\\\/\\\/data\.kb\.se\\\/[^"]+\.jp2)"'
                    jp2_matches = re.findall(jp2_pattern, page_source)
                    jp2_urls = [url.replace('\/', '/') for url in jp2_matches]
                    
                    print(f"Method 1: Found {len(jp2_urls)} JP2 files in page source")
                    
                    # Method 2: Use the manifest data extraction function if needed
                    if not jp2_urls:
                        jp2_urls = extract_jp2_from_manifest_data(manifest_url)
                        print(f"Method 2: Found {len(jp2_urls)} JP2 files from manifest data")
                    
                    # Method 3: Look for links on the page if still nothing
                    if not jp2_urls:
                        jp2_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='.jp2']")
                        if jp2_links:
                            jp2_urls = [link.get_attribute('href') for link in jp2_links]
                            print(f"Method 3: Found {len(jp2_urls)} JP2 files from page links")
                    
                    # Download each JP2 file
                    for file_url in jp2_urls:
                        # Clean up the URL formatting
                        if '\\' in file_url:
                            file_url = file_url.replace('\\', '')
                        filename = os.path.basename(unquote(file_url))
                        file_path = os.path.join(folder_path, filename)
                        download_file(file_url, file_path)
                        
            except Exception as e:
                print(f"Error processing manifest: {e}")
            
            # Return to the search results
            driver.back()
            time.sleep(2)
            
            # If necessary, re-navigate to the search page
            if "search" not in driver.current_url:
                driver.get(url)
                time.sleep(3)
            
            # Refresh the results list
            results = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.search-result-item"))
            )
            
        except Exception as e:
            print(f"Error processing newspaper issue: {e}")
            driver.get(url)
            time.sleep(3)
            results = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.search-result-item"))
            )



  jp2_urls = [url.replace('\/', '/') for url in jp2_matches]


In [5]:

# Example usage in a notebook:
if __name__ == "__main__":
    # Initialize the driver and create the download directory "kb_newspapers"
    driver, wait = init_driver(download_dir="kb_newspapers")
    
    try:
        # Test the scraper for newspapers between 1865-01-02 and 1865-01-03
        scrape_by_date_range(driver, wait, "kb_newspapers", "1865-01-02", "1865-01-03")
    finally:
        # Close the driver when done
        driver.quit()


Using search URL: https://tidningar.kb.se/search?q=%2a&from=1865-01-02&to=1865-01-03&isPartOf.%40id=https%3A%2F%2Flibris.kb.se%2Fm5z2w4lz3m2zxpk%23it
Found 2 newspaper issues
Processing issue 1/2: None - None
Extracting manifest data from page source
Extracted manifest ID from URL: l4x2vhvx1jj0mch
Method 1: Found 0 JP2 files in page source
Fetching manifest data from: https://data.kb.se/l4x2vhvx1jj0mch
Error extracting JP2 files from manifest: 404 Client Error: Not Found for url: https://data.kb.se/l4x2vhvx1jj0mch/manifest
Method 2: Found 0 JP2 files from manifest data
Error processing newspaper issue: Message: stale element reference: stale element not found
  (Session info: chrome=134.0.6998.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x0000000100631804 cxxbridge1$str$ptr + 2785964
1   chromedriver                       