This file contains all the code used to scrape the images from the Munich website; the grayscale preprocessing step is included below.

Formatting of resulting file names is as follows:

0062_327_id=cp154815_badv.jpg (random munich image)

The first four-digit number (0062) is the page on which the image can be found on the Munich website.\
The second number (327) is very important: it is the Munich number linked to the image.\
id=cp154815 is the ID of the image as stored on the Munich website,\
and the last value (badv) is the folder it comes from on the Munich website.

> This naming is used to link the images with the names and to be able to find the corresponding image again; the Munich number is the most important value here.


The code below is our way of scraping: we request the Munich website with the required cookie. With the Python package BeautifulSoup we can easily find all image tags on a page. The program also looks through the HTML to find the matching Munich numbers, which are added to the file names of the images. All pages are scraped automatically by moving on to the next page (only 20 images are displayed on a page at a time).

All images are downloaded into the scraped_images folder; later they are grayscaled and, with the small images removed, written to the scraped_images_grayscaled_big folder.

It takes a few hours to run, as it downloads 47000 images from 2388 different HTML pages.


In [None]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
from skimage.metrics import structural_similarity as ssim
from skimage import io
import numpy as np
from io import BytesIO
import cv2


# Number of result pages to walk through on the Munich website
# (20 images are listed per page).
AMOUNT_OF_PAGES = 2388

# The cookie that is sent with every request; it is needed to be able to
# download the images.
cookies = {'PHPSESSID': 'u6h1scn6knqdj77di40e2n6uo3'}

# Collects the (height, width) of every image that is downloaded.
all_sizes = []

# Directory to save images to. exist_ok=True makes this a no-op when the
# directory is already there, so no separate existence check (and no
# check-then-create race) is needed.
img_dir = "scraped_images"
os.makedirs(img_dir, exist_ok=True)

# Reference "keinbild" image; downloads that match it are filtered out.
keinbild_path = "keinbild.jpg"
keinbild_image = io.imread(keinbild_path)

def kein_bild_test(image1, image2):
    """Tell whether image1 is a "keinbild" image.

    Args:
        image1: Image array to check.
        image2: Reference keinbild image array.

    Returns:
        False when the two arrays differ in shape; otherwise whether their
        structural similarity (SSIM) exceeds 0.9. True means the image is a
        keinbild image and should not be added to the dataset.
    """
    # Images of a different size (or channel count) cannot be the reference
    # image, and SSIM requires equally shaped inputs anyway.
    if image1.shape != image2.shape:
        return False

    # 3-D arrays are treated as colour images with the channels on the last
    # axis (as the previous `multichannel=True` did); 2-D arrays are plain
    # grayscale and have no channel axis. `channel_axis` is the keyword that
    # replaced the deprecated `multichannel` flag in scikit-image 0.19.
    channel_axis = -1 if image1.ndim == 3 else None

    # See if it is similar to the keinbild image
    similarity_index = ssim(image1, image2, channel_axis=channel_axis)
    threshold = 0.9

    # Returns true if image is a keinbild image and it will not get added to the dataset.
    return similarity_index > threshold
# Walk over every result page, download each listed image and store it in
# img_dir as PAGE_MUNICHNO_BASENAME.jpg. Entries whose Munich number is '-'
# and images that match the keinbild reference image are not saved.
try:
    for i in range(AMOUNT_OF_PAGES):
        # keep track of progress
        print(i)
        # URL of the webpage to scrape; `current` advances by 20 per page, so
        # this automatically goes to the next page.
        url = f"https://www.dhm.de/datenbank/ccp/dhm_ccp.php?seite=8&current={i * 20}"
        print(url)

        try:
            # Send a GET request to the webpage with cookies
            response = requests.get(url, cookies=cookies, timeout=10)
            response.raise_for_status()  # Check if the request was successful

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all image tags and the tables holding the Munich numbers
            img_tags = soup.find_all('img')
            tables = soup.find_all('table', class_='karteikarte')

            # Position of the table that belongs to the current image
            index = 0
            for img in img_tags:
                img_url = img.get('src')

                # Only 'displayimg' sources are database images
                if not img_url or 'displayimg' not in img_url:
                    continue

                find_id = img_url.find('id=')
                # If there are multiple images linked to one munich number our
                # indexing breaks. This checks for this scenario: such an image
                # reuses the table of the previous image.
                # NOTE(review): relies on the character 12 positions after
                # 'id=' being '0' for the first image of an entry - confirm
                # against the site's URL format.
                if img_url[find_id + 12] != '0':
                    index -= 1

                # Find the munich number and add it to the image name to find it easier later on.
                munich_no = tables[index].find_all('td', class_='value')[0].get_text(strip=True)
                # '/' cannot be part of a file name
                munich_no = munich_no.replace('/', '-')

                index += 1

                # Entries without a Munich number are not downloaded
                if munich_no == '-':
                    continue

                # Construct full URL
                img_url = urljoin(url, img_url)

                try:
                    # Get the image content with cookies
                    img_response = requests.get(img_url, cookies=cookies, timeout=10)
                    img_response.raise_for_status()

                    # Read the image as an array. A response body that is not
                    # a decodable image must not abort the whole run.
                    # NOTE(review): the exception types raised for undecodable
                    # data depend on the imread backend; OSError/ValueError
                    # are assumed here - confirm.
                    try:
                        img_array = io.imread(BytesIO(img_response.content))
                    except (OSError, ValueError) as decode_err:
                        print(f"Failed to decode image {img_url}: {decode_err}")
                        continue

                    # The first two axes are height and width for both
                    # grayscale (2-D) and colour (3-D) arrays.
                    height, width = img_array.shape[:2]
                    all_sizes.append((height, width))

                    # Checks if image is kein bild
                    if kein_bild_test(img_array, keinbild_image):
                        continue

                    # Extract image filename: 'id=' plus the 8 characters after it ...
                    find_id = img_url.find('id=')
                    img_basename = img_url[find_id: find_id + 11]

                    # ... followed by everything after 'folder='
                    find_folder = img_url.find('folder=')
                    img_basename = img_basename + '_' + img_url[find_folder + 7:]

                    # Start the image name with what page it can be found on the munich database
                    # FORMAT:
                    # PAGE_MUNICHNO_BASENAME
                    # The page number is 1-based and zero-padded to 4 digits.
                    # Padding the number that is actually printed (i + 1)
                    # keeps pages 10, 100 and 1000 at 4 digits as well.
                    img_name = f"{i + 1:04d}_{munich_no}_{img_basename}"

                    # Ensure the file has a .jpg extension
                    if not img_name.endswith('.jpg'):
                        img_name += '.jpg'

                    img_path = os.path.join(img_dir, img_name)

                    # Save the image exactly as it was downloaded
                    with open(img_path, 'wb') as img_file:
                        img_file.write(img_response.content)

                except requests.exceptions.RequestException as img_err:
                    print(f"Failed to download image {img_url}: {img_err}")

        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
# Can stop the program with ctrl+C
except KeyboardInterrupt:
    print("Script interrupted by user. Exiting...")

Grayscales all images in the scraped_images folder, creates the grayscaled directory, and removes small images.


In [None]:
def gray_scale_large(curr_path="scraped_images", new_path="scraped_images_grayscaled_big", min_pixels=2500):
    """Grayscale all images in curr_path and store the large ones in new_path.

    Args:
        curr_path: Directory containing the scraped images.
        new_path: Directory the grayscaled images are written to; it is
            created when it does not exist yet.
        min_pixels: An image is only stored when its pixel count
            (height * width) is greater than this value. The default of 2500
            is the area of a 50x50 image.

    Returns:
        None. Files are written to new_path under their original names.
    """
    # get image directory
    imgs = os.listdir(curr_path)
    os.makedirs(new_path, exist_ok=True)

    # loop over all images in directory, grayscales and removes small images. then stores them
    for img_name in imgs:
        src_file = os.path.join(curr_path, img_name)
        dst_file = os.path.join(new_path, img_name)

        # cv2.imread returns None instead of raising when the file cannot be
        # read as an image; skip it rather than crash in cvtColor.
        img = cv2.imread(src_file)
        if img is None:
            print(f"Skipping unreadable file {src_file}")
            continue

        # grayscale image
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # img.size is the number of pixels of the single-channel image
        if img.size > min_pixels:
            cv2.imwrite(dst_file, img)

# Run the grayscale step with its default folders
# ("scraped_images" -> "scraped_images_grayscaled_big").
gray_scale_large()