In [2]:
!pip install beautifulsoup4 lxml html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.2/112.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: html5lib
Successfully installed html5lib-1.1


In [20]:
import json
from collections import Counter

from bs4 import BeautifulSoup

sitemap_file = "data/wood-database/post-sitemap.xml"
woods = {}
image_counts = Counter()

# Parse the XML sitemap file.
# The file is read as raw bytes so the XML parser can apply the encoding
# declared in the document itself instead of the platform's default text
# encoding (which differs between operating systems).
with open(sitemap_file, "rb") as f:
    soup = BeautifulSoup(f.read(), "xml")

for url in soup.find_all("url"):
    wood_page = url.find("loc").text
    # The wood name is the second-to-last "/"-separated piece of the page URL,
    # i.e. the last path segment when the URL ends with a trailing slash.
    wood_name = wood_page.split("/")[-2]
    images = [image.text.strip() for image in url.find_all("image:image")]
    woods[wood_name] = {
        "page": wood_page,
        "images": images
    }
    # Tally every occurrence of each image URL across the whole sitemap.
    image_counts.update(images)

# To inspect the images that are used more than once (most used first):
# for image, count in image_counts.most_common():
#     if count > 1:
#         print(f"{image} ({count})")

# Remove all images that are used more than once
for wood in woods.values():
    wood["images"] = [image for image in wood["images"] if image_counts[image] == 1]

# Remove all woods that have no images
woods = {wood_name: wood for wood_name, wood in woods.items() if len(wood["images"]) > 0}

# Save to json file
with open("data/wood-database/woods.json", "w", encoding="utf-8") as f:
    json.dump(woods, f, indent=2)

In [21]:
# Reload the wood index from the json file and report its size
import json

with open("data/wood-database/woods.json", "r") as f:
    woods = json.load(f)

# Tally the woods and the images listed under them
wood_total = len(woods)
image_total = 0
for entry in woods.values():
    image_total += len(entry["images"])

print(f"Number of woods: {wood_total}")
print(f"Number of images: {image_total}")


Number of woods: 497
Number of images: 1867


In [22]:
# For each wood, download the images and save them to a folder with the wood name.
#
# Images already present on disk are skipped, so the cell can be re-run to
# resume an interrupted download or to retry the failures it reports.
import os
import requests
from tqdm import tqdm

# Seconds to wait for the server before giving up on a single image; without
# a timeout a stalled connection would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# (image_url, reason) for every image that could not be fetched.
failed_downloads = []

# A single session reuses the underlying connection across requests.
with requests.Session() as session:
    for wood_name, wood in tqdm(woods.items()):
        wood_folder = os.path.join("data/wood-database/woods", wood_name)
        os.makedirs(wood_folder, exist_ok=True)
        for image_url in wood["images"]:
            image_name = image_url.split("/")[-1]
            image_path = os.path.join(wood_folder, image_name)
            if os.path.exists(image_path):
                continue
            try:
                response = session.get(image_url, timeout=REQUEST_TIMEOUT)
                # Reject 4xx/5xx answers so an error page is never stored
                # under an image file name.
                response.raise_for_status()
            except requests.RequestException as error:
                failed_downloads.append((image_url, str(error)))
                continue
            # Write to a temporary name and rename afterwards, so an
            # interrupted run cannot leave a truncated file that the
            # existence check above would later treat as complete.
            partial_path = image_path + ".part"
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, image_path)

if failed_downloads:
    print(f"{len(failed_downloads)} image(s) could not be downloaded:")
    for failed_url, reason in failed_downloads:
        print(f"  {failed_url}: {reason}")

100%|██████████| 497/497 [08:44<00:00,  1.05s/it]
