In [1]:
from selenium import webdriver

# Build a headless Chrome session. Every flag is listed once below and applied
# in order; the two commented-out entries are kept as disabled options.
chrome_options = webdriver.ChromeOptions()

for chrome_flag in (
    "--headless=new",
    # "--no-sandbox",
    # "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    # Override the user-agent string reported by the browser.
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/128.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

# Shared browser instance used by the scraping cell further down.
driver = webdriver.Chrome(options=chrome_options)

In [2]:
import os
import time
import requests
from urllib.parse import urlparse, urlunparse
from bs4 import BeautifulSoup

In [3]:
# Request headers sent with every image download.
headers = {"User-Agent": "Mozilla/5.0"}

def download_image(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block the scrape
            indefinitely.

    Returns:
        True when the server answered with HTTP 200 and the body was written,
        False otherwise. Exceptions are caught and printed rather than raised,
        so one bad URL does not stop the calling loop.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code != 200:
            # Report the failure instead of silently returning False.
            print(f"Skipping {url}: HTTP {resp.status_code}")
            return False
        with open(save_path, "wb") as f:
            f.write(resp.content)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    return False

In [4]:
def remove_query_params(url):
    """Return ``url`` with its query string dropped.

    Scheme, host, path, path parameters and fragment are passed through
    unchanged; only the ``?key=value`` part is removed.
    """
    scheme, netloc, path, params, _query, fragment = urlparse(url)
    return urlunparse((scheme, netloc, path, params, "", fragment))

In [None]:
# --- Scrape configuration ---------------------------------------------------

# Category slugs; each one is inserted into the listing URL and also used as
# the name of its download sub-folder.
CATEGORIES = [
    "tabriz",
    "keshan",
    "gabbeh",
    "china",
    "berber",
    "afghan",
    "nain",
    "sarouk",
    "kerman",
    "isfahan",
]

# Upper bound on successfully downloaded images per category.
num_images_per_category = 100

# Root folder for the raw downloads; created up front if it is missing.
base_save_dir = "raw_dataset"
os.makedirs(base_save_dir, exist_ok=True)

In [None]:
# Walk the paginated listing of every category and download up to
# num_images_per_category images into raw_dataset/<category>/.
# The browser is shut down in `finally` so a failure part-way through does not
# leave a headless Chrome process running.
try:
    for category in CATEGORIES:
        print(f"Scraping category: {category}")
        save_dir = os.path.join(base_save_dir, category)
        os.makedirs(save_dir, exist_ok=True)

        downloaded_count = 0
        page = 1
        # Image URLs already attempted for this category. Prevents saving the
        # same picture twice when a URL is repeated within or across pages.
        seen_urls = set()

        while downloaded_count < num_images_per_category:
            url = f"https://www.rugvista.com/c/rugs/{category}?page={page}"
            driver.get(url)
            time.sleep(5)   # wait for JS to render

            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Containers are matched on this exact CSS class token.
            divs = soup.find_all("div", class_="[&>img]:aspect-[3/4]")

            if not divs:
                print(f"No more items found on page {page}")
                break

            new_urls_on_page = 0
            for div in divs:
                img_tag = div.find("img")
                if not (img_tag and img_tag.get("src")):
                    continue

                # Drop the query string before requesting the image.
                cdn_url = remove_query_params(img_tag["src"])
                if cdn_url in seen_urls:
                    continue
                seen_urls.add(cdn_url)
                new_urls_on_page += 1

                # Files are numbered by successful downloads, so failed
                # attempts leave no gaps in the sequence.
                img_name = f"{category}_{downloaded_count + 1}.jpg"
                save_path = os.path.join(save_dir, img_name)

                if download_image(cdn_url, save_path):
                    downloaded_count += 1
                    if downloaded_count >= num_images_per_category:
                        break

            if new_urls_on_page == 0:
                # The page only repeated URLs we have already tried; going on
                # could loop forever if the site keeps serving the same items.
                print(f"No new images found on page {page}")
                break

            page += 1
finally:
    driver.quit()

print("Scraping completed!")

Scraping category: tabriz
Scraping category: keshan
Scraping category: gabbeh
Scraping category: china
Scraping category: berber
Scraping category: afghan
Scraping category: nain
Scraping category: sarouk
Scraping completed!


### Pad images to make them square and resize to 1024x1024.

In [4]:
from PIL import Image

output_dir = "./dataset"
os.makedirs(output_dir, exist_ok=True)

target_size = 1024


def _pad_to_square(img, size, fill=(255, 255, 255)):
    """Scale ``img`` so its longer side equals ``size`` and centre it on a square canvas.

    Args:
        img: Source PIL image.
        size: Edge length, in pixels, of the square result.
        fill: RGB colour used for the padding (white by default).

    Returns:
        A new ``size`` x ``size`` RGB image holding the scaled picture.
    """
    w, h = img.size
    if w > h:
        new_w = size
        # Clamp to 1 px: truncation would give 0 for extremely wide images.
        new_h = max(1, int(h * size / w))
    else:
        new_h = size
        new_w = max(1, int(w * size / h))
    resized = img.resize((new_w, new_h), Image.BICUBIC)

    # Create square canvas and paste image centered
    canvas = Image.new("RGB", (size, size), fill)
    canvas.paste(resized, ((size - new_w) // 2, (size - new_h) // 2))
    return canvas


# Process every .jpg found in the sub-folders of base_save_dir. Results are
# written flat into output_dir under the original file name.
for sub in os.listdir(base_save_dir):
    subfolder = os.path.join(base_save_dir, sub)
    if not os.path.isdir(subfolder):
        continue
    for fname in os.listdir(subfolder):
        if not fname.lower().endswith(".jpg"):
            continue
        path = os.path.join(subfolder, fname)

        try:
            # The context manager releases the file handle once the pixel
            # data has been copied by convert().
            with Image.open(path) as src:
                img = src.convert("RGB")
        except OSError as e:
            # A ".jpg" file may not hold a decodable image; skip it rather
            # than abort the whole pass.
            print(f"Skipping unreadable image {path}: {e}")
            continue

        # Save
        out_path = os.path.join(output_dir, fname)
        _pad_to_square(img, target_size).save(out_path)