In [23]:
"""
Copied from [duckduckgo_search](https://github.com/deedy5/duckduckgo_search/blob/main/duckduckgo_search/duckduckgo_search.py)

Has to be included in-line because add-ons are weak.
"""

import json
import logging
import re
import time
from collections import deque
from dataclasses import dataclass
from datetime import datetime
from decimal import Decimal
from html import unescape
from itertools import cycle, islice
from pprint import pp
from random import choice
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set, Tuple, Union
from urllib.parse import unquote

import requests

logger = logging.getLogger(__name__)

REGEX_500_IN_URL = re.compile(r"[0-9]{3}-[0-9]{2}.js")
REGEX_STRIP_TAGS = re.compile("<.*?>")

USERAGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
]


class DDGS:
    """DuckDuckgo_search class to get search results from duckduckgo.com"""

    def __init__(self, headers=None, proxies=None, timeout=10) -> None:
        if headers is None:
            headers = {
                "User-Agent": choice(USERAGENTS),
                "Referer": "https://duckduckgo.com/",
            }
        self._session = requests.Session()
        self._session.headers.update(headers)
        if proxies:
            self._session.proxies.update(proxies)
        self._session.timeout = timeout

    def __enter__(self) -> "DDGS":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self._session.close()

    def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
        for i in range(3):
            try:
                if method.lower() == "get":
                    resp = self._session.get(url, **kwargs)
                else:  # for "POST"
                    resp = self._session.post(url, **kwargs)
                
                if self._is_500_in_url(str(resp.url)) or resp.status_code == 202:
                    raise requests.exceptions.HTTPError("")
                resp.raise_for_status()
                if resp.status_code == 200:
                    return resp
            except Exception as ex:
                logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
                pp(resp.request.__dict__)
                pp(resp.__dict__)

                if i >= 2 or "418" in str(ex):
                    raise ex
            sleep(3)
        return None


    def _get_vqd(self, keywords: str) -> Optional[str]:
        """Get vqd value for a search query."""
        resp = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
        if resp:
            for c1, c2 in (
                (b'vqd="', b'"'),
                (b"vqd=", b"&"),
                (b"vqd='", b"'"),
            ):
                try:
                    start = resp.content.index(c1) + len(c1)
                    end = resp.content.index(c2, start)
                    return resp.content[start:end].decode()
                except ValueError:
                    logger.warning(f"_get_vqd() keywords={keywords} vqd not found")
        return None

    def _is_500_in_url(self, url: str) -> bool:
        """something like '506-00.js' inside the url"""
        return bool(REGEX_500_IN_URL.search(url))

    def _normalize(self, raw_html: str) -> str:
        """strip HTML tags"""
        if raw_html:
            return unescape(re.sub(REGEX_STRIP_TAGS, "", raw_html))
        return ""

    def _normalize_url(self, url: str) -> str:
        """unquote url and replace spaces with '+'"""
        if url:
            return unquote(url).replace(" ", "+")
        return ""

    def images(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        size: Optional[str] = None,
        color: Optional[str] = None,
        type_image: Optional[str] = None,
        layout: Optional[str] = None,
        license_image: Optional[str] = None,
        num_results: int = 5,
    ) -> Iterator[Dict[str, Optional[str]]]:
        """DuckDuckGo images search. Query params: https://duckduckgo.com/params

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: Day, Week, Month, Year. Defaults to None.
            size: Small, Medium, Large, Wallpaper. Defaults to None.
            color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
                Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
            type_image: photo, clipart, gif, transparent, line.
                Defaults to None.
            layout: Square, Tall, Wide. Defaults to None.
            license_image: any (All Creative Commons), Public (PublicDomain),
                Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
                Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
                Use Commercially). Defaults to None.

        Yields:
            dict with image search results.

        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": 1, "off": -1}
        timelimit = f"time:{timelimit}" if timelimit else ""
        size = f"size:{size}" if size else ""
        color = f"color:{color}" if color else ""
        type_image = f"type:{type_image}" if type_image else ""
        layout = f"layout:{layout}" if layout else ""
        license_image = f"license:{license_image}" if license_image else ""

        payload = {
            "l": region,
            "o": "json",
            "s": 0,
            "q": keywords,
            "vqd": vqd,
            "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
            "p": safesearch_base[safesearch.lower()],
        }

        cache = set()

        i = 0
        while i < num_results:
            # resp = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
            resp = requests.get("https://duckduckgo.com/i.js", params=payload)
            if resp is None:
                break
            elif resp.status_code != 200:
                raise Exception(f"status_code={resp.status_code}")
            try:
                resp_json = resp.json()
            except Exception:                
                break
            page_data = resp_json.get("results", None)
            if page_data is None:
                break

            result_exists = False
            for row in page_data:
                image_url = row.get("image", None)
                if image_url and image_url not in cache:
                    cache.add(image_url)
                    result_exists = True
                    yield {
                        "title": row["title"],
                        "image": self._normalize_url(image_url),
                        "thumbnail": self._normalize_url(row["thumbnail"]),
                        "url": self._normalize_url(row["url"]),
                        "height": row["height"],
                        "width": row["width"],
                        "source": row["source"],
                    }
                    i += 1

            next = resp_json.get("next", None)

            if next:
                payload["s"] = next.split("s=")[-1].split("&")[0]
            if next is None or result_exists is False:
                break

    
def get_images(keywords: str, num_images: int) -> list:
    with DDGS() as ddgs:
        return [r["url"] for r in ddgs.images(
            keywords,
            region="en",
            safesearch="Off",   
            size=None,
            type_image=None,
            layout=None,
            license_image=None,
            num_results=num_images
        )]       
    

In [24]:
get_images("hat", 5)

Exception: status_code=403