In [1]:
import asyncio
import functools
import os
import time
from urllib.parse import urlencode

import aiohttp
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer


In [2]:
# Parse-only filter handed to BeautifulSoup in get_page: keeps just the <div>
# elements that carry a data-asin attribute (True matches any attribute value).
product_divs = SoupStrainer("div", attrs={"data-asin": True})

def create_urls(search, pages=20):
    """Build the Amazon search-result URLs for the first ``pages`` result pages.

    Args:
        search: Search phrase; it is URL-encoded into the ``k`` query parameter.
        pages: How many consecutive result pages to generate, starting at
            page 1. Defaults to 20, the count that used to be hard-coded.

    Returns:
        A list of ``pages`` URL strings ordered by page number (empty when
        ``pages`` is 0 or negative).
    """
    base_url = "https://www.amazon.com/s?"
    return [
        # urlencode handles quoting of spaces and special characters in the term.
        base_url + urlencode({"k": search, "page": page})
        for page in range(1, pages + 1)
    ]


async def get_page(session, url):
    """Download one search-results page and return the products parsed from it.

    Args:
        session: Client session used to issue the GET request.
        url: Address of the results page to download.

    Returns:
        Whatever ``get_discounts`` produces for the parsed page.
    """
    # NOTE(review): ssl=False switches off TLS certificate verification for
    # this request -- confirm that is intentional before reusing this elsewhere.
    async with session.get(url, ssl=False) as resp:
        html = await resp.text()
        # parse_only restricts the tree to the elements selected by product_divs.
        soup = BeautifulSoup(html, "lxml", parse_only=product_divs)
        return get_discounts(soup)


async def gather_pages(urls):
    """Fetch and parse every URL concurrently over one shared client session.

    Args:
        urls: Iterable of page URLs; each is passed to ``get_page``.

    Returns:
        A list holding one ``get_page`` result per URL, in the order of ``urls``.
    """
    # Browser-like request headers sent with every request of the session.
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
    }
    async with aiohttp.ClientSession(headers=browser_headers) as session:
        # gather() schedules each coroutine as a task and keeps argument order.
        return await asyncio.gather(*(get_page(session, url) for url in urls))


In [3]:
def valid_product(tag):
    """Tell whether ``tag`` should be treated as a product entry.

    A tag qualifies when it has a non-empty ``data-asin`` attribute and does
    not contain a ``div`` with class ``rush-component``.

    Args:
        tag: Element to test.

    Returns:
        ``True`` if the tag qualifies, ``False`` otherwise.
    """
    if not tag.has_attr("data-asin"):
        return False
    if tag["data-asin"] == "":
        return False
    # Any nested rush-component div disqualifies the tag.
    return not tag.find("div", class_="rush-component")


def get_price(tag):
    """Return the price text held in the tag's first ``a-offscreen`` span.

    Args:
        tag: Element to search; it must offer a BeautifulSoup-style ``find``.

    Returns:
        The span's text with surrounding whitespace stripped, or ``None`` when
        the tag contains no ``span`` with class ``a-offscreen``.
    """
    offscreen = tag.find("span", class_="a-offscreen")
    # find() yields None when nothing matches; report "no price" the same way
    # the sibling getters do instead of raising AttributeError on None.
    if offscreen is None:
        return None
    return offscreen.get_text(strip=True)


def get_original_price(tag):
    """Return the price text of the tag's ``a-price a-text-price`` span.

    Args:
        tag: Element to search.

    Returns:
        The result of ``get_price`` on the first matching span, or ``None``
        when the tag has no such span.
    """
    list_price_span = tag.find("span", class_="a-price a-text-price")
    if not list_price_span:
        return None
    return get_price(list_price_span)


def get_discounted_price(tag):
    """Return the price text of the tag's first ``a-price`` span.

    Args:
        tag: Element to search.

    Returns:
        The result of ``get_price`` on the first matching span, or ``None``
        when the tag has no such span.
    """
    # NOTE(review): a single-class class_ filter also matches spans that carry
    # extra classes (e.g. "a-price a-text-price"), so this presumably relies on
    # the current-price span coming first in the markup -- confirm.
    current_price_span = tag.find("span", class_="a-price")
    if not current_price_span:
        return None
    return get_price(current_price_span)


def get_description(tag):
    """Return the stripped text of the tag's first ``a-text-normal`` span.

    Args:
        tag: Element to search.

    Returns:
        The span text with surrounding whitespace removed, or ``None`` when
        the tag has no such span.
    """
    title_span = tag.find("span", class_="a-text-normal")
    return title_span.get_text(strip=True) if title_span else None


def get_product_link(tag):
    """Return the product's href with its final path segment removed.

    The href is read from the parent element of the tag's first
    ``a-text-normal`` span.

    Args:
        tag: Element to search.

    Returns:
        The href up to (not including) its last ``/``, or ``None`` when the
        span is missing, its parent has no ``href`` attribute, or the href
        starts with ``/sspa``.
    """
    span = tag.find("span", class_="a-text-normal")
    if span is None:
        return None

    link = span.parent.get("href")
    # .get() returns None when the parent carries no href; slicing None used
    # to raise TypeError here, so treat it as "no link".
    if link is None:
        return None
    # hrefs under /sspa are skipped (presumably sponsored-placement redirects
    # rather than product pages -- confirm).
    if link.startswith("/sspa"):
        return None
    # Drop everything after the last "/" (e.g. a trailing ref=... segment).
    return link.rsplit("/", 1)[0]


def get_discounts(soup):
    """Collect one record per valid product element found in ``soup``.

    Elements are selected with ``valid_product``; any element for which
    ``get_description`` returns nothing is skipped.

    Args:
        soup: Parsed document (or fragment) to search.

    Returns:
        A list of dicts with the keys ``ASIN``, ``Description``, ``link``,
        ``old_price`` and ``current_price``, in document order.
    """
    records = []
    for product_div in soup.find_all(valid_product):
        description = get_description(product_div)
        if description:
            records.append(
                {
                    "ASIN": product_div["data-asin"],
                    "Description": description,
                    "link": get_product_link(product_div),
                    "old_price": get_original_price(product_div),
                    "current_price": get_discounted_price(product_div),
                }
            )
    return records


In [4]:
# Search phrase for this run; later cells build the result-page URLs from it
# and echo it in the summary line.
search = "atx motherboard"

In [5]:
# Time the concurrent fetch-and-parse of every result page for the search term.
start = time.perf_counter()
# Top-level await: only valid where the notebook/IPython kernel supplies a running event loop.
res = await gather_pages(create_urls(search))  # one list of product dicts per page
end = time.perf_counter()
print(f"Took: {end - start :.4f}sec")


Took: 6.2470sec


In [6]:

# Flatten the per-page result lists into a single list of product dicts.
products = [item for page in res for item in page]

print(f"{len(products)} Products Discovered for {search}")

306 Products Discovered for atx motherboard


In [7]:
# Preview the first five scraped product records.
products[:5]

[{'ASIN': 'B09GP7P1XS',
  'Description': 'Asus ROG Strix B550-F Gaming WiFi II AMD AM4 (3rd Gen Ryzen) ATX Motherboard (PCIe 4.0,WiFi 6E, 2.5Gb LAN, BIOS Flashback, HDMI 2.1, Addressable Gen 2 RGB Header and Aura Sync)',
  'link': '/ROG-B550-F-II-Motherboard-Addressable/dp/B09GP7P1XS',
  'old_price': '$189.99',
  'current_price': '$177.99'},
 {'ASIN': 'B089CT5GDM',
  'Description': 'ASUS Prime B550-PLUS AMD AM4 Zen 3 Ryzen 5000 & 3rd Gen Ryzen ATX Motherboard (PCIe 4.0, ECC Memory, 1Gb LAN, HDMI 2.1, DisPlayPort 1.2 (4K@60HZ), Addressable Gen 2 RGB Header and Aura Sync).',
  'link': '/ASUS-B550-PLUS-Motherboard-DisPlayPort-Addressable/dp/B089CT5GDM',
  'old_price': '$119.99',
  'current_price': '$99.99'},
 {'ASIN': 'B089CWDHFZ',
  'Description': 'MSI MAG B550 TOMAHAWK Gaming Motherboard (AMD AM4, DDR4, PCIe 4.0, SATA 6Gb/s, M.2, USB 3.2 Gen 2, HDMI/DP, ATX, AMD Ryzen 5000 Series processors)',
  'link': '/MSI-MAG-B550-TOMAHAWK-Motherboard/dp/B089CWDHFZ',
  'old_price': '$159.99',
  'cur

In [8]:
# Load the product dicts into a DataFrame (one row per product), then
# summarise the columns: dtypes and non-null counts.
df_products = pd.DataFrame(products) 
df_products.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ASIN           306 non-null    object
 1   Description    306 non-null    object
 2   link           306 non-null    object
 3   old_price      113 non-null    object
 4   current_price  254 non-null    object
dtypes: object(5)
memory usage: 12.1+ KB


In [9]:
# Show the first rows of the product table.
df_products.head()

Unnamed: 0,ASIN,Description,link,old_price,current_price
0,B09GP7P1XS,Asus ROG Strix B550-F Gaming WiFi II AMD AM4 (...,/ROG-B550-F-II-Motherboard-Addressable/dp/B09G...,$189.99,$177.99
1,B089CT5GDM,ASUS Prime B550-PLUS AMD AM4 Zen 3 Ryzen 5000 ...,/ASUS-B550-PLUS-Motherboard-DisPlayPort-Addres...,$119.99,$99.99
2,B089CWDHFZ,MSI MAG B550 TOMAHAWK Gaming Motherboard (AMD ...,/MSI-MAG-B550-TOMAHAWK-Motherboard/dp/B089CWDHFZ,$159.99,$119.99
3,B0BH7GTY9C,GIGABYTE B650 AORUS Elite AX AMD B650 ATX Moth...,/GIGABYTE-B650-Warranty-EZ-Latch-Motherboard/d...,$219.99,$184.99
4,B0BRQSWSFQ,MSI PRO B760-P WiFi DDR4 ProSeries Motherboard...,/MSI-ProSeries-Motherboard-Supports-Processors...,$159.99,$149.99
