# **Scraping Amazon Products Data**

In [1]:
# important libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import random

## Making a Request to the Search Page and Fetching Each Product Sequentially

Defining function for extracting product data like Product Title, Price Reviews, Rating etc

In [2]:
# Function to extract Title of Product
def get_title(s):

  try:
    title = s.find('span', attrs={'id' : 'productTitle'}).text.strip()

  except:
    title = ''

  return title

# Function to extract Product Price
def get_price(s):

  try:
    price = s.find('span', attrs={'class' : 'aok-offscreen'}).text.strip()

  except:
    price = "N/A"

  return price

# Function to extract Product Brand Name
def get_brand(s):

  try:
    brand_str = s.find('a', attrs={'id' : 'bylineInfo'}).text
    brand_name = brand_str.replace('Visit the ', '')

  except:
    brand_name = "N/A"

  return brand_name

# Function to extract Bought
def get_bought(s):

  try:
    tag = s.find_all("span", id='social-proofing-faceout-title-tk_bought')[0]
    bought = tag.find('span', class_='a-text-bold').get_text(strip=True)

  except:
    bought = 'N/A'

  return bought

# Function to extract Number of Reviews
def get_review_count(s):

  try:
    review_count = s.find('span', attrs={'id': 'acrCustomerReviewText'}).text

  except:
    review_count = 'N/A'

  return review_count

# Function to extract Rating
def get_rating(s):

  try:
    rating = s.find('span', attrs={'class' : 'reviewCountTextLinkedHistogram'}).get('title')

  except:
    rating = 'N/A'

  return rating

In [5]:
# Function to get data from Amazon
def get_data(url):
  headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
      "Referer": "https://www.google.com/",
  }

  webpage = requests.get(url, headers=headers)

  soup = BeautifulSoup(webpage.content, 'html.parser')

  products = soup.find_all('div', {'data-component-type' : 's-search-result'})

  d = {'asin' : [], 'product_url' : [], 'title' : [], 'price' : [], 'brand_name' : [], 'bought' : [], 'review_count' : [], 'rating' : []}

  for item in products :
    d['asin'].append(item.get('data-asin'))

  for i in d['asin']:
    d['product_url'].append('http://www.amazon.com/dp/' + i)

  for URL in d['product_url']:
    product_webpage = requests.get(URL, headers=headers)

    product_soup = BeautifulSoup(product_webpage.content, 'html.parser')

    d['title'].append(get_title(product_soup))
    d['price'].append(get_price(product_soup))
    d['brand_name'].append(get_brand(product_soup))
    d['bought'].append(get_bought(product_soup))
    d['review_count'].append(get_review_count(product_soup))
    d['rating'].append(get_rating(product_soup))

  df = pd.DataFrame(d)

  return df

url = 'https://www.amazon.com/s?k=stainless+steel+water+bottles&page=1' # Target URL

df = get_data(url)

In [9]:
df.shape

(48, 8)

In [10]:
df.sample(10)

Unnamed: 0,asin,product_url,title,price,brand_name,bought,review_count,rating
36,B0BVZD4WFN,http://www.amazon.com/dp/B0BVZD4WFN,Zak Designs Minecraft Water Bottle for Travel ...,$14.99 with 38 percent savings,Zak Designs Store,1K+ bought,"2,991 ratings",4.7 out of 5 stars
39,B09RNBRTQ6,http://www.amazon.com/dp/B09RNBRTQ6,"Contigo Jackson Chill 2.0, Vacuum-Insulated St...",$17.99,Contigo Store,800+ bought,"5,159 ratings",4.5 out of 5 stars
43,B0CFP9YQ2M,http://www.amazon.com/dp/B0CFP9YQ2M,Stanley IceFlow Fast Flow Water Bottle | Angle...,,STANLEY Store,,"5,668 ratings",4.5 out of 5 stars
47,B0BZZT7D5V,http://www.amazon.com/dp/B0BZZT7D5V,"Insulated Water Bottle 64 oz, Triple Wall Vacu...",$29.99,RAYMYLO Store,2K+ bought,"12,200 ratings",4.7 out of 5 stars
1,B0BZYC7N7X,http://www.amazon.com/dp/B0BZYC7N7X,Owala FreeSip Insulated Stainless Steel Water ...,$29.99,Owala Store,6K+ bought,"92,047 ratings",4.7 out of 5 stars
4,B0D8J2ZB8P,http://www.amazon.com/dp/B0D8J2ZB8P,POWCAN 26 oz Insulated Water Bottle with 2-in-...,$15.99 with 6 percent savings,POWCAN Store,10K+ bought,"5,095 ratings",4.6 out of 5 stars
16,B09LLZNJYV,http://www.amazon.com/dp/B09LLZNJYV,CIVAGO 32 oz Insulated Water Bottle With Straw...,$14.24 with 16 percent savings,CIVAGO Store,3K+ bought,"10,179 ratings",4.6 out of 5 stars
26,B072QRW9KH,http://www.amazon.com/dp/B072QRW9KH,Takeya Actives Insulated Stainless Steel Water...,$26.91 with 23 percent savings,Takeya Store,600+ bought,"30,921 ratings",4.7 out of 5 stars
28,B07YXM6XTF,http://www.amazon.com/dp/B07YXM6XTF,Hydro Flask Water Bottle - Insulated Stainless...,$35.26 with 22 percent savings,Hydro Flask Store,1K+ bought,"28,173 ratings",4.7 out of 5 stars
37,B09P2P3D1Z,http://www.amazon.com/dp/B09P2P3D1Z,MEWAY 17oz Sport Water Bottle 8 Pack Vacuum In...,$49.99 with 23 percent savings,MEWAY Store,,"1,962 ratings",4.6 out of 5 stars


---

## Fetching Product Data from the Search Page

Requesting each product's page and scraping it is time-consuming. The search results page often already contains key information (title, price, review count, rating, etc.), so scraping directly from the search page lets us avoid making extra requests for each product and also speeds up the process.

In [11]:
# Function to extract ASIN
def get_asin_url(s):
  try:
    asin = s.get('data-asin')
    url = 'https://www.amazon.com/dp/' + asin

  except:
    asin = 'N/A'
    url = 'N/A'

  return [asin, url]

# Function to extract Title of Product
def get_title(s):

  try:
    title = s.find('h2', class_='a-size-base-plus a-spacing-none a-color-base a-text-normal').text

  except:
    title = 'N/A'

  return title

# Function to extract Product Price
def get_price(s):

  try:
    price = s.find('span', attrs={'class' : 'a-offscreen'}).text.strip()

  except:
    price = "N/A"

  return price

# Function to extract Bought
def get_bought(s):

  try:
    bought = s.find('span', attrs={'class' : 'a-size-base a-color-secondary'}).text

  except:
    bought = 'N/A'

  return bought

# Function to extract Number of Reviews
def get_review_count(s):

  try:
    review_count = s.find('span', attrs={'class': 'a-size-mini puis-normal-weight-text s-underline-text'}).text.strip("()")

  except:
    review_count = 'N/A'

  return review_count

# Function to extract Ratings
def get_rating(s):

  try:
    rating = s.find('span', attrs={'class' : 'a-size-small a-color-base'}).text

  except:
    rating = 'N/A'

  return rating

In [12]:
def product_data(url):
  """Fetch product data from Amazon search results page."""
  headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
      "Referer": "https://www.google.com/",
  }

  webpage = requests.get(url, headers=headers)

  soup = BeautifulSoup(webpage.content, 'html.parser')

  products = soup.find_all('div', {'data-component-type' : "s-search-result"})

  d = {'asin' : [], 'product_url' : [], 'title' : [], 'price' : [], 'bought' : [], 'review_count' : [], 'rating' : []}

  for product in products:
    asin, url = get_asin_url(product)
    d['asin'].append(asin)
    d['product_url'].append(url)
    d['title'].append(get_title(product))
    d['price'].append(get_price(product))
    d['bought'].append(get_bought(product))
    d['review_count'].append(get_review_count(product))
    d['rating'].append(get_rating(product))

  df = pd.DataFrame(d)

  return df

# calling function
url = 'https://www.amazon.com/s?k=stainless+steel+water+bottles&page=1'
df = product_data(url)

In [18]:
df.sample(10)

Unnamed: 0,asin,product_url,title,price,bought,review_count,rating
54,B0FJ2X2MQ6,https://www.amazon.com/dp/B0FJ2X2MQ6,Encool Insulated Stainless Steel Water Bottle ...,$18.04,600+ bought in past month,143,4.9
9,B084PVQGH1,https://www.amazon.com/dp/B084PVQGH1,BlenderBottle Strada Shaker Cup Insulated Stai...,$19.97,10K+ bought in past month,16.5K,4.6
8,B0D2W1MKZX,https://www.amazon.com/dp/B0D2W1MKZX,"24 oz Insulated Water Bottle with Handle, 304 ...",$12.99,2K+ bought in past month,583,4.6
17,B0F9FV6GJJ,https://www.amazon.com/dp/B0F9FV6GJJ,[2-IN-1 LID] Insulated Water Bottle with Silic...,$8.98,5K+ bought in past month,515,4.6
24,B08J48WD4J,https://www.amazon.com/dp/B08J48WD4J,ThermoFlask Bottle with Spout Lid - Stainless ...,$35.99,3K+ bought in past month,13,4.8
37,B07MBBP71L,https://www.amazon.com/dp/B07MBBP71L,"Bambaw Stainless Steel Water Bottles 32 oz, No...",$17.95,800+ bought in past month,9.8K,4.3
34,B0DRWLLLM9,https://www.amazon.com/dp/B0DRWLLLM9,CamelBak Thrive Chug Insulated Stainless Steel...,$32.99,50+ bought in past month,238,4.7
5,B085DVHQ57,https://www.amazon.com/dp/B085DVHQ57,Owala FreeSip Insulated Stainless Steel Water ...,$32.99,10K+ bought in past month,92K,4.7
16,B0F9TDHNCJ,https://www.amazon.com/dp/B0F9TDHNCJ,Volhoply 32oz Stainless Steel Water Bottle wit...,$16.99,100+ bought in past month,74,4.4
26,B07Z4L5VB9,https://www.amazon.com/dp/B07Z4L5VB9,Triple Tree 26OZ Vacuum Insulated Stainless St...,$16.99,300+ bought in past month,597,4.4


## Looping through multiple pages

In [19]:
# Function to get data from multiple pages
def product_data(base_url):
  """This function fetches product data from multiple pages."""

  # List of User Agents
  user_agents = [
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/141.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
      "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
  ]

  # Randomly select a User Agent
  headers = {
      "User-Agent": random.choice(user_agents),
      "Accept-Language": "en-US,en;q=0.9",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
      "Referer": "https://www.google.com/",
  }

  # Empty DataFrame to store results
  df = pd.DataFrame()

  # loop through pages 1 to 5
  for i in range(1,6):
    page_url = base_url.format(i)

    webpage = requests.get(page_url, headers=headers)

    soup = BeautifulSoup(webpage.content, 'html.parser')

    products = soup.find_all('div', {'data-component-type' : "s-search-result"})

    d = {'asin' : [], 'product_url' : [], 'title' : [], 'price' : [], 'bought' : [], 'review_count' : [], 'rating' : []} # Dictionary to store data

    # loop throught the page and extract the each product details
    for product in products:
      asin, url = get_asin_url(product)
      d['asin'].append(asin)
      d['product_url'].append(url)
      d['title'].append(get_title(product))
      d['price'].append(get_price(product))
      d['bought'].append(get_bought(product))
      d['review_count'].append(get_review_count(product))
      d['rating'].append(get_rating(product))

    # temporary DataFrame for each page
    temp_df = pd.DataFrame(d)

    print(f'extracted page {i} with {len(products)} products!')

    # concatenate temp_df to main df
    df = pd.concat([df, temp_df], ignore_index=True)

    time.sleep(random.uniform(2, 5)) # random sleep

  return df # return the final DataFrame

In [28]:
# calling function
url = "https://www.amazon.com/s?k=panty+liners&page={}"
df = product_data(url)
print(df.shape)

extracted page 1 with 60 products!
extracted page 2 with 60 products!
extracted page 3 with 60 products!
extracted page 4 with 60 products!
extracted page 5 with 60 products!
(300, 7)


In [31]:
df.head(10)

Unnamed: 0,asin,product_url,title,price,bought,review_count,rating
0,B09B7L2BLJ,https://www.amazon.com/dp/B09B7L2BLJ,"Amazon Basics Daily Pantiliner, Regular Length...",$3.38,10K+ bought in past month,15.1K,4.2
1,B013AX7QJQ,https://www.amazon.com/dp/B013AX7QJQ,Veeda Natural Cotton Ultra Thin Panty Liners f...,$9.49,300+ bought in past month,2.5K,4.1
2,B09FD1B6JD,https://www.amazon.com/dp/B09FD1B6JD,SANDIS Premium 100% Organic Panty Liners - Che...,$8.99,2K+ bought in past month,599,4.2
3,B0F1TMC7D3,https://www.amazon.com/dp/B0F1TMC7D3,"MODAL Panty Liners for Women, Unscented Sanita...",$5.99,($0.60$0.60/count),75,4.4
4,B00NJNJ6O6,https://www.amazon.com/dp/B00NJNJ6O6,"Carefree Panty Liners for Women, Regular, Unwr...",$6.27,20K+ bought in past month,29.8K,4.7
5,B003VD5TMM,https://www.amazon.com/dp/B003VD5TMM,"Always Daily Fresh Thin Liners, Regular Absorb...",$8.99,40K+ bought in past month,38.5K,4.6
6,B0FSHPJH79,https://www.amazon.com/dp/B0FSHPJH79,Equate Overnight Ultra Thin Pads Liners Size 4...,$24.95,New on Amazon in past month,,
7,B00UASJJX6,https://www.amazon.com/dp/B00UASJJX6,"U by Kotex Balance Wrapped Panty Liners, Regul...",$3.50,20K+ bought in past month,32.6K,4.6
8,B09B7L2BLJ,https://www.amazon.com/dp/B09B7L2BLJ,"Amazon Basics Daily Pantiliner, Regular Length...",$3.38,10K+ bought in past month,15.1K,4.2
9,B01ARVVTYW,https://www.amazon.com/dp/B01ARVVTYW,"Carefree Panty Liners for Women, Regular, Wrap...",$3.34,30K+ bought in past month,9.8K,4.6


In [32]:
df.to_csv('amazon_panty_liners.csv')