# Scrapping Amazon Data for Sony Headphones

### Importing Libraries

In [65]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import datetime
import numpy as np

### Connecting to website and pulling the data

In [66]:
URL = "https://www.amazon.in/s?k=headphones&rh=n%3A976419031%2Cp_123%3A237204&dc&ds=v1%3ASvjauA1e6BezhxEzPlsjSqYx6fpv9Sa5vXUUbzRh3JE&crid=2LITCFDFEKL0P&qid=1733751999&rnid=91049095031&sprefix=headphone%2Caps%2C193&ref=sr_nr_p_123_4"
Header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"}
page = requests.get(URL, headers=Header)
page


<Response [200]>

In [67]:
soup1 = BeautifulSoup(page.content, "html.parser")
soup = BeautifulSoup(soup1.prettify(), "html.parser")

## Now let's make functions to make our work efficient and less redundant

#### Function to fetch HTML content and return beautifulSoup object

In [68]:
def get_soup(url):
    response = requests.get(url, headers=Header)
    return BeautifulSoup(response.content, "html.parser")

#### Function to fetch links of the products from one page

In [69]:
BASE_URL = "https://www.amazon.in/"

# To store all product links
all_product_links = []

In [70]:
def extract_product_links(soup):
    product_links = []
    product_tags = soup.find_all("a", class_="a-link-normal s-no-outline")
    
    for tag in product_tags:
        href = tag.get("href")
        if href:
            # Append the base URL to create a full link
            product_links.append(BASE_URL + href)
    
    return product_links

#### Function to fetch the links of next page, if available

In [71]:
def get_next_page_url(soup):
    next_button = soup.find("a", class_="s-pagination-next")
    if next_button and "href" in next_button.attrs:
        # Return the full URL for the next page
        return BASE_URL + next_button["href"]
    return None

#### Function to scrape links from each page, one by one and adding it to one common list so that we can fetch data from each product by looping through that variable

In [72]:
def scrape_all_pages(start_url):
    current_url = start_url
    while current_url:
        print(f"Scraping: {current_url}")
        
        # Get the soup object for the current page
        soup = get_soup(current_url)
        
        # Extract product links and add them to the master list
        all_product_links.extend(extract_product_links(soup))
        
        # Find the next page URL, if available
        current_url = get_next_page_url(soup)
        
        # Wait to avoid overloading the server
        time.sleep(2)


### Running the scrape_all_pages(URL) function to get the links of all the products

In [73]:
scrape_all_pages(URL)

Scraping: https://www.amazon.in/s?k=headphones&rh=n%3A976419031%2Cp_123%3A237204&dc&ds=v1%3ASvjauA1e6BezhxEzPlsjSqYx6fpv9Sa5vXUUbzRh3JE&crid=2LITCFDFEKL0P&qid=1733751999&rnid=91049095031&sprefix=headphone%2Caps%2C193&ref=sr_nr_p_123_4
Scraping: https://www.amazon.in//s?k=headphones&i=electronics&rh=n%3A976419031%2Cp_123%3A237204&dc&page=2&crid=2LITCFDFEKL0P&qid=1733765301&rnid=91049095031&sprefix=headphone%2Caps%2C193&ref=sr_pg_1
Scraping: https://www.amazon.in//s?k=headphones&i=electronics&rh=n%3A976419031%2Cp_123%3A237204&dc&page=3&crid=2LITCFDFEKL0P&qid=1733765305&rnid=91049095031&sprefix=headphone%2Caps%2C193&ref=sr_pg_2
Scraping: https://www.amazon.in//s?k=headphones&i=electronics&rh=n%3A976419031%2Cp_123%3A237204&dc&page=4&crid=2LITCFDFEKL0P&qid=1733765309&rnid=91049095031&sprefix=headphone%2Caps%2C193&ref=sr_pg_3
Scraping: https://www.amazon.in//s?k=headphones&i=electronics&rh=n%3A976419031%2Cp_123%3A237204&dc&page=5&crid=2LITCFDFEKL0P&qid=1733765312&rnid=91049095031&sprefix=hea

In [74]:
# all_product_links

#### Now, we have links to all the product in the headphoen category for Sony Brand in India. Using this list, let's get the details of each product like - Name, Original MRP, MRP after Discount, Ratings, No of People who rated, sold by

#### Function to fetch details about each product based on the links stored together 

#### To handle NoneObject type error we added the try and except to the function

In [82]:
def fetch_product_details(product_url):
    try:
        # Fetch the page content
        soup = get_soup(product_url)
        
        # Extract required details using CSS selectors
        product_details = {}

        # Link of the Product
        product_details["Product Link"] = product_url
        
        # Name of the product
        product_name = soup.find("span", {"id": "productTitle"})
        product_details["Name"] = product_name.get_text(strip=True) if product_name else np.nan

        # Original MRP (if available)
        original_mrp = soup.find("span", class_="a-price a-text-price")
        product_details["Original MRP"] = (
            original_mrp.find("span", class_="a-offscreen").get_text(strip=True)
            if original_mrp and original_mrp.find("span", class_="a-offscreen")
            else np.nan
        )

        # MRP after discount
        discounted_mrp = soup.find("span", class_="a-price-whole")
        product_details["MRP After Discount"] = (
            discounted_mrp.get_text(strip=True) if discounted_mrp else np.nan
        )

        # Ratings
        rating_tag = soup.find("a", class_="a-popover-trigger a-declarative")
        product_details["Ratings"] = (
            rating_tag.find("span", class_="a-size-base a-color-base").get_text(strip=True)
            if rating_tag and rating_tag.find("span", class_="a-size-base a-color-base")
            else np.nan
        )

        # Number of people who rated
        num_ratings = soup.find("span", {"id": "acrCustomerReviewText"})
        product_details["Number of Ratings"] = (
            num_ratings.get_text(strip=True) if num_ratings else np.nan
        )

        # Ratings link
        rating_link = soup.find("a", {"id": "acrCustomerReviewLink"})
        product_details["Ratings Link"] = (
            product_url.split('?')[0] + rating_link["href"] if rating_link else np.nan
        )

        # Sold by
        sold_by_tag = soup.find("a", {"id": "sellerProfileTriggerId"})
        product_details["Sold By"] = (
            sold_by_tag.get_text(strip=True) if sold_by_tag else np.nan
        )

        return product_details

    except Exception as e:
        # Handle exceptions and return error message for the product
        return {"Error": f"Failed to fetch product details: {str(e)}"}


#### Function to fetch all the data from the links, this function will call the fetch_product_details() function to get all the product data

In [77]:
def fetch_all_products_details(all_product_links):
    all_products_data = []
    
    for index, product_url in enumerate(all_product_links):
        print(f"Fetching product {index + 1} of {len(all_product_links)}: {product_url}")
        product_data = fetch_product_details(product_url)
        all_products_data.append(product_data)
        
        # Add a polite delay between requests
        time.sleep(2)
    
    return all_products_data

##### The above function is responsible for iterating through a list of product URLs 

calling fetch_all_product_details() functiona nd passing the list of links of products to fetch the information

In [78]:
all_products_data = fetch_all_products_details(all_product_links)

Fetching product 1 of 97: https://www.amazon.in//b?node=20930687031&pd_rd_w=6xUH4&content-id=amzn1.sym.05f2af66-0e0d-4841-9496-1d7a185265a8:amzn1.sym.05f2af66-0e0d-4841-9496-1d7a185265a8&pf_rd_p=05f2af66-0e0d-4841-9496-1d7a185265a8&pf_rd_r=60YQ1QXDEHVJ7QJ35HDX&pd_rd_wg=AmyKM&pd_rd_r=e29c3f68-963f-4c7e-afaa-07270655ab0d&qid=1733765301&ref_=sxts_spks_0_0_05f2af66-0e0d-4841-9496-1d7a185265a8
Fetching product 2 of 97: https://www.amazon.in//Sony-MDR-ZX110A-Stereo-Headphones-without/dp/B00KGZZ824/ref=sr_1_1?crid=2LITCFDFEKL0P&dib=eyJ2IjoiMSJ9.znjszWxIDYu_N_NGL7gzq-AWX8iQ9EabBuFHNI-PbwqN4u2NSc1E1-nUeDdfrH2uOnSZf3LNYJk1_ph29ZSW2awotDyYwEMggeeYKcQG3b3sjwIBN1q0bQWj_g5LloFRYY0LL5caMLgh8i7KnKi9DvBqw2yF28Lv4A6hBcqWtYRfw1Yt5Rvete0V704aU1whrzolN51P182MaLrXOxd4m8vABb1Q8QYPxvmY5UuyCqVh717X5kgRq4b6YJZB2EtyQmCh0duYOY5l5Epqs2ZJ9A12zP0K8b4sbEo8YClrdGE.Xu6qrmbIgaHZCwfPBkHDK5bO5dUWC_7BX1mj7DWZabI&dib_tag=se&keywords=headphones&qid=1733765301&refinements=p_123%3A237204&rnid=91049095031&s=electronics&sprefix=

#### Converting the product data to a dataframe for better view and understanding

In [79]:
product_details = pd.DataFrame(all_products_data)

In [80]:
product_details.head()

Unnamed: 0,Product Link,Name,Original MRP,MRP After Discount,Ratings,Number of Ratings,Ratings Link,Sold By
0,https://www.amazon.in//b?node=20930687031&pd_r...,,,,,,,
1,https://www.amazon.in//Sony-MDR-ZX110A-Stereo-...,Sony MDR-ZX110A On-Ear Stereo Headphones (Whit...,"₹1,390",750.0,4.0,"65,826 ratings",https://www.amazon.in#customerReviews,Clicktech Retail Private Ltd
2,https://www.amazon.in//Sony-Bluetooth-Headphon...,"Sony WH-CH520, Wireless On-Ear Bluetooth Headp...","₹5,990",4488.0,4.2,"15,950 ratings",https://www.amazon.in#customerReviews,Clicktech Retail Private Ltd
3,https://www.amazon.in//Sony-Mdr-Zx310Ap-Wired-...,Sony Mdr-Zx310Ap Wired On Ear Headphones with ...,"₹2,190",1161.0,4.0,333 ratings,https://www.amazon.in#customerReviews,Clicktech Retail Private Ltd
4,https://www.amazon.in//Sony-Bluetooth-Headphon...,"Sony WH-CH520, Wireless On-Ear Bluetooth Headp...","₹5,990",4269.0,4.2,"15,950 ratings",https://www.amazon.in#customerReviews,Clicktech Retail Private Ltd
