In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [4]:
# Function to extract Brand name
def get_brand(soup):
    """Return the brand listed on a product page, or "" when it is absent.

    Reads the text of the first <tr> whose class is
    'a-spacing-small po-brand'. That text presumably starts with the
    'Brand' label followed by the value (verify against a live page); only
    that leading label is removed, so brand names that themselves contain
    the word 'Brand' (e.g. 'Brandt') are returned intact.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The whitespace-stripped brand name, or "" if the row is missing.
    """
    try:
        row_text = soup.find("tr", attrs={'class': 'a-spacing-small po-brand'}).text.strip()
    except AttributeError:
        # find() returned None: the page has no brand row.
        return ""

    label = 'Brand'
    if row_text.startswith(label):
        # Drop the label once, at the front only; a blanket replace() would
        # also eat 'Brand' inside the brand name itself.
        row_text = row_text[len(label):]

    return row_text.strip()
    
# Function to extract Product Title
def get_title(soup):
    """Return the product title from a parsed product page.

    Looks up the <span> with id 'productTitle' and returns its text with
    surrounding whitespace removed.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The stripped title, or "" when the lookup raises AttributeError
        (for example because no such span exists and ``find`` returned None).
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the text of the first 'a-offscreen' span on the page.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The stripped span text (used by the caller as the price), or ""
        when the lookup raises AttributeError, e.g. no matching span exists.
    """
    try:
        price_tag = soup.find("span", attrs={'class': 'a-offscreen'})
        return price_tag.text.strip()
    except AttributeError:
        return ""

# Function to extract feature
def get_feature(soup):
    """Return the value displayed for the 'Filter Type' attribute.

    Finds the <span> whose string is exactly 'Filter Type', then takes the
    next <span> with class 'a-size-base handle-overflow' after it and
    returns that span's stripped text.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The stripped value text, or "" when either lookup step raises
        AttributeError (label or value span not found).
    """
    try:
        heading = soup.find('span', string='Filter Type')
        value_tag = heading.find_next('span', class_='a-size-base handle-overflow')
        filter_type = value_tag.text.strip()
    except AttributeError:
        return ""

    return filter_type

# Function to extract Product Rating
def get_rating(soup):
    """Return the rating text from a product page, or "" if none is found.

    Two lookups are tried in order and the first that yields a string wins:
      1. the <i> tag with class 'a-icon a-icon-star a-star-4-5';
      2. the first <span> with class 'a-icon-alt'.

    Only AttributeError (element missing, or its ``.string`` is None) is
    treated as "not found". Any other exception propagates instead of being
    silently swallowed, matching the other extractor functions.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The stripped rating text, or "" when neither lookup succeeds.
    """
    candidates = (
        ("i", 'a-icon a-icon-star a-star-4-5'),
        ("span", 'a-icon-alt'),
    )

    for tag_name, css_class in candidates:
        try:
            return soup.find(tag_name, attrs={'class': css_class}).string.strip()
        except AttributeError:
            # Element absent or without a single string child; try the next one.
            continue

    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text from a parsed product page.

    Reads the ``.string`` of the <span> with id 'acrCustomerReviewText' and
    strips surrounding whitespace.

    Args:
        soup: Parsed product page exposing BeautifulSoup's ``find`` method.

    Returns:
        str: The stripped text, or "" when the lookup raises AttributeError
        (span missing, or its ``.string`` is None).
    """
    try:
        counter = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return counter.string.strip()
    except AttributeError:
        return ""


In [None]:
if __name__ == '__main__':
    # Crawl the Amazon.in search results for bedroom air purifiers, visit every
    # product link found on each results page, extract the fields with the
    # get_* helpers, and save the collected rows to a CSV file.

    # Browser-like request headers; add your own user agent here.
    HEADERS = ({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",'Accept-Language':'en-US, en;q=0.5'})

    # The search-results URL; "&page=N" is appended for each results page.
    URL = 'https://www.amazon.in/s?k=air+purifier+for+bedroom&rh=p_72%3A1318476031&dc&crid=1PLUGLCS6W9O2&qid=1754391955&rnid=1318475031&sprefix=air+purifier+for+bedroom%2Caps%2C290&ref=sr_nr_p_72_1&ds=v1%3AUZEN1LydFH6Jj%2B7ks35touXvZqcwwTh00ZIN%2Bhrisd8'

    # Seconds to wait for a response before giving up, so one stalled
    # connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    # Destination of the scraped table.
    OUTPUT_CSV = "Amazon_data_v2.csv"

    # Column name -> list of values; every product contributes one entry to each list.
    d = {"Brand":[],"Title":[], "Price":[],"Feature":[], "Rating":[], "Reviews":[]}

    for page in range(1, 20):  # Scrape result pages 1-19; change the range as needed
        print(f"Scraping page {page}...")
        url = f"{URL}&page={page}"

        try:
            webpage = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            # Surface HTTP errors (e.g. a block page) instead of silently
            # parsing a response that contains no product links.
            webpage.raise_for_status()
            soup = BeautifulSoup(webpage.content, "html.parser")
        except Exception as e:
            print(f"Failed to fetch search results page {page}: {e}")
            continue

        links = soup.find_all('a', attrs={
            'class': "a-link-normal s-line-clamp-2 s-line-clamp-3-for-col-12 s-link-style a-text-normal"
        })

        # Anchors without an href cannot be followed; skipping them avoids a
        # TypeError when the URL is built below.
        links_list = [link.get('href') for link in links if link.get('href')]

        for link in links_list:
            product_url = "https://www.amazon.in" + link
            try:
                new_webpage = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                new_webpage.raise_for_status()
                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                # Extract every field before touching `d`, so a failure part-way
                # through cannot leave the column lists with different lengths.
                row = {
                    "Brand": get_brand(new_soup),
                    "Title": get_title(new_soup),
                    "Price": get_price(new_soup),
                    "Feature": get_feature(new_soup),
                    "Rating": get_rating(new_soup),
                    "Reviews": get_review_count(new_soup),
                }
            except Exception as e:
                print(f"Failed to scrape product page: {product_url} - {e}")
            else:
                for column, value in row.items():
                    d[column].append(value)
            finally:
                time.sleep(1)  # Delay after every request to reduce risk of blocking

    Amazon_df = pd.DataFrame.from_dict(d)
    # Pages where no title was found are treated as failed scrapes and dropped.
    # Assign the result back rather than using a chained inplace replace, which
    # does not reliably update the frame under pandas copy-on-write.
    Amazon_df['Title'] = Amazon_df['Title'].replace('', np.nan)
    Amazon_df = Amazon_df.dropna(subset=['Title'])
    Amazon_df.to_csv(OUTPUT_CSV, header=True, index=False)
    print(f"Scraping complete. Data saved to {OUTPUT_CSV}.")

In [None]:
# Display the DataFrame assigned to Amazon_df in the scraping cell above.
Amazon_df

Unnamed: 0,Brand,Title,Price,Feature,Rating,Reviews
0,Coway,Coway Airmega 150 (AP-1019C) Air Purifier For ...,"₹14,999.00",Special Green Anti Virus True HEPA,4.4 out of 5 stars,"6,075 ratings"
1,Honeywell,"Honeywell Air Purifier for Home & Office, 3-in...","₹4,985.00",HEPA,4.0 out of 5 stars,"10,330 ratings"
2,LEVOIT,LEVOIT Core Mini Air Purifier For Coverage Are...,"₹5,499.00",HEPA,4.4 out of 5 stars,"85,117 ratings"
3,Honeywell,"Honeywell Air Purifier for Home & Office, 3-in...","₹4,985.00",HEPA,4.0 out of 5 stars,"10,330 ratings"
4,Philips,Philips AC0920 Smart Air Purifier for Home| Re...,"₹7,999.00",HEPA,4.1 out of 5 stars,"3,114 ratings"
...,...,...,...,...,...,...
413,KYARI,"KYARI Jade Plant, Spider Plant, Oyster Plant, ...","₹1,082.00",,4.0 out of 5 stars,2 ratings
414,VENDFO,"VENDFO Halloween Air Purifier for Home, Pumpki...","₹10,008.00",HEPA,4.4 out of 5 stars,20 ratings
415,TDBYWAE,"Air Purifiers for Home, TDBYWAE Air Purifiers ...","₹7,820.00",HEPA,4.2 out of 5 stars,48 ratings
416,CPENSUS,CPENSUS Air Purifier For Bedroom Coverage 107 ...,"₹3,086.00",HEPA,4.2 out of 5 stars,58 ratings
