In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [None]:
def get_title(soup):
    """Return the product title found in a parsed page.

    Looks up the ``<span id="productTitle">`` element via ``soup.find`` and
    returns its text with leading/trailing whitespace stripped.

    Returns an empty string when the lookup raises ``AttributeError``
    (for example because ``find`` returned ``None``).
    """
    try:
        title_tag = soup.find("span", attrs={"id": "productTitle"})
        return title_tag.text.strip()
    except AttributeError:
        # No matching element (or nothing usable to search in).
        return ""

In [None]:
def get_username(review):
    """Return the text of the first ``<span class="a-profile-name">`` in ``review``.

    The text is returned unmodified (no stripping). Returns an empty string
    when the lookup raises ``AttributeError``, e.g. when no such element
    exists and ``find`` returns ``None``.
    """
    try:
        return review.find("span", {"class": "a-profile-name"}).text
    except AttributeError:
        return ""

In [None]:
def get_review_text(review):
    """Return the body text of a review.

    Finds the ``<span class="a-size-base review-text">`` wrapper inside
    ``review`` and returns the unmodified text of the first ``<span>`` nested
    in it.

    Returns an empty string when either lookup step raises
    ``AttributeError`` (wrapper or inner span missing).
    """
    try:
        wrapper = review.find("span", {"class": "a-size-base review-text"})
        inner_span = wrapper.find("span")
        return inner_span.text
    except AttributeError:
        return ""


def get_review_date(review):
    """Return the text of the review-date element in ``review``.

    Looks up ``<span class="a-size-base a-color-secondary review-date">`` and
    returns its text unmodified. Returns an empty string when the lookup
    raises ``AttributeError`` (e.g. the element is absent).
    """
    try:
        date_tag = review.find(
            "span", {"class": "a-size-base a-color-secondary review-date"}
        )
        return date_tag.text
    except AttributeError:
        return ""

In [None]:
def get_rating(review):
    """Return the text of the first ``<span class="a-icon-alt">`` in ``review``.

    The text is returned as-is; no numeric parsing is done here. Returns an
    empty string when the lookup raises ``AttributeError`` (e.g. the element
    is absent).
    """
    try:
        return review.find("span", {"class": "a-icon-alt"}).text
    except AttributeError:
        return ""


def get_asin(soup):
    """Return the ``data-asin`` attribute of the first ``<div>`` that has one.

    Returns ``None`` when the lookup raises ``AttributeError`` or
    ``TypeError`` -- for instance when no such ``<div>`` exists, so ``find``
    returns ``None`` and subscripting it fails.
    """
    try:
        asin_holder = soup.find("div", attrs={"data-asin": True})
        return asin_holder["data-asin"]
    except (AttributeError, TypeError):
        return None


In [None]:
if __name__ == '__main__':
    # Walk the search-result pages, open every product page they link to,
    # extract one row of fields per product, and save the rows to
    # amazon_data.csv. The resulting frame is left in `amazon_df`.

    # Browser-like headers sent with every request; add your own user agent.
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Search-results URL; `{}` is filled with the page number.
    BASE_URL = "https://www.amazon.in/s?k=wifi+router+for+home&page={}"

    # Per-request timeout in seconds. requests waits indefinitely by default,
    # so a single stalled connection would otherwise hang the whole run.
    REQUEST_TIMEOUT = 30

    # Column name -> list of values. Each product appends exactly one value to
    # every list, which keeps the columns aligned.
    d = {"Asin":[], "UserName":[], "Rating":[], "Subject":[],"ReviewDate":[], 'Review':[]}

    for page_num in range(1, 21):  # search-result pages 1 through 20
        url = BASE_URL.format(page_num)
        try:
            webpage = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One failing page must not discard the rows collected so far.
            print(f"Skipping search page {page_num}: {exc}")
            continue

        soup = BeautifulSoup(webpage.content, "html.parser")

        # Find all links to products on the page
        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

        # Loop over each product link and extract details
        for link in links:
            product_path = link.get('href')
            # `get` returns None for anchors without an href; those, and
            # links without a '/dp/' segment, are not product links.
            if not product_path or '/dp/' not in product_path:
                continue

            # The path segment right after '/dp/' is used as the product id.
            product_id = product_path.split('/dp/')[1].split('/')[0]
            product_url = "https://www.amazon.in/dp/" + product_id
            try:
                product_page = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(f"Skipping product {product_id}: {exc}")
                continue
            product_soup = BeautifulSoup(product_page.content, "html.parser")

            # Every helper receives the whole product page, so each field is
            # taken from the first matching element on that page.
            d['Asin'].append(get_asin(product_soup))
            d['UserName'].append(get_username(product_soup))
            d['Rating'].append(get_rating(product_soup))
            d['Subject'].append(get_title(product_soup))
            d['ReviewDate'].append(get_review_date(product_soup))
            d['Review'].append(get_review_text(product_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: the chained in-place form is not guaranteed to
    # modify `amazon_df` itself, which would let empty ASINs survive dropna.
    amazon_df['Asin'] = amazon_df['Asin'].replace('', np.nan)
    # Drop rows where no ASIN was found (None from get_asin, or empty string).
    amazon_df = amazon_df.dropna(subset=['Asin'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)

In [None]:
# Display the (rows, columns) dimensions of the cleaned DataFrame built by the scraping cell.
amazon_df.shape


(205, 6)