<a href="https://colab.research.google.com/github/kavyakapoor200/python-scrapper/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 pandas




In [None]:
# Import necessary libraries
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to extract the Product Title
def get_title(soup):
    """Return the stripped text of the ``<span id="productTitle">`` element.

    Args:
        soup: Parsed document exposing a ``find(name, attrs=...)`` method.

    Returns:
        The element's text with surrounding whitespace removed, or the
        placeholder ``"N/A"`` when the lookup raises ``AttributeError``
        (for example because ``find`` returned ``None``).
    """
    try:
        title_node = soup.find("span", attrs={"id": "productTitle"})
        title = title_node.text.strip()
    except AttributeError:
        # No matching element (or nothing to search in): use the placeholder.
        title = "N/A"
    return title

#  Function to extract Product Price
def get_price(soup):
    """Return the stripped text of the first price element that is present.

    Two ``<span>`` lookups are tried in order: ``class="a-price-whole"``
    first, then ``id="priceblock_dealprice"``.

    Args:
        soup: Parsed document exposing a ``find(name, attrs=...)`` method.

    Returns:
        The text of the first lookup that succeeds, or ``"N/A"`` when both
        lookups raise ``AttributeError``.
    """
    # Ordered by priority: the first selector that yields a node wins.
    selectors = (
        {"class": "a-price-whole"},
        {"id": "priceblock_dealprice"},
    )
    for attrs in selectors:
        try:
            return soup.find("span", attrs=attrs).text.strip()
        except AttributeError:
            # find() returned None for this selector; try the next one.
            continue
    return "N/A"

#  Function to extract Product Rating
def get_rating(soup):
    """Return the stripped text of the rating element, if one is found.

    The ``<span class="a-icon-alt">`` element is preferred; when that lookup
    fails, ``<span class="a-declarative">`` is used instead.

    Args:
        soup: Parsed document exposing a ``find(name, attrs=...)`` method.

    Returns:
        The text of the first lookup that succeeds, or ``"N/A"`` when both
        lookups raise ``AttributeError``.
    """
    # Preferred selector.
    try:
        return soup.find("span", attrs={"class": "a-icon-alt"}).text.strip()
    except AttributeError:
        pass  # not present; fall through to the secondary selector

    # Secondary selector, then the placeholder.
    try:
        return soup.find("span", attrs={"class": "a-declarative"}).text.strip()
    except AttributeError:
        return "N/A"

#  Function to extract Seller Name
def get_seller_name(soup):
    """Return the stripped text of the seller element, if one is found.

    Lookups are attempted in order: ``<a id="bylineInfo">`` and then
    ``<div id="merchant-info">``.

    Args:
        soup: Parsed document exposing a ``find(name, attrs=...)`` method.

    Returns:
        The text of the first lookup that succeeds, or ``"N/A"`` when both
        lookups raise ``AttributeError``.
    """
    # (tag name, attribute filter) pairs, highest priority first.
    lookups = (
        ("a", {"id": "bylineInfo"}),
        ("div", {"id": "merchant-info"}),
    )
    for tag_name, attrs in lookups:
        try:
            seller = soup.find(tag_name, attrs=attrs).text.strip()
        except AttributeError:
            continue  # element missing; move on to the next candidate
        return seller
    return "N/A"

#  Function to handle HTTP request retries
def safe_request(url, headers, retries=3, delay=2, timeout=10, session=None):
    """Issue a GET request, retrying on failure, and return the response.

    An attempt counts as failed when the response status is not 200 or when
    the request raises ``requests.exceptions.RequestException``.

    Args:
        url: Address to fetch.
        headers: Mapping of HTTP headers sent with every attempt.
        retries: Maximum number of attempts (default 3). A value of 0 or
            less makes no request at all.
        delay: Seconds to wait between two consecutive attempts (default 2).
        timeout: Per-attempt timeout in seconds handed to ``get`` (default 10).
        session: Optional object exposing ``get(url, headers=..., timeout=...)``
            (for instance a ``requests.Session``) to send the request through.
            When omitted, the module-level ``requests.get`` is used.

    Returns:
        The first response whose ``status_code`` is 200, or ``None`` when
        every attempt failed.
    """
    client = requests if session is None else session

    for attempt in range(1, retries + 1):
        try:
            response = client.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.exceptions.RequestException:
            # Deliberate best-effort: a transport error is handled exactly
            # like a non-200 answer, i.e. by retrying.
            pass

        # Only wait when another attempt follows; sleeping after the final
        # failure would just delay the caller for no benefit.
        if attempt < retries:
            time.sleep(delay)

    return None  # all attempts exhausted

#  Function to scrape product links from Amazon search pages
def get_product_links(page_num):
    """Collect absolute product URLs from one search-results page.

    The page is fetched through ``safe_request``; every
    ``<a class="a-link-normal s-no-outline">`` anchor found in it contributes
    one link.

    Args:
        page_num: Value substituted into the ``page`` query parameter of the
            search URL.

    Returns:
        A list of absolute URLs in document order. The list is empty when the
        request failed (a message is printed in that case) or when no anchor
        matched.
    """
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.90 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    SITE_ROOT = "https://www.amazon.in"

    BASE_URL = f"https://www.amazon.in/s?rh=n%3A6612025031&page={page_num}"
    response = safe_request(BASE_URL, HEADERS)

    if response is None:
        print(f" Skipping page {page_num}, request failed!")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a", attrs={"class": "a-link-normal s-no-outline"})

    product_links = []
    for anchor in anchors:
        href = anchor.get("href")
        if not href:
            # An anchor without a usable href would make string
            # concatenation raise TypeError and abort the whole run.
            continue
        # urljoin leaves already-absolute hrefs untouched and resolves
        # relative ones against the site root.
        product_links.append(urljoin(SITE_ROOT, href))

    return product_links

#  Function to scrape product details
def scrape_product_data(product_url):
    """Fetch a single product page and pull out its four fields.

    Args:
        product_url: Address of the product page to download.

    Returns:
        A four-element list ``[title, price, rating, seller]`` produced by
        ``get_title``, ``get_price``, ``get_rating`` and ``get_seller_name``.
        When ``safe_request`` returns ``None`` every element is ``"N/A"``.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.90 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    # One extractor per output field, in output order.
    extractors = (get_title, get_price, get_rating, get_seller_name)

    page = safe_request(product_url, request_headers)
    if page is None:
        # Request failed: emit a placeholder row instead of dropping it.
        return ["N/A"] * 4

    parsed = BeautifulSoup(page.content, "html.parser")
    return [extract(parsed) for extract in extractors]

#  Scrape all 81 pages efficiently
# Column-oriented accumulator: one list per CSV column, all filled in lockstep.
final_data = {"Product Name": [], "Price": [], "Rating": [], "Seller Name": []}

for page in range(1, 82):  # pages 1 through 81 inclusive
    print(f" Scraping page {page}...")

    product_links = get_product_links(page)

    for product_url in product_links:
        product_data = scrape_product_data(product_url)

        # The columns were declared in the same order as the fields of
        # product_data, so pair them up by position.
        for position, column in enumerate(final_data):
            final_data[column].append(product_data[position])

        time.sleep(1)  # Prevent getting blocked

    # Checkpoint: every fifth page, dump everything gathered so far.
    if not page % 5:
        pd.DataFrame(final_data).to_csv("amazon_products_partial.csv", index=False)
        print(" Intermediate data saved!")

# Write the complete result set once all pages have been visited.
df = pd.DataFrame(final_data)
df.to_csv("amazon_products.csv", index=False)
print(" Data scraping complete! Data saved to amazon_products.csv")


 Scraping page 1...
 Skipping page 1, request failed!
 Scraping page 2...
 Skipping page 2, request failed!
 Scraping page 3...
 Skipping page 3, request failed!
 Scraping page 4...
 Skipping page 4, request failed!
 Scraping page 5...
 Skipping page 5, request failed!
 Intermediate data saved!
 Scraping page 6...
 Scraping page 7...
 Scraping page 8...
 Skipping page 8, request failed!
 Scraping page 9...
 Scraping page 10...
 Skipping page 10, request failed!
 Intermediate data saved!
 Scraping page 11...
 Skipping page 11, request failed!
 Scraping page 12...
 Skipping page 12, request failed!
 Scraping page 13...
 Skipping page 13, request failed!
 Scraping page 14...
 Skipping page 14, request failed!
 Scraping page 15...
 Skipping page 15, request failed!
 Intermediate data saved!
 Scraping page 16...
 Scraping page 17...
 Skipping page 17, request failed!
 Scraping page 18...
 Skipping page 18, request failed!
 Scraping page 19...
 Skipping page 19, request failed!
 Scraping pag