<a href="https://colab.research.google.com/github/jahnavi6078/web-scrapping/blob/main/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

# Define your User-Agent string
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"

# Function to scrape product details from a single page
def scrape_page(url):
    headers = {
        "User-Agent": user_agent
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    product_details = []

    products = soup.find_all("div", class_="s-result-item")
    for product in products:
        product_url_element = product.find("a", class_="a-link-normal s-no-outline")
        if not product_url_element:
            continue  # Skip this product if the URL is not found

        product_url = product_url_element.get("href")

        product_name_element = product.find("span", class_="a-text-normal")
        if not product_name_element:
            continue  # Skip this product if the name is not found
        product_name = product_name_element.text.strip()

        product_price_element = product.find("span", class_="a-price-whole")
        if product_price_element:
            product_price = product_price_element.text
        else:
            product_price = "Not available"

        product_rating_element = product.find("span", class_="a-icon-alt")
        if product_rating_element:
            product_rating = product_rating_element.text
        else:
            product_rating = "Not rated"

        product_reviews_element = product.find("span", {"class": "a-size-base", "aria-label": " customer review"})
        if product_reviews_element:
            product_reviews = product_reviews_element.text
        else:
            product_reviews = "No reviews"

        product_details.append({
            "Product URL": product_url,
            "Product Name": product_name,
            "Product Price": product_price,
            "Rating": product_rating,
            "Number of reviews": product_reviews
        })

    return product_details

# Function to scrape data from multiple pages
def scrape_multiple_pages(base_url, num_pages):
    all_product_details = []

    for page in range(1, num_pages + 1):
        url = f"{base_url}&page={page}"
        product_details = scrape_page(url)

        if product_details:
            all_product_details.extend(product_details)

    return all_product_details

# Define your base URL and the number of pages to scrape
base_url = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2C,aps,283&ref=sr_pg_1"
num_pages = 20  # Set the number of pages to scrape

scraped_data = scrape_multiple_pages(base_url, num_pages)

# Save the data to a CSV file
csv_file = 'amazon_products.csv'

with open(csv_file, 'w', newline='') as csvfile:
    fieldnames = ['Product URL', 'Product Name', 'Product Price', 'Rating', 'Number of reviews']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for product in scraped_data:
        writer.writerow(product)


In [6]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import time

# Define your User-Agent string
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"

# Function to scrape additional product details from a single product page
def scrape_product_details(product_url):
    headers = {
        "User-Agent": user_agent
    }

    # Prepend 'https://' to the URL if it's a relative URL
    if not product_url.startswith("http"):
        product_url = "https://www.amazon.in" + product_url

    retries = 3  # Number of retries
    for _ in range(retries):
        response = requests.get(product_url, headers=headers)
        if response.status_code == 200:
            break
        else:
            print(f"Failed to retrieve the product page. Status Code: {response.status_code}")
            time.sleep(5)  # Wait for a few seconds before retrying

    if response.status_code != 200:
        print("Failed to retrieve the product page after retries.")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    product_details = {
        "Description": "",
        "ASIN": "",
        "Product Description": "",
        "Manufacturer": ""
    }

    # Extract additional information here (modify as needed)
    # Example:
    product_title_element = soup.find("span", id="productTitle")
    if product_title_element:
        product_details["Description"] = product_title_element.text.strip()
    else:
        product_details["Description"] = "Not available"

    asin_element = soup.find("th", text="ASIN")
    if asin_element:
        asin_td = asin_element.find_next("td")
        if asin_td:
            product_details["ASIN"] = asin_td.text.strip()
        else:
            product_details["ASIN"] = "Not available"
    else:
        product_details["ASIN"] = "Not available"

    product_description_element = soup.find("div", id="productDescription")
    if product_description_element:
        product_details["Product Description"] = product_description_element.text.strip()
    else:
        product_details["Product Description"] = "Not available"

    manufacturer_element = soup.find("th", text="Manufacturer")
    if manufacturer_element:
        manufacturer_td = manufacturer_element.find_next("td")
        if manufacturer_td:
            product_details["Manufacturer"] = manufacturer_td.text.strip()
        else:
            product_details["Manufacturer"] = "Not available"
    else:
        product_details["Manufacturer"] = "Not available"

    return product_details

# Function to scrape data from multiple product URLs
def scrape_multiple_product_details(product_urls):
    all_product_details = []

    for url in product_urls:
        product_details = scrape_product_details(url)

        if product_details:
            all_product_details.append(product_details)

    return all_product_details

# Load the product URLs from the CSV file created earlier
product_urls = []

with open('amazon_products.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        product_urls.append(row['Product URL'])

# Scrape additional details for the specified number of product URLs
product_details = scrape_multiple_product_details(product_urls[:200])

# Save the additional data to a CSV file
additional_data_csv = 'additional_product_data.csv'

with open(additional_data_csv, 'w', newline='') as csvfile:
    fieldnames = ["Description", "ASIN", "Product Description", "Manufacturer"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for product in product_details:
        writer.writerow(product)


  asin_element = soup.find("th", text="ASIN")
  manufacturer_element = soup.find("th", text="Manufacturer")


Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page after retries.
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product page. Status Code: 503
Failed to retrieve the product 

In [9]:
!git clone https://github.com/jahnavi6078/web-scrapping.git

Cloning into 'web-scrapping'...
