<a href="https://colab.research.google.com/github/kashifkhan9555/web-scrapping-with-beautiful-soap/blob/main/web_scrapping_with_beautiful_soap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [15]:
import csv
import requests
from bs4 import BeautifulSoup
import time
import random

def scrape_webpage(url, headers):
    retries = 3
    for _ in range(retries):
        try:
            # Fetch HTML content from the URL
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes
            html_content = response.text

            # Parse HTML using BeautifulSoup
            soup = BeautifulSoup(html_content, "html.parser")

            # Extract relevant information
            text_content = soup.get_text().replace("\n", " ").replace(",", " ")
            image_urls = [img["src"] for img in soup.find_all("img", src=True)]
            links = [link["href"] for link in soup.find_all("a", href=True)]

            # Store scraped data in a structured format (dictionary)
            data = {
                "URL": url,
                "Text Content": text_content,
                "Image URLs": ", ".join(image_urls),
                "Links": ", ".join(links)
            }
            return data
        except requests.exceptions.HTTPError as e:
            print(f"Failed to fetch HTML content from {url}. Status code: {response.status_code}")
            if response.status_code in [403, 429, 520]:
                # Implement a backoff strategy for 403 Forbidden, 429 Too Many Requests, and 520 Origin Error
                if _ == retries - 1:
                    print(f"Skipping {url} after maximum retries.")
                    return None
                print(f"Retrying {url} in 60 seconds...")
                time.sleep(60)
            else:
                break
        except (requests.exceptions.RequestException, Exception) as e:
            print(f"An error occurred while scraping {url}: {e}")
            if _ == retries - 1:
                print(f"Skipping {url} after maximum retries.")
                return None
            # Implement a backoff strategy for other exceptions
            print(f"Retrying {url} in {2 ** _ + random.uniform(0, 1)} seconds...")
            time.sleep(2 ** _ + random.uniform(0, 1))
    return None

def main():
    # List of URLs to scrape
    urls = [
        "https://products.basf.com/global/en/ci/n-vinyl-2-pyrrolidone.html",
        "https://pubchem.ncbi.nlm.nih.gov/compound/N-Vinyl-2-pyrrolidone",
        "https://www.shokubai.co.jp/en/products/detail/nvp/",
        "https://pubchem.ncbi.nlm.nih.gov/compound/N-Vinyl-2-pyrrolidone",
        "https://www.sciencedirect.com/topics/pharmacology-toxicology-and-pharmaceutical-science/1-vinyl-2-pyrrolidinone",
        "https://www.ncbi.nlm.nih.gov/books/NBK498761/#:~:text=It%20is%20used%20in%20the,the%20synthesis%20of%20phenolic%20resins",
        "https://www.sciencedirect.com/topics/agricultural-and-biological-sciences/polyvinylpyrrolidone#:~:text=PVP%20added%20to%20iodine%20forms,trade%20name%20Betadine%20and%20Pyodine",
        "https://www.shokubai.co.jp/en/products/detail/nvp/#:~:text=N%2Dvinylpyrrolidone%20is%20a%20nonionic,monomer%20with%20the%20following%20features.&text=N%2Dvinylpyrrolidone%20is%20used%20as,of%20reactivity%20with%20UV%20irradiation",
        "https://adhesives.specialchem.com/product/m-basf-n-vinyl-pyrrolidone-nvp",
        "https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/",
        "https://pubs.rsc.org/en/content/articlelanding/2019/py/c8py01459k",
        "https://www.science.gov/topicpages/n/n-vinyl+pyrrolidone+nvp",
        "https://shdexiang.en.made-in-china.com/product/tXfQDioPsKVn/China-N-Vinylpyrrolidone-CAS-No-88-12-0-C6h9no.html",
        "https://www.cphi-online.com/nvp-n-vinylpyrrolidone-prod1288298.html",
        "https://www.mdpi.com/2073-4360/11/6/1079"
    ]

    # Rotate user-agent headers
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4"
    ]

    # Scrape data from each URL
    scraped_data = []
    for url in urls:
        headers = {"User-Agent": random.choice(user_agents)}
        data = scrape_webpage(url, headers)
        if data:
            scraped_data.append(data)

    # Save scraped data to a CSV file
    output_file_path = "scraped_data.csv"
    with open(output_file_path, "w", newline="", encoding="utf-8") as csv_file:
        fieldnames = ["URL", "Text Content", "Image URLs", "Links"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(scraped_data)

    print(f"Scraped data saved to '{output_file_path}' in CSV format.")

if __name__ == "__main__":
    main()

Failed to fetch HTML content from https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/. Status code: 520
Retrying https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/ in 60 seconds...
Failed to fetch HTML content from https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/. Status code: 520
Retrying https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/ in 60 seconds...
Failed to fetch HTML content from https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/. Status code: 520
Skipping https://www.welinkschem.com/nvp-n-vinyl-pyrrolidone/ after maximum retries.
Scraped data saved to 'scraped_data.csv' in CSV format.


In [13]:
import os
print(os.listdir())


['.config', 'scraped_data.csv', 'scraped_data.json', 'drive', 'sample_data']


In [16]:
from google.colab import files
files.download('scraped_data.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>