In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Step 1: Perform a Google search and extract the top 10 result URLs
def get_google_search_results(query, timeout=10):
    """Run a Google web search and return the URLs of the top results.

    Args:
        query: Free-text search phrase. It is sent through ``params`` so that
            spaces and reserved characters (``&``, ``#``, ``+`` ...) are
            percent-encoded instead of corrupting the query string.
        timeout: Seconds to wait for Google before giving up (default 10).
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        list[str]: The ``href`` of the first link found in each of (at most)
        the first 10 result containers, in page order. The list is empty when
        no result container is found in the returned HTML.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Send a GET request to Google search; let requests build and encode the
    # query string rather than formatting the raw query into the URL.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query},
        headers=headers,
        timeout=timeout,
    )
    # Fail loudly on an error status instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the top 10 results.
    # NOTE(review): 'tF2Cxc' is assumed to be the CSS class of a result
    # container; if the page markup differs, this simply matches nothing.
    result_divs = soup.find_all('div', class_='tF2Cxc', limit=10)

    urls = []
    for result in result_divs:
        # Only accept anchors that actually carry an href; a container without
        # a usable link is skipped rather than raising TypeError/KeyError.
        anchor = result.find('a', href=True)
        if anchor is not None:
            urls.append(anchor['href'])

    return urls

In [4]:
# Step 2: Scrape each URL for text, H1, H2, H3 elements
def scrape_page(url, timeout=10):
    """Download a page and collect its paragraph text and H1-H3 headings.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up (default 10).
            Without a timeout a single unresponsive site could stall the
            caller indefinitely.

    Returns:
        dict | None: On success, a dict with the keys ``'url'`` (the input
        URL), ``'text'`` (the text of every ``<p>`` element joined by single
        spaces) and ``'h1'``, ``'h2'``, ``'h3'`` (lists with the text of each
        heading of that level). ``None`` when fetching or parsing failed; the
        error is printed in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so an error page is not scraped
        # and reported as if it were the real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from <p> tags
        text = ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))

        # Extract H1, H2, H3 headers (one list of strings per heading level)
        headings = {
            tag: [node.get_text() for node in soup.find_all(tag)]
            for tag in ('h1', 'h2', 'h3')
        }

        return {
            'url': url,
            'text': text,
            **headings,
        }
    except Exception as e:
        # Deliberately best-effort: any failure on one page is reported and
        # signalled with None so the caller can continue with the next URL.
        print(f"Failed to scrape {url}: {e}")
        return None

In [5]:
# Step 3: Combine the Google search and page scraping
def main(query):
    """Search Google for ``query`` and print a summary of each result page.

    Every URL returned by ``get_google_search_results`` is handed to
    ``scrape_page``. For a successfully scraped page the H1/H2/H3 headings and
    the first 200 characters of its paragraph text are printed; a page for
    which ``scrape_page`` returns a falsy value is reported as skipped.

    Args:
        query: Search phrase forwarded to ``get_google_search_results``.

    Returns:
        None. All results are written to standard output.
    """
    for result_url in get_google_search_results(query):
        print(f"Scraping {result_url} ...")
        scraped = scrape_page(result_url)

        # Nothing usable came back: report it and move on to the next URL.
        if not scraped:
            print(f"Skipping {result_url} due to an error.")
            continue

        print(f"Scraped data from {result_url}:")
        print(f"H1 tags: {scraped['h1']}")
        print(f"H2 tags: {scraped['h2']}")
        print(f"H3 tags: {scraped['h3']}")
        # Only the first 200 characters of the text are shown as a preview.
        print(f"Text preview: {scraped['text'][:200]}...\n")

# Example usage: run the whole pipeline for one sample query.
# This performs live HTTP requests (the Google search plus one request per
# result URL), so the printed output depends on the network and on whatever
# results are returned at the time the cell is executed.
main("web scraping tutorial")

Scraping https://www.geeksforgeeks.org/python-web-scraping-tutorial/ ...
Scraped data from https://www.geeksforgeeks.org/python-web-scraping-tutorial/:
H1 tags: ['Python Web Scraping Tutorial']
H2 tags: ['Introduction to Web Scraping', 'Basics of Web Scraping', 'Setting Up the Environment', 'Extracting Data from Web Pages', 'Fetching Web Pages', 'HTTP Request Methods', 'Searching and Extract for specific tags Beautifulsoup', 'Scrapy Basics', 'Selenium Python Basics', ' Essential Packages and Tools for Python Web Scraping ', ' Requests Module ', ' BeautifulSoup Library ', ' Selenium ', ' Lxml ', ' Urllib Module ', ' PyautoGUI ', ' Schedule ', ' Why Python3 for Web Scraping? ', ' Conclusion ', 'What kind of Experience do you want to share?']
H3 tags: [' Example: Making a Request ', ' Example ', ' Finding Elements by Class ', ' Example  1: For Firefox ', ' Example 2: For Chrome ', ' Example ', ' Example ', ' Example ', ' Example ', 'Similar Reads', 'Please Login to comment...']
Text previ