# Scrape Website Content

This notebook scrapes a hierarchical HTML website and uploads the raw output to Azure Blob Storage.

## Install dependencies

## Load credentials

## Set execution parameters

## Perform the scrape 

In [None]:
import requests
from bs4 import BeautifulSoup
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json

# Import necessary libraries

# Define the URL of the website to scrape
url = 'https://example.com'

# Send a GET request to the website. requests applies no timeout by default,
# so set one explicitly to keep an unresponsive server from hanging the run.
response = requests.get(url, timeout=30)

# Stop on HTTP errors (4xx/5xx) instead of parsing an error page, which would
# yield empty data that is then uploaded over the existing blob.
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Function to recursively scrape hierarchical data
def scrape_data(soup, item_class='item', sub_item_class='sub-item'):
    """Collect title/description records from ``div`` elements under *soup*.

    Args:
        soup: A BeautifulSoup document or tag to search.
        item_class: CSS class of the ``div`` elements collected at this level.
        sub_item_class: CSS class of the nested ``div`` elements collected
            under each record's ``'sub_items'`` key.

    Returns:
        A list with one dict per matching ``div``, in document order. Each
        dict has ``'title'`` (text of the first ``h2`` inside the div) and
        ``'description'`` (text of the first ``p``); either is ``None`` when
        the tag is absent. ``'sub_items'`` is present only when the div
        contains sub-item divs, and holds their records in the same format.

    Note:
        ``find_all`` searches every descendant, so a matching div nested
        inside another matching div is also listed at the enclosing level.
    """
    data = []
    for item in soup.find_all('div', class_=item_class):
        title_tag = item.find('h2')
        description_tag = item.find('p')
        item_data = {
            # A missing heading/paragraph is recorded as None rather than
            # aborting the whole scrape with an AttributeError.
            'title': title_tag.text if title_tag is not None else None,
            'description': (
                description_tag.text if description_tag is not None else None
            ),
        }
        # Recurse on the item's own subtree, looking for the sub-item class.
        # Re-parsing str(ResultSet) and searching it for the top-level item
        # class (as before) never matched the sub-item divs themselves.
        sub_items = scrape_data(
            item,
            item_class=sub_item_class,
            sub_item_class=sub_item_class,
        )
        if sub_items:
            item_data['sub_items'] = sub_items
        data.append(item_data)
    return data

# Destination in Azure Blob Storage (replace the placeholders with real values)
connection_string = 'your_connection_string'
container_name = 'your_container_name'
blob_name = 'scraped_data.json'

# Walk the parsed page and collect the hierarchical records
scraped_data = scrape_data(soup)

# Serialize the records as pretty-printed JSON text
scraped_data_str = json.dumps(scraped_data, indent=4)

# Build the client chain: storage account -> container -> blob
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(blob_name)

# Write the JSON text to the blob, replacing any existing blob of that name
blob_client.upload_blob(scraped_data_str, overwrite=True)

print("Data scraped and uploaded to Azure Blob Storage successfully.")