In [22]:
import json
import logging
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [25]:
def _extract_abstract(page_soup):
    """Return the text of the first usable introductory paragraph, or None.

    Looks for a ``<p>`` inside ``<div class="abstract">`` first and, failing
    that, inside ``<main>``. A container that is missing, or that holds no
    ``<p>``, simply falls through to the next candidate.
    """
    for selector in (('div', {'class': 'abstract'}), ('main',)):
        container = page_soup.find(*selector)
        if container is None:
            continue
        paragraph = container.find('p')
        if paragraph is not None:
            return paragraph.text
    return None


def scrape_azure_services(output_filepath):
    """Scrape the Azure docs landing page into a DataFrame, one row per service.

    For every category listed under the ``products`` list of the landing page,
    each service card is read (name, short description, icon, link) and the
    service's own page is fetched to obtain a longer description.

    Parameters
    ----------
    output_filepath : str or os.PathLike
        Must end in ``.csv``. The value is only validated; this function does
        not write any file, it returns the DataFrame instead.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``category_id``, ``category_name``, ``name``,
        ``short_description``, ``long_description``, ``link``, ``icon``.
        Empty (but with these columns) when no service could be scraped.

    Raises
    ------
    AssertionError
        If ``output_filepath`` does not end in ``.csv``.
    requests.RequestException
        If the landing page itself cannot be fetched. Failures on individual
        service pages are reported on stdout and that service is skipped.
    """
    # str() lets pathlib.Path arguments pass the same suffix check as plain strings.
    assert str(output_filepath).endswith('.csv'), 'output_filepath must end with .csv'

    AZURE_DOCS_URL = 'https://docs.microsoft.com/en-us/azure/'
    # Without a timeout a stalled connection would block the notebook forever.
    REQUEST_TIMEOUT_SECONDS = 30
    COLUMNS = [
        'category_id', 'category_name', 'name', 'short_description', 'long_description', 'link', 'icon'
    ]

    landing_html = requests.get(
        AZURE_DOCS_URL + '#pivot=products', timeout=REQUEST_TIMEOUT_SECONDS
    ).text
    soup = BeautifulSoup(landing_html, 'html.parser')

    azure_services = []
    categories_soup = soup.find('ul', {'id': 'products'}).find_all(
        'a', {'data-linktype': 'self-bookmark'}
    )

    # The first bookmark is skipped (presumably a non-category entry -- TODO confirm).
    for category in categories_soup[1:]:
        # Drop the first character of the href (presumably the '#' of a same-page bookmark).
        category_id = category['href'][1:]
        category_name = category.text.strip()
        category_soup = soup.find('ul', {'id': category_id})

        for link_soup in category_soup.find_all('a'):
            card_soup = link_soup.find('div', {'class': 'card'})
            service_name = card_soup.find('h3').text
            if 'Azure' not in service_name:
                service_name = f'Azure {service_name}'

            # urljoin always yields a value: absolute hrefs are kept as they are,
            # root-relative and relative ones are resolved against the docs URL.
            link = urljoin(AZURE_DOCS_URL, link_soup['href'])

            short_description = card_soup.find('p').text.strip()
            try:
                page_html = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS).text
            except requests.RequestException:
                print("Could not access page: ", link)
                print("Skipping")
                continue
            service_page_soup = BeautifulSoup(page_html, 'html.parser')

            abstract = _extract_abstract(service_page_soup)
            if abstract is None:
                print('Could not get abstract or initial paragraph describing ', service_name)
                abstract = short_description

            azure_services.append({
                'category_id': category_id,
                'category_name': category_name,
                'icon': urljoin(AZURE_DOCS_URL, card_soup.find('img')['src']),
                'name': service_name,
                'short_description': short_description,
                'long_description': abstract,
                'link': link
            })

    # Passing columns= fixes the order and also works when nothing was scraped.
    return pd.DataFrame(azure_services, columns=COLUMNS)
#     azure_services_df.to_csv(output_filepath, index=False)
    
# Run the scraper and keep the returned DataFrame in `s`. The 't.csv' argument only has to
# satisfy the function's '.csv' suffix check; the function returns the frame rather than
# writing it.
s = scrape_azure_services('t.csv')
# Bare last expression: the notebook renders the DataFrame as this cell's output.
s

Unnamed: 0,category_id,category_name,name,short_description,long_description,link,icon
0,Compute,Compute,Azure Linux Virtual Machines,"Provision virtual machines of Ubuntu, Red Hat,...",Azure Linux Virtual Machines provides on-deman...,https://docs.microsoft.com/en-us/azure/virtual...,https://docs.microsoft.com/en-us/azure/media/i...
1,Compute,Compute,Azure Windows Virtual Machines,"Provision virtual machines for SQL Server, Sha...",Azure Windows Virtual Machines provides on-dem...,https://docs.microsoft.com/en-us/azure/virtual...,https://docs.microsoft.com/en-us/azure/media/i...
2,Compute,Compute,Azure App Service,Quickly create powerful cloud apps for web and...,Azure App Service enables you to build and hos...,https://docs.microsoft.com/en-us/azure/app-ser...,https://docs.microsoft.com/en-us/azure/media/i...
3,Compute,Compute,Azure Functions,Process events with serverless code,Azure Functions is a serverless compute servic...,https://docs.microsoft.com/en-us/azure/azure-f...,https://docs.microsoft.com/en-us/azure/media/i...
4,Compute,Compute,Azure Batch,Cloud-scale job scheduling and compute management,Use Batch to run large-scale parallel and high...,https://docs.microsoft.com/en-us/azure/batch/,https://docs.microsoft.com/en-us/azure/media/i...
5,Compute,Compute,Azure Container Instances,Easily run containers with a single command,Azure Container Instances offers the fastest a...,https://docs.microsoft.com/en-us/azure/contain...,https://docs.microsoft.com/en-us/azure/media/i...
6,Compute,Compute,Azure Service Fabric,Develop microservices and orchestrate containe...,Azure Service Fabric is a distributed systems ...,https://docs.microsoft.com/en-us/azure/service...,https://docs.microsoft.com/en-us/azure/media/i...
7,Compute,Compute,Azure Virtual Machine Scale Sets,Manage and scale up to thousands of Linux and ...,Azure virtual machine scale sets let you creat...,https://docs.microsoft.com/en-us/azure/virtual...,https://docs.microsoft.com/en-us/azure/media/i...
8,Compute,Compute,Azure Kubernetes Service (AKS),"Simplify the deployment, management, and opera...",Azure Kubernetes Service (AKS) manages your ho...,https://docs.microsoft.com/en-us/azure/aks/,https://docs.microsoft.com/en-us/azure/media/i...
9,Compute,Compute,Azure Cloud Services,"Create highly-available, infinitely-scalable c...",Learn how to use Cloud Services to host and ru...,https://docs.microsoft.com/en-us/azure/cloud-s...,https://docs.microsoft.com/en-us/azure/media/i...
