In [3]:
import json
import logging
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def _first_paragraph_text(container):
    """Return the text of the first ``<p>`` inside ``container``, or ``None``.

    ``container`` may itself be ``None`` (the result of a failed ``find``);
    in that case ``None`` is returned instead of raising.
    """
    if container is None:
        return None
    paragraph = container.find('p')
    return paragraph.text if paragraph is not None else None


def scrape_azure_services(output_filepath, timeout=30):
    """Scrape the Azure documentation landing page into a table of services.

    The landing page is downloaded once. Every category bookmark found in the
    ``<ul id="products">`` list (except the first, see below) is resolved to
    the ``<ul>`` whose id matches the bookmark, and each anchor in that list
    that wraps a ``<div class="card">`` becomes one row. The page behind each
    card is fetched as well to obtain a longer description.

    Parameters
    ----------
    output_filepath : str or path-like
        Only validated here: it must end in ``.csv``. This function does not
        write the file; it returns the DataFrame.
    timeout : float, optional
        Seconds passed to every ``requests.get`` call so that a stalled
        connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        Columns: ``category_id``, ``category_name``, ``name``,
        ``short_description``, ``long_description``, ``link``, ``icon``.
        The frame has these columns even when nothing was scraped.

    Raises
    ------
    AssertionError
        If ``output_filepath`` does not end in ``.csv``.
    RuntimeError
        If the landing page has no ``<ul id="products">`` element.
    """
    # str() lets pathlib.Path values through; a bare .endswith would fail on them.
    assert str(output_filepath).endswith('.csv')

    AZURE_DOCS_URL = 'https://docs.microsoft.com/en-us/azure/'
    COLUMNS = [
        'category_id', 'category_name', 'name', 'short_description', 'long_description', 'link', 'icon'
    ]

    landing_html = requests.get(AZURE_DOCS_URL + '#pivot=products', timeout=timeout).text
    soup = BeautifulSoup(landing_html, 'html.parser')

    products_list = soup.find('ul', {'id': 'products'})
    if products_list is None:
        raise RuntimeError(f'No <ul id="products"> element found at {AZURE_DOCS_URL}')

    categories_soup = products_list.find_all('a', {'data-linktype': 'self-bookmark'})
    # The first bookmark is skipped, as before.
    # NOTE(review): presumably it is not a real category -- confirm against the page.
    categories_soup = categories_soup[1:]

    azure_services = []
    for category in categories_soup:
        category_id = category['href'][1:]  # drop the first character of the bookmark href
        category_name = category.text.strip()

        category_soup = soup.find('ul', {'id': category_id})
        if category_soup is None:
            print('Could not find service list for category: ', category_id)
            continue

        for link_soup in category_soup.find_all('a'):
            card_soup = link_soup.find('div', {'class': 'card'})
            heading = card_soup.find('h3') if card_soup is not None else None
            href = link_soup.get('href')
            if heading is None or not href:
                # Anchor is not a complete service card; nothing to record.
                continue

            service_name = heading.text.strip()
            if 'Azure' not in service_name:
                service_name = f'Azure {service_name}'

            # urljoin returns absolute hrefs unchanged and resolves relative
            # ones. Previously `link` was only assigned for non-https hrefs,
            # so an absolute href left it unbound or reused the previous
            # card's link.
            link = urljoin(AZURE_DOCS_URL, href)

            short_description = (_first_paragraph_text(card_soup) or '').strip()

            try:
                response = requests.get(link, timeout=timeout)
            except requests.RequestException:
                print("Could not access page: ", link)
                print("Skipping")
                continue
            service_page_soup = BeautifulSoup(response.text, 'html.parser')

            # Prefer the dedicated abstract block, then the first paragraph of
            # <main>, and finally fall back to the card's short description.
            abstract = _first_paragraph_text(service_page_soup.find('div', {'class': 'abstract'}))
            if abstract is None:
                abstract = _first_paragraph_text(service_page_soup.find('main'))
            if abstract is None:
                print('Could not get abstract or initial paragraph describing ', service_name)
                abstract = short_description

            icon_tag = card_soup.find('img')
            if icon_tag is not None and icon_tag.get('src'):
                icon = urljoin(AZURE_DOCS_URL, icon_tag['src'])
            else:
                icon = ''

            azure_services.append({
                'category_id': category_id,
                'category_name': category_name,
                'icon': icon,
                'name': service_name,
                'short_description': short_description,
                'long_description': abstract,
                'link': link
            })

    # Passing `columns` fixes the column order and keeps the frame well-formed
    # when no service was scraped (indexing an empty frame raised KeyError).
    return pd.DataFrame(azure_services, columns=COLUMNS)
#     azure_services_df.to_csv(output_filepath, index=False)
    
# Run the docs scraper and show the resulting table. The 't.csv' argument only
# has to satisfy the function's '.csv' suffix check.
s = scrape_azure_services('t.csv')
s

In [5]:
# Root of the Azure marketing site; later cells prefix scraped hrefs with it.
base_url = 'https://azure.microsoft.com'

# Download the services index once, then keep both the parsed page and the
# element with id "products-list" for the cells that follow.
services_index_html = requests.get(f'{base_url}/services').text
soup = BeautifulSoup(services_index_html, 'html.parser')
products_soup = soup.find('div', {'id': 'products-list'})

In [104]:
def _find_docs_button(svc_soup):
    """Locate the navigation element that leads to a service's documentation.

    Tries, in order:

    1. the "Documentation" entry inside ``<nav class="sub-nav">``;
    2. the last ``<a class="external-link">`` inside ``<nav id="global-subnav">``;
    3. the "Developer Guide" entry inside ``<nav class="sub-nav">``.

    Returns the matching tag, or ``None`` when none of the three is present.
    """
    sub_nav = svc_soup.find('nav', {'class': 'sub-nav'})
    global_nav = svc_soup.find('nav', {'id': 'global-subnav'})

    if sub_nav is not None:
        label = sub_nav.find(text='Documentation')
        if label is not None:
            return label.parent

    if global_nav is not None:
        # attrs must be a dict. The previous set literal {'class', 'external-link'}
        # was not an attribute filter for class="external-link".
        external_links = global_nav.find_all('a', {'class': 'external-link'})
        if external_links:
            return external_links[-1]

    if sub_nav is not None:
        label = sub_nav.find(text='Developer Guide')
        if label is not None:
            return label.parent

    return None


# Build one record per service listed on the services index page. Each
# 'row-size3' div holds a category heading; the 'row-size2' divs that follow
# it (until the next non-'row-size2' sibling) hold that category's services.
services = []

for cat in products_soup.find_all('div', {'class': 'row-size3'}):
    cat_heading = cat.find('h2', {'class': 'product-category'})
    cat_id = cat_heading['id']
    cat_name = cat_heading.text.strip()

    cat_anchor = cat.find('a')
    if cat_anchor is not None and cat_anchor.has_attr('href'):
        cat_link = f"{base_url}{cat_anchor['href']}"
    else:
        cat_link = ''

    # find_next_sibling() returns the next *tag*, so the walk no longer depends
    # on exactly one whitespace text node sitting between rows and cannot fail
    # with AttributeError when a sibling is missing.
    nextNode = cat.find_next_sibling()
    while nextNode is not None and 'row-size2' in (nextNode.get('class') or []):
        # NOTE(review): the three lists are assumed to be parallel (one h2, one
        # a and one p per service) -- confirm against the page markup.
        names = [h2.text.strip() for h2 in nextNode.find_all('h2')]
        links = [a['href'] for a in nextNode.find_all('a')]
        descs = [p.text.strip() for p in nextNode.find_all('p')]

        for name, href, short_desc in zip(names, links, descs):
            link = f"{base_url}{href}"
            svc_soup = BeautifulSoup(requests.get(link).text, 'html.parser')

            # Fall back to the short description when the page has no
            # <meta name="description" content="..."> tag.
            meta = svc_soup.find('meta', {'name': 'description'})
            if meta is not None and meta.has_attr('content'):
                long_desc = meta['content']
            else:
                long_desc = short_desc

            docs_btn = _find_docs_button(svc_soup)
            if docs_btn is None:
                # Services without any documentation entry are left out, as
                # before, but the skip is now reported.
                print('Skipping (no documentation link found): ', link)
                continue

            # NOTE: documentation_links is a str when the button itself carries
            # the href and a list when the links come from the element after
            # it; both shapes are kept for compatibility with existing output.
            if docs_btn.get('href'):
                documentation_links = docs_btn['href']
            else:
                print('no docs link')
                print(docs_btn.nextSibling)
                docs_menu = docs_btn.find_next_sibling()
                if docs_menu is not None:
                    documentation_links = [a['href'] for a in docs_menu.find_all('a', href=True)]
                else:
                    documentation_links = []

            services.append({
                'category_id': cat_id,
                'category_name': cat_name,
                'category_link': cat_link,
                'name': name,
                'link': link,
                'short_description': short_desc,
                'long_description': long_desc,
                'icon': '',
                'documentation_links': documentation_links
            })

        nextNode = nextNode.find_next_sibling()
services

https://azure.microsoft.com/en-us/services/batch-ai/
https://azure.microsoft.com/en-us/services/bot-service/
https://azure.microsoft.com/en-us/services/databricks/
https://azure.microsoft.com/en-us/services/search/
https://azure.microsoft.com/en-us/services/cognitive-services/autosuggest/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-custom-search/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-entity-search-api/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-image-search-api/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-news-search-api/
https://azure.microsoft.com/en-us/services/cognitive-services/spell-check/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-video-search-api/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-visual-search/
https://azure.microsoft.com/en-us/services/cognitive-services/bing-web-search-api/
https://azure.microsoft.com/en-us/services/cogni

[{'category_id': 'ai-machine-learning',
  'category_name': 'AI + Machine Learning',
  'category_link': 'https://azure.microsoft.com/en-us/overview/ai-platform/',
  'name': 'Azure Batch AI',
  'link': 'https://azure.microsoft.com/en-us/services/batch-ai/',
  'short_description': 'Easily experiment and train your deep learning and AI models in parallel at scale',
  'long_description': 'Train Deep Learning and other AI models with Batch AI in parallel, using any framework. Azure Batch AI handles the heavy lifting for easy experimentation, at scale.',
  'icon': '',
  'documentation_links': 'https://go.microsoft.com/fwlink/?LinkId=859748&clcid=0x409'},
 {'category_id': 'ai-machine-learning',
  'category_name': 'AI + Machine Learning',
  'category_link': 'https://azure.microsoft.com/en-us/overview/ai-platform/',
  'name': 'Azure Bot Service',
  'link': 'https://azure.microsoft.com/en-us/services/bot-service/',
  'short_description': 'Intelligent, serverless bot service that scales on demand'

In [17]:
# Fetch a single service page for manual inspection. This rebinds `soup`.
ml_service_url = 'https://azure.microsoft.com/en-us/services/machine-learning-service/'
soup = BeautifulSoup(requests.get(ml_service_url).text, 'html.parser')

In [27]:
# Find the "Overview" text inside <main id="main"> and display the element
# three levels above it.
overview_label = soup.find('main', {'id': 'main'}).find(text='Overview')
overview_label.parent.parent.parent

<div class="row">
<div class="column medium-offset-2 medium-10">
<h2>Overview</h2>
</div>
</div>