<a href="https://colab.research.google.com/github/id-shiv/knowledge_base/blob/master/%5BProject_301%5D_Web_Scraping_with_Beautiful_Soup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [0]:
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

import re
import requests

# Get HTML Page for processing

In [0]:
url = 'https://www.dell.com/en-in/work/shop/povw/openmanage-microsoft'

In [0]:
web_page = urlopen(url=url)
html_page = web_page.read()

In [0]:
page = soup(html_page, 'html.parser')

# Header Text

In [6]:
print(page.h1.text)

Dell EMC OpenManage Integrations for Microsoft System Center


# Paragraph

In [7]:
parah = page.p.text.strip()
parah = re.sub(r'\n+', '\n', parah)
print(parah)

Fiscal Year-End Sale is live now! Savings up to ₹24,999, Free Canon Wi-Fi Printer and 10% Cashback.*. *T&C Apply.
                                            
Discover the offers now
|Question? Call 1800 425 2057 or Click to Chat


# Problems

## Get all Software Download links

In [8]:
# Get all sections
sections = page.find_all('h2')
for section in sections:
  if section.text == 'Software Downloads':
    sw_downloads = section.find_next('p')
    for product in sw_downloads.find_all('a', href=True):
      if product.text[0] != '/':
        print(f'Product : {product.text}\nLink : {product["href"].replace("//", "").replace("www.", "")}')

Product : OpenManage Integration for Microsoft System Center for System Center Operations Manager - Dell Server Management Pack Suite 7.1
Link : dell.com/support/home/us/en/19/Drivers/DriversDetails?driverid=GCVKH
Product : OpenManage Integration for Microsoft System Center v7.1.1 for System Center Operations Manager - Dell EMC Server Management Pack Suite 7.1.1 update, service pack for SCOM 2019
Link : dell.com/support/home/us/en/19/Drivers/DriversDetails?driverid=xpm43
Product : Dell PowerVault MD-Storage Management Pack Suite 6.1
Link : dell.com/support/home/us/en/19/Drivers/DriversDetails?driverid=RGN8T
Product : Dell EqualLogic Management Pack Suite 6.0
Link : dell.com/support/home/us/en/04/Drivers/DriversDetails?driverId=0328N
Product : Dell Printer Management Pack 6.0
Link : dell.com/support/home/us/en/555/Drivers/DriversDetails?driverId=87YW7
Product : Dell Server Deployment Pack Suite 4.1
Link : dell.com/support/home/us/en/19/Drivers/DriversDetails?driverid=26WVJ
Product : Del

## Get Hyper-links

In [0]:
url = 'https://downloads.dell.com/catalog'

In [0]:
def protocol(url):
    return url.split(':')[0] + ':'

def domain_name(url):
    try:
        domain_results = __sub_domain_name(url).split('.')
        return domain_results[-2] + '.' + domain_results[-1]
    except:
        return None

def __sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return None
        
def get_hyper_links(url):
    hyper_links = {}
    page_source = requests.get(url)
    page_source_text = page_source.text
    page = soup(page_source_text, features="lxml")
    for link in page.find_all('a'):
        text = link.text
        href_sub = link.get('href')
        if href_sub:
            if domain_name(url) != domain_name(href_sub):
                href = url + href_sub
            else:
                href = protocol(url) + href_sub
            if text:
                text = text.strip()
                hyper_links[text] = href
            else:
                hyper_links['No Link Text'] = href
        else:
            text = text.strip()
            hyper_links[text] = 'No link'
    return hyper_links

In [11]:
get_hyper_links(url)

{'ASHCI-Catalog.xml.gz': 'https:/catalog/ASHCI-Catalog.xml.gz',
 'ASHCI-Catalog.xml.gz.sha512.sign': 'https:/catalog/ASHCI-Catalog.xml.gz.sha512.sign',
 'Bundle.xsd': 'https:/catalog/Bundle.xsd',
 'Catalog.cab': 'https:/catalog/Catalog.cab',
 'Catalog.gz': 'https:/catalog/Catalog.gz',
 'Catalog.gz.sha512.sign': 'https:/catalog/Catalog.gz.sha512.sign',
 'Catalog.gz.sign': 'https:/catalog/Catalog.gz.sign',
 'Catalog.xml.gz': 'https:/catalog/Catalog.xml.gz',
 'Catalog.xml.gz.sha512.sign': 'https:/catalog/Catalog.xml.gz.sha512.sign',
 'Catalog.xml.gz.sign': 'https:/catalog/Catalog.xml.gz.sign',
 'CatalogIndex.gz': 'https:/catalog/CatalogIndex.gz',
 'CatalogIndex.gz.sha512.sign': 'https:/catalog/CatalogIndex.gz.sha512.sign',
 'CatalogIndex.gz.sign': 'https:/catalog/CatalogIndex.gz.sign',
 'CatalogIndexPC.cab': 'https:/catalog/CatalogIndexPC.cab',
 'CatalogPC.cab': 'https:/catalog/CatalogPC.cab',
 'DRMVersion.tar.gz': 'https:/catalog/DRMVersion.tar.gz',
 'DRMVersion.tar.gz.sign': 'https:/cat

## Dell EMC OpenManage Product Checksum

In [0]:
url = 'https://www.dell.com/support/home/in/en/inbsdt1/drivers/driversdetails?driverid=29wtg'
web_page = urlopen(url=url)
html_page = web_page.read()
page = soup(html_page, 'html.parser')

In [34]:
file_name = ''

containers = page.find_all('div', {'class' : 'my-5'})
description_container = page  # set the description element to page to start with
for row in containers[0].find_all('div', {'class' : 'row'}):
  try:
    if row.b.text == 'File Name:':
      file_name = row.span.text
    if row.b.text == 'Format Description:':
      description_container = row
  except BaseException as be:
    pass

checksum = {}
for row in description_container.find_all('div', {'class' : 'row'}):
  info = row.find_all('div')
  checksum[info[0].text.strip().replace(':', '')] = info[1].text.strip()

print(file_name)
print(checksum)

Dell_EMC_OpenManage_Integration_MS_WAC_1.0.0_133_A00.zip
{'MD5': '07a21475eda29ca618729502992c162d', 'SHA1': '4283d7ddf49eec99b8e960a050b7c3d1fb6edf7e', 'SHA-256': '77ab0418062045050c818d3e02865cf1700842bb14c6a15a514b0a41c297752f'}
