### Download images from the World Steel Association website

In [None]:
#import packages to scrape data from the web
import requests #package for sending requests to web servers and downloading html contents
from bs4 import BeautifulSoup #package for parsing html contents
from tqdm import tqdm #package for progress bars

#turn off urllib3 warnings (e.g. the ssl warnings triggered by verify=False requests)
requests.packages.urllib3.disable_warnings()

In [None]:
url = "https://worldsteel.org/about-steel/lovesteel/" #url to scrape
#send request to web server and get response
#verify=False skips ssl certificate verification
#timeout keeps the cell from hanging forever if the server never answers
res = requests.get(url, verify=False, timeout=30)
res.raise_for_status() #stop here on an http error instead of parsing an error page

In [None]:
soup = BeautifulSoup(res.text, "html.parser") #parse the html contents of the response
soup #last expression of the cell: displays the parsed html contents

In [None]:
soup.select(".fact-box > img") #preview the image elements (=targets to scrape) matched by the css selector

In [None]:
#save every matched image to the current working directory
for image in soup.select(".fact-box > img"):
    img_url = image.get("data-src") #image url is read from the data-src attribute
    if not img_url: #element has no data-src -> skip it instead of raising KeyError
        continue
    img_name = img_url.split("/")[-1] #last path segment of the url is used as the file name
    #timeout keeps the loop from hanging forever on one unresponsive download
    img_data = requests.get(img_url, verify=False, timeout=30) #get image data
    if not img_data.ok: #http error -> do not save an error page as an image file
        continue
    #context manager closes the file handle even if the write fails
    with open(img_name, "wb") as img_file:
        img_file.write(img_data.content) #write binary image data

### Scraping HRM (Human Resource Management) glossary data from the SHRM website

In [None]:
import pandas as pd

#set url to scrape : SHRM HR Glossary page (ajax endpoint that returns the word list)
url_for_list = "https://www.shrm.org/ResourcesAndTools/tools-and-samples/HR-Glossary/_layouts/15/SHRM.Core/ajax/AutomatedViewViaSPS.aspx"

#query string parameters sent with the request
#NOTE(review): TimeStamp and Random look like values captured from a browser session - confirm they can stay fixed
params = {
    "ItemUniqueId": "4b37a8f9-2314-477f-ae7a-956782aeef9b",
    "Page": "0",
    "PageSize": "200",
    "PageView": "List",
    "AdSection": "SectionFrontAutomatedBox",
    "AdSectionMobile": "SectionFrontAutomatedBoxMobile",
    "AdSize1W": "300",
    "AdSize1H": "250",
    "AdsCount": "1",
    "TimeStamp": "638348396995574667",
    "Random": "0.5497595531528323"
}

#send request to web server and get response
#timeout keeps the cell from hanging forever if the server never answers
res = requests.get(url_for_list, params=params, verify=False, timeout=30)
res.raise_for_status() #stop here on an http error instead of parsing an error page

#parse html contents
soup = BeautifulSoup(res.text, "html.parser")

#get glossary word elements list with css selector
word_list = soup.select("#pan_Items > div")

#check length of word list
print("total cnt : {}".format(len(word_list)))

In [None]:
#assign empty list to save glossary data
glossary_list = []

#number of processed words, used to stop the loop early
#words skipped for having no link or no description are not counted
cnt = 0

#loop over word list
for word in tqdm(word_list):
    links = word.select("a") #look up the link elements once and reuse the result
    #if the word has no link, skip
    if not links:
        continue
    word_name = links[0].text #get word name
    word_link = links[0]["href"] #get word link
    #only absolute urls can be requested; a prefix check avoids matching
    #relative links that merely contain "https" somewhere in their path
    if word_link.startswith(("http://", "https://")):
        #timeout keeps the loop from hanging forever on one unresponsive page
        res = requests.get(word_link, verify=False, timeout=30) #send request to web server and get response
        soup = BeautifulSoup(res.text, "html.parser") #parse html contents
        desc = soup.select(".shrm-Element-P") #find word description elements
        if not desc: #if word description is empty, skip
            continue
        word_desc = desc[0].text.strip() #first matched element holds the description text

        #append word name, link, description to glossary list
        glossary_list.append({ "name" : word_name, "link" : word_link, "desc" : word_desc })

    #increase cnt
    cnt += 1

    #if cnt is 30, break loop -> to scrape only 30 words for test
    #if you want to scrape all words, remove below if statement
    if cnt == 30:
        break

#convert glossary list to dataframe
df = pd.DataFrame(glossary_list)
df #last expression of the cell: displays the dataframe

In [None]:
#write the dataframe to an excel file in the current working directory, leaving out the index column
df.to_excel("glossary.xlsx", index=False)