get hyperlinks from one page

In [50]:
'https://registry.hkust.edu.hk/#main-content' in all_absolute_link

True

In [3]:
url = "https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust"
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"Scraping content from: {url}")
    # You can process and save the content here if needed
    # For example: content = soup.get_text()
    
    links = soup.find_all('a', href=True)
    all_absolute_link = []
    for link in links:
        absolute_link = urljoin(url, link['href'])
        all_absolute_link.append(absolute_link)
        print(absolute_link)

Scraping content from: https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust
https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust#main-content
https://www.ust.hk/news
https://www.ust.hk/academics/list
https://www.ust.hk/lifehkust
http://library.ust.hk/
https://www.ust.hk/map-directions
https://hkustcareers.ust.hk
https://facultyprofiles.ust.hk/
https://www.ust.hk/about
https://registry.hkust.edu.hk/about-aro
https://registry.hkust.edu.hk/faculty-staff
https://registry.hkust.edu.hk/glossary/glossary-terms
https://registry.hkust.edu.hk/contact-us
https://sisprod.psft.ust.hk/SISPROD/signon.html
https://www.ust.hk
https://registry.hkust.edu.hk/
https://registry.hkust.edu.hk/resource-library
https://registry.hkust.edu.hk/useful-tools
https://registry.hkust.edu.hk/quality-assurance-enhancement/overview
https://registry.hkust.edu.hk/key-dates/upcoming-key-dates
https://www.ust.hk
https://registry.hkust.edu.hk/
https://registry.hkust.edu.hk/resource-l

### Verticle scrape

In [14]:
class Webpage:
    def __init__(self, url, path_to_top):
        self.url = url
        self.path_to_top = path_to_top

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def scrape_links(url, all_webpages=[], max_depth=3, path_to_top=[], last_layer_url=""):
    if len(path_to_top) > max_depth:
        return all_webpages
    
    response = requests.get(url)
    if response.status_code != 200:
        return all_webpages

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)
    path_to_top.append(url)

    print("\n")
    print(f"layer depth: {len(path_to_top)}")
    print(f"this url: {url}")
    print(f"last_layer_url: {last_layer_url}")
    print(f"path_to_top: {path_to_top}")
    
    for link in links:
        absolute_link = urljoin(url, link['href'])
        if '#main-content' in absolute_link:
            continue
        if len(all_webpages) > 1000:
            break
        if absolute_link not in [webpage.url for webpage in all_webpages]:
            print(f"Found link: {absolute_link}")
            webpage = Webpage(absolute_link, path_to_top.copy())
            all_webpages.append(webpage)
            scrape_links(absolute_link, all_webpages, max_depth, path_to_top.copy(),
                         last_layer_url=url)

    return all_webpages

In [9]:
# Starting URL
start_url = 'https://registry.hkust.edu.hk/'
# Scraping links
webpages = scrape_links(start_url, max_depth=3)



layer depth: 1
this url: https://registry.hkust.edu.hk/
last_layer_url: 
path_to_top: ['https://registry.hkust.edu.hk/']
Found link: https://www.ust.hk/news


layer depth: 2
this url: https://www.ust.hk/news
last_layer_url: https://registry.hkust.edu.hk/
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news']
Found link: https://www.ust.hk/news/announcements


KeyboardInterrupt: 

In [67]:
for i, webpage in enumerate(webpages):
    print(f"#{i}")
    print(f"url: {webpage.url}")
    print(f"path_to_top: {webpage.path_to_top}")

#0
url: https://www.ust.hk/news
path_to_top: ['https://registry.hkust.edu.hk/']
#1
url: https://www.ust.hk/news/announcements
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news']
#2
url: https://www.ust.hk/student
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news', 'https://www.ust.hk/news/announcements']
#3
url: https://www.ust.hk/faculty-and-staff
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news', 'https://www.ust.hk/news/announcements', 'https://www.ust.hk/student']
#4
url: https://www.ust.hk/media
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news', 'https://www.ust.hk/news/announcements', 'https://www.ust.hk/student']
#5
url: https://hkust-gz.edu.cn/
path_to_top: ['https://registry.hkust.edu.hk/', 'https://www.ust.hk/news', 'https://www.ust.hk/news/announcements', 'https://www.ust.hk/student']
#6
url: https://shaw-auditorium.hkust.edu.hk/
path_to_top: ['https://registry.hkust.edu.hk/', 'http

### Paralell scrape

In [6]:
class Webpage:
    def __init__(self, url, path_to_top):
        self.url = url
        self.path_to_top = path_to_top

In [14]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_links2(start_url, max_depth=3):
    all_webpages = []
    seen_url = []
    que = [Webpage(start_url,[])]
    
    for depth in range(max_depth + 1):
        next_que = []
        indent = ""
        for i in range(depth+1):
            indent+= "   "
        for i, current_page in enumerate(que):
            if len(all_webpages) > 1000:
                break
            
            current_url = current_page.url
            path_to_top = current_page.path_to_top + [current_url]
            try:
                response = requests.get(current_url)
            except:
                continue
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            links = soup.find_all('a', href=True)
            link_count = 0
            print("\n")
            print(f"{indent}layer depth: {depth}")
            print(f"{indent}path_to_top: {path_to_top}")
            print(f"{indent}Total links: {len(all_webpages)}  Que: {len(que)-i}")
            for link in links:
                absolute_link = urljoin(current_url, link['href'])
                if '#main-content' in absolute_link:
                    continue
                if "resource-library" not in absolute_link:
                    continue
                if link_count > 10:
                    break
                if absolute_link not in seen_url:
                    print(f"{indent}    Found #{link_count} link: {absolute_link}")
                    webpage = Webpage(absolute_link, path_to_top)
                    all_webpages.append(webpage)
                    next_que.append(webpage)
                    seen_url.append(absolute_link)
                    link_count+=1

        que = next_que

    return all_webpages

In [17]:
# Starting URL
start_url = 'https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust'
# Scraping links
webpages = scrape_links2(start_url, max_depth=5)



   layer depth: 0
   path_to_top: ['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
   Total links: 0  Que: 1
       Found #0 link: https://registry.hkust.edu.hk/resource-library
       Found #1 link: https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust
       Found #2 link: https://registry.hkust.edu.hk/resource-library/academic-regulations-governing-ug-studies
       Found #3 link: https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust#pages
       Found #4 link: https://registry.hkust.edu.hk/resource-library/program-catalog
       Found #5 link: https://registry.hkust.edu.hk/resource-library/course-catalog
       Found #6 link: https://registry.hkust.edu.hk/resource-library/undergraduate-student-guide
       Found #7 link: https://registry.hkust.edu.hk/resource-library/academic-honor-code-and-academic-integrity
       Found #8 link: https://registry.hkust.edu.hk/resource-library/academic-achievement-medal
     

In [23]:
for i, webpage in enumerate(webpages):
    print(f"# {i}")
    print(webpage.url)
    print(webpage.path_to_top)
    print(" ")


# 0
https://registry.hkust.edu.hk/resource-library
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 1
https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 2
https://registry.hkust.edu.hk/resource-library/academic-regulations-governing-ug-studies
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 3
https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust#pages
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 4
https://registry.hkust.edu.hk/resource-library/program-catalog
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 5
https://registry.hkust.edu.hk/resource-library/course-catalog
['https://registry.hkust.edu.hk/resource-library/undergraduate-studies-hkust']
 
# 6
https://registry.hkust.edu.hk/resource-library/undergraduate-student-

Save webpages links

In [19]:
import pickle
with open('hkust_resource-library_links.pkl', 'wb') as f:  # open a text file
    pickle.dump(webpages, f) # serialize the list

In [21]:
with open('hkust_resource-library_links.pkl', 'rb') as f:
    webpages = pickle.load(f)
