In [31]:
import requests
import pickle
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque

## Start Scraping from some base url of USF

In [32]:
base_urls = [
    "https://www.usf.edu",
    "https://www.stpetersburg.usf.edu",
    "https://www.sarasotamanatee.usf.edu",
    "https://www.usf.edu/about-usf/site-map.aspx",
    "https://usf.campusdish.com",
]

In [33]:
visited_links = set()
queue = deque()
def get_links_from_url(url):
    links = []
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Check if the href is a full URL (either http or https) 
            if href.startswith(("https://")) and 'usf.edu' in href:
                full_url = href
            # If not, try to identify the correct base URL
            else:
                for base in base_urls:
                    if href.startswith(base):
                        full_url = urljoin(base, href)
                        break
                else:
                    # Default to current page's base URL if not matched with known base URLs
                    full_url = urljoin(url, href)
            
            if full_url and full_url not in visited_links and full_url.startswith(("https://")):
                links.append(full_url)
                visited_links.add(full_url)
    except Exception as e:
        print(f"Error fetching {url}: {e}")

    return links

    

def deep_scrape(start_urls):
    all_links = []
    
    for url in start_urls:
        # Add the starting URL to the queue
        queue.append(url)
        visited_links.add(url)
    
    while queue:
        current_url = queue.popleft()
        links = get_links_from_url(current_url)
        all_links.extend(links)
        
        # Add unvisited links to the queue
        for link in links:
            if link not in visited_links:
                queue.append(link)
                visited_links.add(link)
                
    return all_links

## Scraping the first time

In [34]:
urls1 = deep_scrape(base_urls)
print("The number of links found is: ", len(urls1))

303

## Scraping the second time

In [35]:
urls2 = deep_scrape(urls1)
print("The number of links found is: ", len(urls2))

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error fetching https://usf.campusdish.com/ITSupport: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


## Total of links scraped

In [42]:
all_urls = urls1.extend(urls2)
print("The number of unique links found is: ", len(set(all_urls)))

(4244, 4547)

### Perform some tests

In [43]:
def test(urls, test_cases):
    test = test_cases
    numTestCase = 0
    for url in test:
        if url in urls:
            numTestCase += 1
        else:
            print(f"Test case failed: {url}")
    print(f"{numTestCase} out of {len(test)} test cases passed.")

In [48]:
test_cases = ['https://www.usf.edu/about-usf/accreditation.aspx',
'https://www.usf.edu/about-usf',
'https://www.sarasotamanatee.usf.edu/index.aspx',
'https://pandemic-response-research.net/',
'https://www.usf.edu/about-usf/accreditation.aspx',
'https://www.usf.edu/ods/accreditation/accreditation.aspx',
'https://www.usf.edu/about-usf',
'https://www.usf.edu/academics/colleges.aspx',
'https://www.usf.edu/engineering',
'https://www.usf.edu/academics/colleges.aspx',
'https://usf.campusdish.com/ITSupport',
'http://eds.b.ebscohost.com/eds/results?vid=0&sdb=edspub&tid=3000EP%20&bquery=JN+B*+OR+JN+THE+B*+OR+JN+DER+B*+OR+JN+DIE+B*+OR+JN+DAS+B*+OR+JN+LAS+B*+OR+JN+LOS+B*+OR+JN+LES+B*+OR+JN+EL+B*+OR+JN+IL+B*+OR+JN+LA+B*+OR+JN+LE+B*&bdata=JmRiPWVkc3B1YiZ0eXBlPTEmc2l0ZT1lZHMtbGl2ZQ%3d%3d']

In [49]:
test(all_urls, test_cases)

Test case failed: http://eds.b.ebscohost.com/eds/results?vid=0&sdb=edspub&tid=3000EP%20&bquery=JN+B*+OR+JN+THE+B*+OR+JN+DER+B*+OR+JN+DIE+B*+OR+JN+DAS+B*+OR+JN+LAS+B*+OR+JN+LOS+B*+OR+JN+LES+B*+OR+JN+EL+B*+OR+JN+IL+B*+OR+JN+LA+B*+OR+JN+LE+B*&bdata=JmRiPWVkc3B1YiZ0eXBlPTEmc2l0ZT1lZHMtbGl2ZQ%3d%3d
11 out of 12 test cases passed.


## Export the data to a pickle file

In [46]:
def export_urls(all_urls):
    with open("urls.pkl", "wb") as f:
        pickle.dump(all_urls, f)
        print(f"Saved {len(all_urls)} URLs to urls.pkl")

In [47]:
export_urls(all_urls)
print(f"Saved {len(all_urls)} URLs to urls.pkl")

Saved 4547 URLs to urls.pkl
