## This will be our initial attempt to develop a script to scrape the website of choice.

In [None]:
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

def is_valid(url):
    """
    Tells whether `url` is an absolute URL.

    A URL counts as valid only when it has both a scheme (e.g. "https")
    and a network location (e.g. "www.csulb.edu"); returns a bool.
    """
    parts = urlparse(url)
    if not parts.netloc:
        # no host at all (relative path, "mailto:" link, empty string, ...)
        return False
    return bool(parts.scheme)

def get_all_images(url):
    """
    Collects the absolute URL of every <img> found on the page at `url`.

    Each `src` is resolved against `url`, anything from the first "?" onward
    is dropped, and only URLs accepted by `is_valid` are returned (as a list,
    in page order, duplicates included).
    """
    page = requests.get(url)
    soup = bs(page.content, "html.parser")

    found = []
    for tag in tqdm(soup.find_all("img"), "Extracting images"):
        src = tag.attrs.get("src")
        if src:
            # resolve relative paths against the page URL, then cut off the query string
            absolute, _, _ = urljoin(url, src).partition("?")
            if is_valid(absolute):
                found.append(absolute)
        # <img> tags without a usable src are simply ignored
    return found

def download(url, pathname):
    """
    Downloads the file at `url` and stores it in the folder `pathname`.

    The local file name is the last segment of the URL *path*, so a query
    string ("?w=200") or fragment ("#top") never ends up in the file name.
    The folder is created if it does not exist; an existing file with the
    same name is overwritten.

    Raises:
        ValueError: if the URL path has no final segment (for example it
            ends in "/"), because there is nothing to name the file after.
    """
    # derive the name before touching the network or the disk
    basename = urlparse(url).path.rsplit("/", 1)[-1]
    if not basename:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")
    filename = os.path.join(pathname, basename)

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(pathname, exist_ok=True)

    # stream the body in chunks; the context manager releases the connection
    # even if writing the file fails part-way through
    with requests.get(url, stream=True) as response:
        # 0 means the server did not announce a size
        file_size = int(response.headers.get("Content-Length", 0))
        # progress bar counts bytes (not iterations); an unknown size is passed
        # as None so tqdm shows a running byte count instead of a 0-length bar
        with open(filename, "wb") as f, tqdm(
            desc=f"Downloading {filename}",
            total=file_size or None,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in response.iter_content(1024):
                f.write(data)
                progress.update(len(data))

"""
 "main" is the function that is actually scraping and downloading the images off a specified website
 input: a url and a directory path
 output/result: images are downloaded into specific folder
 everything above is used in main
"""           
def main(url, path):
    """
    Scrapes the page at `url` and saves every image found on it into `path`.

    The image URLs are collected first with `get_all_images`, then each one
    is fetched with `download`.
    """
    for image_url in get_all_images(url):
        download(image_url, path)

"""
example of how to run main to download images      
main("https://www.csulb.edu/college-of-education", "C:\\Users\\hanse\\Documents\\Github\\Algorithmic-Bias\\Web-Scraping\\image-folder")
"""

In [4]:
from bs4 import BeautifulSoup
import requests

def get_embedded_url(url):
    """
    Collects the College of Education links embedded in the page at `url`.

    An anchor is kept when its href contains "college-of-education" and is
    either already on https://www.csulb.edu or site-relative (starts with
    "/"), in which case the domain is prepended. Anchors without an href
    are skipped.

    Returns a list of unique absolute URLs, in the order they first appear
    on the page.
    """
    domain = 'https://www.csulb.edu'

    page = requests.get(url)
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(page.text, "html.parser")

    # dict keys give de-duplication while keeping first-seen order
    observed_links = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        # <a> tags used as named anchors or JS hooks have no href at all
        if not href or 'college-of-education' not in href:
            continue
        if domain in href:
            observed_links[href] = None
        elif href.startswith('/'):
            observed_links[domain + href] = None
    return list(observed_links)

In [None]:
# Level-1 scrape: collect every link embedded in the College of Education
# home page whose URL contains "college-of-education".
# The result is stored in the list `observed_links`.
home_page_url = "https://www.csulb.edu/college-of-education"
observed_links = get_embedded_url(home_page_url)

The next block is for inspection only; the user can skip ahead to the block after it.

In [None]:
# show the raw list of links collected above
display(observed_links)

# how many links did get_embedded_url extract?
display(len(observed_links))

# how many of those are absolute URLs according to is_valid?
count = sum(1 for link in observed_links if is_valid(link))
count

### Image Download
The following code block is where I actually begin to download images.
The loop goes through the list of pages built above, "observed_links", and runs "main" on each URL in it to download the images found on that page.

In [None]:
# folder that receives every downloaded image
image_dir = "C:\\Users\\hanse\\Documents\\Github\\Algorithmic-Bias\\Web-Scraping\\image-folder"
for page_url in observed_links:
    # scrape one page and save all of its images
    main(page_url, image_dir)

The next block is for inspection only; the user can skip ahead to the block after it.

In [29]:
# How many files ended up in the download folder?
dir_path = "C:\\Users\\hanse\\Documents\\Github\\Algorithmic-Bias\\Web-Scraping\\image-folder"
# only regular files are counted; sub-directories are ignored
count = sum(
    1
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)
print('File count:', count)

File count: 82


The process above yields 81 searchable College of Education webpages and 82 image downloads. 1 image is actually repeated 4x so the net total image downloads for the identified webpages is effectively 78.

Next we will try to expand on the list of searchable webpages by running the function "<span style="color:blue">get_embedded_url</span>" for each of the 81 searchable webpages identified above. (Note: The 81 webpages ID'd above are embedded in the College of Education Home page.)

In [6]:
# Level-2 scrape: for each page URL in `observed_links`, extract the embedded
# links that satisfy the criteria coded in `get_embedded_url`.
# `observed_links_lvl2` is a nested list: entry k holds the links found on
# observed_links[k].
observed_links_lvl2 = [get_embedded_url(page_url) for page_url in observed_links]

The next block is for inspection only; the user can skip ahead to the block after it.

In [None]:
# print how many links were found on each level-1 page
for page_links in observed_links_lvl2:
    print(len(page_links))

# peek at the link lists of the first two pages
display(observed_links_lvl2[:2])

Clean up the "level 2" URL scrape data by flattening the nested list, "observed_links_lvl2", and removing repeated URLs.

In [27]:
# flatten observed_links_lvl2 (a list of lists) into a single list of URLs
from itertools import chain
lvl2_flat = list(chain(*observed_links_lvl2))

# drop repeated URLs (the resulting order is arbitrary)
lvl2_flat_unique = list(set(lvl2_flat))

In [29]:
# compare list sizes before and after de-duplication
total_lvl2_links = len(lvl2_flat)
display(total_lvl2_links)
len(lvl2_flat_unique)

749

The following code block removes the URLs in "observed_links" from "lvl2_flat_unique". This is done to fit the scope of the CSULB DST tasks: images from the "observed_links" pages are already present in our repo, and downloading them again takes "a good while", so we remove those pages from the "to-be-downloaded" list.

In [32]:
from collections import Counter
# multiset difference: keep the level-2 URLs that were not already scraped at level 1
already_scraped = Counter(observed_links)
candidates = Counter(lvl2_flat_unique)
to_be_downloaded = list((candidates - already_scraped).elements())
display(len(to_be_downloaded))

668