In [1]:
from bs4 import BeautifulSoup
import requests
import time
import json
import csv
import os

In [2]:
def get_links(search_url=None):
    """
    Collect the document links from every result page of an advanced search.

    Note:
        Go to https://www.presidency.ucsb.edu/advanced-search, specify your criteria, then copy the resulting URL.
        For example, to download State of the Union messages: https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from[date]=01-01-1945&to[date]=12-31-2021&person2=&category2[]=45&category2[]=400&items_per_page=100

    Args:
        search_url: URL of the first result page. When it is None or empty,
            no request is sent and an empty list is returned.

    Returns:
        links: list of absolute document URLs, in the order they appear in the results.

    Raises:
        requests.HTTPError: if a result page answers with an HTTP error status.

    Example:
        link_of_documents = get_links("https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from[date]=01-01-1945&to[date]=12-31-2021&person2=&category2[]=45&category2[]=400&items_per_page=100")
    """
    from urllib.parse import urljoin

    SLEEP_TIME = 2  # pause between consecutive requests, in seconds
    REQUEST_TIMEOUT = 30  # seconds; requests.get waits indefinitely without it
    PREFIX = "https://www.presidency.ucsb.edu"
    links = []  # list of extracted links

    while search_url:
        html_page = requests.get(search_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly: an error page has no result rows and no "next" link,
        # so it would otherwise look like a normal end of pagination.
        html_page.raise_for_status()
        soup = BeautifulSoup(html_page.text, "html.parser")

        # Find every link on the current page; rows without a title link are skipped.
        for block in soup.find_all("tr", {"class": ["even", "odd"]}):
            title_cell = block.find("td", {"class": "views-field-title"})
            anchor = title_cell.find("a") if title_cell is not None else None
            href = anchor.get("href") if anchor is not None else None
            if href:
                # urljoin resolves site-relative hrefs and leaves absolute ones intact.
                links.append(urljoin(PREFIX, href))

        # Follow the pager; the loop ends when there is no next-page link.
        next_page = soup.find("a", {"title": "Go to next page"})
        search_url = urljoin(PREFIX, next_page["href"]) if next_page is not None else None

        # Throttle so the server is not hit with back-to-back requests.
        time.sleep(SLEEP_TIME)

    return links

In [3]:
def _text_of(node, strip=False):
    """Return the text of a parsed HTML node, or "" when the node was not found."""
    if node is None:
        return ""
    return node.text.strip() if strip else node.text


def get_documnets(links: list, PATH=None):
    """
    Download every document in `links` and store its text, raw HTML and metadata.

    Files written (all UTF-8):
        <PATH>/information.csv        one header row, then one row per document
        <PATH>/speaches/<index>.txt   text of the document body
        <PATH>/speaches/<index>.html  raw HTML of the page

    `<index>` is the position of the link in `links`. An existing
    information.csv is overwritten.

    Args:
        links: list of document URLs to download.
        PATH: output directory; defaults to "data" when None.

    Returns:
        None

    Raises:
        requests.HTTPError: if a document page answers with an HTTP error status.

    Note:
        A page element that cannot be found is stored as an empty string
        instead of aborting the whole download.
    """
    SLEEP_TIME = 2  # pause between consecutive requests, in seconds
    REQUEST_TIMEOUT = 30  # seconds; requests.get waits indefinitely without it
    ENCODING = "utf-8"  # explicit, so non-ASCII text does not depend on the locale
    PATH = "data" if PATH is None else PATH  # path of downloaded documents
    HEADER = ["file", "link", "date", "title", "speaker", "citation", "categories"]  # header of csv file

    # "speaches" spelling is kept so the on-disk layout stays unchanged.
    speeches_dir = os.path.join(PATH, "speaches")
    info_path = os.path.join(PATH, "information.csv")

    # Create the output directories if they do not exist yet.
    os.makedirs(speeches_dir, exist_ok=True)

    # CSV file describing every downloaded document.
    with open(info_path, "w", newline="", encoding=ENCODING) as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(HEADER)

        for index, link in enumerate(links):
            html_page = requests.get(link, timeout=REQUEST_TIMEOUT)
            # Fail loudly rather than recording an error page as an empty document.
            html_page.raise_for_status()
            soup = BeautifulSoup(html_page.text, "html.parser")

            # Extract data.
            text = _text_of(soup.find("div", {"class": "field-docs-content"}))
            date = _text_of(soup.find("span", {"class": "date-display-single"}))
            title = _text_of(soup.find("div", {"class": "field-ds-doc-title"}), strip=True)
            speaker = _text_of(soup.find("h3", {"class": "diet-title"}))
            citation = _text_of(soup.find("p", {"class": "ucsbapp_citation"}))
            categories = str([i.text for i in soup.find_all(attrs={"property": "rdfs:label skos:prefLabel"})])
            data = [str(index) + ".txt", link, date, title, speaker, citation, categories]

            # Store data; flush so rows already written survive a later failure.
            writer.writerow(data)
            csvfile.flush()

            with open(os.path.join(speeches_dir, str(index) + ".txt"), "w", encoding=ENCODING) as f:
                f.write(text)

            with open(os.path.join(speeches_dir, str(index) + ".html"), "w", encoding=ENCODING) as f:
                f.write(html_page.text)

            # Throttle so the server is not hit with back-to-back requests.
            time.sleep(SLEEP_TIME)

In [4]:
def get_documnets_with_url(search_url=None, PATH=None):
    """
    Run the whole pipeline: gather the links of a search, then download them.

    Args:
        search_url: advanced-search URL handed to `get_links`.
        PATH: output directory handed to `get_documnets`.
    """
    # Step 1 gathers the links; step 2 downloads each of them.
    get_documnets(links=get_links(search_url=search_url), PATH=PATH)

In [5]:
# Download the documents matched by this advanced-search URL into ./SOTU_data.
get_documnets_with_url(
    search_url='https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from[date]=01-01-1945&to[date]=12-31-2021&person2=&category2[]=45&category2[]=400&items_per_page=100',
    PATH='SOTU_data',
)