In [27]:
from bs4 import BeautifulSoup
import requests
import time
import json
from tqdm import tqdm
import csv
import os

In [5]:
def get_links(search_url=None):
    """
    Collect the document links listed on every page of a search result.

    Note:
        Go to https://www.presidency.ucsb.edu/advanced-search, specify your
        search, then copy the resulting URL.
        For example, to download State of the Union messages: https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from[date]=01-01-1945&to[date]=12-31-2021&person2=&category2[]=45&category2[]=400&items_per_page=100

    Args:
        search_url: URL of the first result page. When it is None or empty,
            no request is made and an empty list is returned.

    Returns:
        links: list of extracted links (absolute URLs), in page order.

    Raises:
        requests.HTTPError: if a result page answers with an error status.
        requests.RequestException: on connection problems or timeouts.

    Example:
        link_of_documents = get_links("https://www.presidency.ucsb.edu/advanced-search?field-keywords=&field-keywords2=&field-keywords3=&from[date]=01-01-1945&to[date]=12-31-2021&person2=&category2[]=45&category2[]=400&items_per_page=100")
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    SLEEP_TIME = 2  # time between queries, in seconds
    REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection
    PREFIX = "https://www.presidency.ucsb.edu"
    links = []  # list of extracted links

    while search_url:
        page = requests.get(search_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of silently parsing it as "no results".
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")

        for block in soup.find_all("tr", {"class": ["even", "odd"]}):
            title_cell = block.find("td", {"class": "views-field-title"})
            anchor = title_cell.find("a") if title_cell is not None else None
            # Rows without a title link carry no document; skip them
            # rather than crashing on None.
            if anchor is None or not anchor.get("href"):
                continue
            # urljoin equals PREFIX + href for root-relative hrefs and also
            # copes with hrefs that are already absolute.
            links.append(urljoin(PREFIX, anchor["href"]))

        next_anchor = soup.find("a", {"title": "Go to next page"})
        if next_anchor is not None and next_anchor.get("href"):
            search_url = urljoin(PREFIX, next_anchor["href"])
        else:
            search_url = None

        # Only pause when another request is actually going to follow.
        if search_url:
            time.sleep(SLEEP_TIME)

    return links

In [53]:
def _text_or_empty(node):
    """Return ``node.text``, or an empty string when the element was not found."""
    return "" if node is None else node.text


def get_documnets(links: list, PATH=None):
    """
    Download every document in ``links`` and store its text, raw HTML and metadata.

    For the document at position ``i`` in ``links`` this writes
    ``<PATH>/<i>.txt`` (extracted body text) and ``<PATH>/<i>.html`` (the page
    as received), and appends one metadata row to ``information.csv`` in the
    current working directory. ``information.csv`` is overwritten on every call.
    All files are written as UTF-8.

    Args:
        links: list of document URLs, e.g. the result of ``get_links``.
        PATH: output directory for the .txt/.html files. Defaults to
            "speaches". It is created if it does not exist.

    Returns:
        None.

    Raises:
        requests.HTTPError: if a document page answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    SLEEP_TIME = 2  # time between queries, in seconds
    REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection
    HEADER = ["file", "link", "date", "title", "speaker", "citation", "categories"]

    PATH = "speaches" if PATH is None else PATH

    os.makedirs(PATH, exist_ok=True)

    last_index = len(links) - 1

    # The CSV is opened once for the whole run; an explicit encoding keeps
    # non-ASCII characters in titles/citations from failing on platforms whose
    # default encoding is not UTF-8.
    with open('information.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(HEADER)

        for index, link in enumerate(links):
            page = requests.get(link, timeout=REQUEST_TIMEOUT)
            # Fail loudly on an error page instead of saving it as a document.
            page.raise_for_status()
            soup = BeautifulSoup(page.text, "html.parser")

            # A missing element yields an empty field rather than an
            # AttributeError that would abort the whole download.
            text = _text_or_empty(soup.find("div", {"class": "field-docs-content"}))
            date = _text_or_empty(soup.find("span", {"class": "date-display-single"}))
            title = _text_or_empty(soup.find("div", {"class": "field-ds-doc-title"})).strip()
            speaker = _text_or_empty(soup.find("h3", {"class": "diet-title"}))
            citation = _text_or_empty(soup.find("p", {"class": "ucsbapp_citation"}))
            # Stored as the string form of a Python list, e.g. "['A', 'B']".
            categories = str([
                label.text
                for label in soup.find_all(attrs={"property": "rdfs:label skos:prefLabel"})
            ])

            file_stem = str(index)
            writer.writerow([file_stem + ".txt", link, date, title, speaker, citation, categories])
            # Flush per row so the metadata already collected survives a crash.
            csvfile.flush()

            with open(os.path.join(PATH, file_stem + ".txt"), 'w', encoding='utf-8') as f:
                f.write(text)

            with open(os.path.join(PATH, file_stem + ".html"), 'w', encoding='utf-8') as f:
                f.write(page.text)

            # Only pause when another request is actually going to follow.
            if index < last_index:
                time.sleep(SLEEP_TIME)
