In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def gather_project_documents(year, url, timeout=30):
    """Scrape one projects page into a DataFrame with one row per project.

    Parameters
    ----------
    year : int
        Year the page belongs to. It selects the page layout: for
        ``year >= 2014`` projects are ``<div class="cproject">`` elements,
        otherwise they are ``<article>`` elements.
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection raises instead
        of blocking the whole scrape forever. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``borough`` (text of the nearest preceding ``<h2>``,
        or ``None`` when there is none), ``name`` (text of the project's first
        ``<h3>``), ``document_links`` (list of raw ``href`` values found inside
        descendants with class ``arr``) and ``source_url``. The frame is empty
        when no project with an ``<h3>`` is found.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-2xx response
        (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # The page markup differs between the older and newer yearly pages.
    if year >= 2014:
        projects = soup.find_all("div", class_="cproject")
    else:
        projects = soup.find_all("article")

    data = []
    for proj in projects:
        h3 = proj.find("h3")
        if h3 is None:
            # Containers without a heading are not treated as projects.
            continue

        # Every link that sits inside a descendant carrying class="arr".
        links = [
            a["href"]
            for container in proj.find_all(class_="arr")
            for a in container.find_all("a", href=True)
        ]

        # Nearest <h2> before this project in document order; find_previous
        # stops at the first match instead of collecting every earlier tag.
        section = proj.find_previous("h2")

        data.append({
            "year": year,
            "borough": section.get_text(strip=True) if section is not None else None,
            "name": h3.get_text(strip=True),
            "document_links": links,
            "source_url": url
        })

    return pd.DataFrame(data)


# Scraped frames, keyed by the year rendered as a string.
data_dict = {}

URL_TEMPLATE = "https://www.nyc.gov/html/dot/html/about/{final_part}.shtml"

for year in range(2007, 2026):
    # 2025 is read from the "current-projects" page; every other year has
    # its own "projects-<year>" page.
    final_part = 'current-projects' if year == 2025 else f'projects-{year}'
    url = URL_TEMPLATE.format(final_part=final_part)

    # Best effort: a failing year is reported and skipped so the remaining
    # years are still collected.
    try:
        year_data = gather_project_documents(year, url)
        data_dict[str(year)] = year_data
        print(f'\t{year}: {year_data.shape}!')
    except Exception as e:
        print(f'\t{year}: {e}')
    


	2007: (18, 5)!
	2008: (23, 5)!
	2009: (14, 5)!
	2010: (32, 5)!
	2011: (31, 5)!
	2012: (40, 5)!
	2013: (46, 5)!
	2014: (51, 5)!
	2015: (47, 5)!
	2016: (71, 5)!
	2017: (45, 5)!
	2018: (63, 5)!
	2019: (66, 5)!
	2020: (49, 5)!
	2021: (56, 5)!
	2022: (35, 5)!
	2023: (60, 5)!
	2024: (42, 5)!
	2025: (152, 5)!


In [None]:
# Shape of each per-year frame. Evaluated but not shown, since it is not the
# last expression of the cell.
[(year_key, frame.shape) for year_key, frame in data_dict.items()]

# Stack the per-year frames into one frame with a fresh running index.
projects_df = pd.concat(data_dict, ignore_index=True)  # 941 projects

# Optional duplicate check:
#projects_df[['year','name']].value_counts()

# One entry per document link (result is not kept).
projects_df['document_links'].explode()  # 1,813 documents

# The running index is written as the first CSV column.
projects_df.to_csv('../data/project_documents/projects_df.csv')



In [None]:
# Now actually download the documents
# NOTE: the record array built on the next line is neither assigned nor the
# cell's last expression, so its result is discarded.
projects_df.to_records()

import os

def download_pdf_docs(rec, base_path='../data/project_documents/', timeout=60):
    """Download every document linked from one project row into its own folder.

    Parameters
    ----------
    rec : row-like
        One project row. ``rec.name`` is used as the project index,
        ``rec['name']`` as the project title and ``rec['document_links']``
        as the sequence of document URLs.
    base_path : str, optional
        Directory under which the per-project folder is created. It is
        created too if it does not exist yet.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so one stalled download cannot block
        the remaining ones forever. Defaults to 60 seconds.

    Returns
    -------
    None
        Files are written to
        ``<base_path>/<idx>--<name>/<idx>--<position>--<last URL segment>``,
        where ``/`` in the project name is replaced by ``--``. A failed
        download is reported on stdout and skipped; the closing summary line
        counts only the documents that were actually written.
    """
    idx = rec.name
    name = rec['name'].replace('/', '--')  # '/' would otherwise nest folders
    doc_urls = rec['document_links']

    # makedirs also creates missing parents and tolerates an existing folder,
    # so a rerun or a fresh base_path does not fail.
    proj_dirpath = os.path.join(base_path, f'{idx}--{name}')
    os.makedirs(proj_dirpath, exist_ok=True)

    written = 0
    for i, doc_url in enumerate(doc_urls):
        docname = f"{idx}--{i}--{doc_url.split('/')[-1]}"

        # Best effort per document: report the failure and keep going.
        try:
            r = requests.get(doc_url, timeout=timeout)
            r.raise_for_status()
            with open(os.path.join(proj_dirpath, docname), 'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(f'\tFailed to download {idx}: {doc_url}: {e}')
        else:
            written += 1

    print(f"Proj: {idx}: {written} Doc{'s' if written != 1 else ''} Written!")

#download_pdf_docs(projects_df.iloc[0])
#projects_df.iloc[849:].apply(download_pdf_docs, axis=1)
#projects_df.apply(download_pdf_docs, axis=1)

Proj: 849: 6 Docs Written!
Proj: 850: 2 Docs Written!
Proj: 851: 1 Doc Written!
Proj: 852: 1 Doc Written!
Proj: 853: 1 Doc Written!
Proj: 854: 17 Docs Written!
Proj: 855: 2 Docs Written!
Proj: 856: 6 Docs Written!
Proj: 857: 2 Docs Written!
Proj: 858: 1 Doc Written!
Proj: 859: 8 Docs Written!
Proj: 860: 6 Docs Written!
Proj: 861: 2 Docs Written!
Proj: 862: 1 Doc Written!
Proj: 863: 1 Doc Written!
Proj: 864: 2 Docs Written!
Proj: 865: 2 Docs Written!
Proj: 866: 2 Docs Written!
Proj: 867: 6 Docs Written!
Proj: 868: 1 Doc Written!
Proj: 869: 1 Doc Written!
Proj: 870: 1 Doc Written!
Proj: 871: 3 Docs Written!
Proj: 872: 1 Doc Written!
Proj: 873: 10 Docs Written!
Proj: 874: 1 Doc Written!
Proj: 875: 3 Docs Written!
Proj: 876: 1 Doc Written!
Proj: 877: 2 Docs Written!
Proj: 878: 2 Docs Written!
Proj: 879: 1 Doc Written!
Proj: 880: 1 Doc Written!
Proj: 881: 2 Docs Written!
Proj: 882: 1 Doc Written!
Proj: 883: 1 Doc Written!
Proj: 884: 1 Doc Written!
Proj: 885: 1 Doc Written!
Proj: 886: 1 Doc 

849    None
850    None
851    None
852    None
853    None
       ... 
936    None
937    None
938    None
939    None
940    None
Length: 92, dtype: object