In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import urllib.parse

# Initial setup
# Listing page the crawl starts from; following pages are found via the pager's "next" link.
base_url = "https://www.irs.gov/forms-instructions-and-publications?page=5"
download_base_url = "https://www.irs.gov"  # The base URL to prepend to the href paths
download_folder = "pdf_downloads"  # Directory that receives the downloaded PDF files
output_file = 'pdf_links.txt'  # Text file that collects one absolute PDF URL per line

# Create the download folder. exist_ok=True replaces the check-then-create pair,
# which could race with another process and made the intent harder to read.
os.makedirs(download_folder, exist_ok=True)

# Function to extract href links from the current page
def extract_links(soup):
    """Collect the href of the first-cell link from each data row of the page's first table.

    Args:
        soup: Parsed page; a BeautifulSoup document or any object offering a
            compatible ``find`` method.

    Returns:
        list: href values in row order, exactly as they appear in the markup
        (they are not resolved to absolute URLs here). Empty when the page
        contains no table.
    """
    table = soup.find('table')  # Only the first table on the page is considered
    if table is None:
        # Error pages or a changed layout have no table; report "no links"
        # instead of failing with AttributeError on None.find_all.
        return []

    links = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        first_cell = row.find('td')
        if first_cell is None:
            continue
        anchor = first_cell.find('a', href=True)
        if anchor is not None:
            links.append(anchor['href'])
    return links

# Function to get the next page's URL
def get_next_page_url(soup, base_url):
    """Return the absolute URL behind the pager's "next" item, or None.

    Args:
        soup: Parsed page to search for the ``<li>`` whose class attribute is
            ``pager__item pager__item--next``.
        base_url: URL the link's href is joined against.

    Returns:
        The joined URL, or None when the pager item or its link is absent.
    """
    pager_item = soup.find('li', class_='pager__item pager__item--next')
    anchor = pager_item.find('a', href=True) if pager_item else None
    if not anchor:
        return None
    return urllib.parse.urljoin(base_url, anchor['href'])

# Function to download the PDF file
def download_pdf(pdf_url, download_folder, timeout=30):
    """Download one file and store it in download_folder under its URL file name.

    Args:
        pdf_url: Absolute URL of the file to fetch.
        download_folder: Directory the file is written to (created if missing).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook cell indefinitely.

    Returns:
        The path of the saved file, or None when nothing was saved (the URL
        carries no file name, or the server answered with an error status).

    Raises:
        requests.RequestException: on connection problems or timeouts.
    """
    # Take the name from the URL *path* only, so a query string or fragment
    # never ends up in the file name.
    name = os.path.basename(urllib.parse.urlparse(pdf_url).path)
    if not name:
        # e.g. a URL ending in "/": joining an empty name would target the folder itself.
        print(f"Skipped (no file name in URL): {pdf_url}")
        return None

    response = requests.get(pdf_url, timeout=timeout)
    if not response.ok:
        # Do not save an HTML error page under a .pdf name.
        print(f"Failed ({response.status_code}): {pdf_url}")
        return None

    os.makedirs(download_folder, exist_ok=True)
    filename = os.path.join(download_folder, name)
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")
    return filename

# Scrape all pages
current_url = base_url
while current_url:
    # The timeout keeps a stalled connection from hanging the cell; raise_for_status
    # stops the crawl on an HTTP error instead of parsing an error page as a listing.
    response = requests.get(current_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    links = extract_links(soup)

    # Write the links to the file after each page. This is done before downloading
    # so the file handle is not held open during the slow transfers.
    full_links = [urllib.parse.urljoin(download_base_url, link) for link in links]
    with open(output_file, 'a') as f:
        f.writelines(f"{full_link}\n" for full_link in full_links)

    for full_link in full_links:
        try:
            download_pdf(full_link, download_folder)
        except (requests.RequestException, OSError) as exc:
            # One unreachable or unwritable file should not abort the whole crawl.
            print(f"Failed to download {full_link}: {exc}")

    print(f"Extracted {len(links)} links from {current_url}.")

    # Resolve the pager link against the page it was found on, not the start page.
    next_url = get_next_page_url(soup, current_url)

    if next_url:
        # Add a random sleep interval between page requests to avoid being detected
        # as a bot. Skipped after the last page, where there is nothing left to fetch.
        sleep_time = random.uniform(10, 50)  # Random sleep time between 10 and 50 seconds
        print(f"Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)

    current_url = next_url

print("Scraping and downloading completed.")


Downloaded: pdf_downloads\p571.pdf
Downloaded: pdf_downloads\p575.pdf
Downloaded: pdf_downloads\p583.pdf
Downloaded: pdf_downloads\p584.pdf
Downloaded: pdf_downloads\p584b.pdf
Downloaded: pdf_downloads\p584sp.pdf
Downloaded: pdf_downloads\p587.pdf
Downloaded: pdf_downloads\p590a.pdf
Downloaded: pdf_downloads\p590b.pdf
Downloaded: pdf_downloads\p594.pdf
Downloaded: pdf_downloads\p594sp.pdf
Downloaded: pdf_downloads\p594zhs.pdf
Downloaded: pdf_downloads\p594zht.pdf
Downloaded: pdf_downloads\p595.pdf
Downloaded: pdf_downloads\p596.pdf
Downloaded: pdf_downloads\p596ko.pdf
Downloaded: pdf_downloads\p596ru.pdf
Downloaded: pdf_downloads\p596sp.pdf
Downloaded: pdf_downloads\p596vie.pdf
Downloaded: pdf_downloads\p596zhs.pdf
Downloaded: pdf_downloads\p596zht.pdf
Downloaded: pdf_downloads\p597.pdf
Downloaded: pdf_downloads\p598.pdf
Downloaded: pdf_downloads\n609.pdf
Downloaded: pdf_downloads\n609s.pdf
Extracted 25 links from https://www.irs.gov/forms-instructions-and-publications?page=5.
Sleeping

KeyboardInterrupt: 

In [2]:
import os
import shutil

# Path to the folder containing your PDF files
source_folder = "pdf_downloads"

# Path to the folder where the alphabet folders will be created
destination_folder = "filtered_pdf"


def organize_pdfs_by_initial(source_folder, destination_folder):
    """Copy each PDF from source_folder into destination_folder/<first letter>/.

    Files whose name starts with an ASCII letter go into the folder named after
    that letter in upper case; all other PDFs go into "Others". Originals are
    left in place (files are copied, not moved).

    Args:
        source_folder: Directory to scan (not recursive).
        destination_folder: Directory under which the per-letter folders are created.

    Returns:
        int: Number of files copied; 0 when source_folder does not exist.
    """
    if not os.path.isdir(source_folder):
        print(f"Source folder not found: {source_folder}")
        return 0

    copied = 0
    for filename in os.listdir(source_folder):
        source_path = os.path.join(source_folder, filename)
        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a .pdf suffix (copy2 would fail on them).
        if not filename.lower().endswith(".pdf") or not os.path.isfile(source_path):
            continue

        # Test the raw first character: upper() can expand a non-ASCII letter
        # to several characters (e.g. 'ß' -> 'SS') that would still pass an
        # 'A'..'Z' string-range comparison.
        initial = filename[0]
        if 'a' <= initial <= 'z' or 'A' <= initial <= 'Z':
            bucket = initial.upper()
        else:
            bucket = "Others"

        bucket_folder = os.path.join(destination_folder, bucket)
        os.makedirs(bucket_folder, exist_ok=True)
        shutil.copy2(source_path, os.path.join(bucket_folder, filename))
        copied += 1
    return copied


copied_count = organize_pdfs_by_initial(source_folder, destination_folder)
print(f"Copied {copied_count} PDF file(s) into {destination_folder}")