In [7]:
import requests
import boto3
from bs4 import BeautifulSoup
import re

In [8]:
# S3 client used by the transfer cell for every upload.
s3_client = boto3.client("s3")

# Years to mirror: 1976 through 2000 inclusive (the range stop is exclusive).
years_to_download = range(1976, 2000 + 1)

# Destination in S3: bucket, plus the key prefix under which files are stored.
bucket_name = "patent-research-assistant"  # Replace with your actual bucket name
s3_folder = "uspto/fulltext"

# Source: root URL of the USPTO full-text listing covering 1976-2000; the
# transfer cell appends "<year>/" to reach each year's directory.
base_url_1976_2000 = "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"

In [9]:
# Mirror the USPTO full-text archives into S3.
#
# For every year in `years_to_download`, fetch that year's directory listing,
# pick out the weekly `pftaps*.zip` archives and any `.pdf` files, and stream
# each one from the HTTP response straight into
# `s3://{bucket_name}/{s3_folder}/{year}/{file_name}`.
#
# A failure is logged and recorded, then the run moves on: a listing that
# cannot be fetched skips only that year, and a file that cannot be
# transferred skips only that file. A summary of failures is printed at the end.

# Loop-invariant settings, defined once instead of on every iteration.
zip_file_pattern = r'pftaps\d{8}_wk\d{2}\.zip$'
pdf_file_pattern = r'.*\.pdf$'
# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on a stalled connection and the cell never finishes.
request_timeout = (10, 120)

# (url, error) pairs for everything that could not be fetched or uploaded.
failed_transfers = []

for year in years_to_download:
    # Constructing the URL for the year directory
    year_url = f"{base_url_1976_2000}{year}/"
    print(f"Accessing {year_url}...")

    # Retrieve the directory listing; a failure here only skips this year.
    try:
        response = requests.get(year_url, timeout=request_timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to access {year_url}: {e}")
        failed_transfers.append((year_url, e))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Walk every link and keep the ones matching the .zip / .pdf patterns.
    for link in soup.find_all('a', href=True):
        file_name = link['href']
        if not (re.match(zip_file_pattern, file_name) or re.match(pdf_file_pattern, file_name)):
            continue

        file_url = year_url + file_name

        # Setting the S3 key with the target folder structure
        s3_key = f"{s3_folder}/{year}/{file_name}"

        print(f"Downloading {file_name} from {file_url}...")

        try:
            # The context manager releases the streamed connection even when
            # the upload fails part-way through.
            with requests.get(file_url, stream=True, timeout=request_timeout) as file_response:
                file_response.raise_for_status()

                # `.raw` bypasses requests' automatic content decoding; ask
                # urllib3 to undo any gzip/deflate transfer encoding so the
                # bytes stored in S3 are the actual file contents.
                file_response.raw.decode_content = True

                # Uploading directly to S3
                print(f"Uploading {file_name} to S3 bucket {bucket_name} at {s3_key}...")
                s3_client.upload_fileobj(file_response.raw, bucket_name, s3_key)
        except (
            requests.exceptions.RequestException,
            boto3.exceptions.Boto3Error,
            s3_client.exceptions.ClientError,
        ) as e:
            # One bad file must not abort the rest of the year's files.
            print(f"Failed to transfer {file_name} from {file_url}: {e}")
            failed_transfers.append((file_url, e))
            continue

        print(f"Uploaded {file_name} successfully to {s3_key}.")

# Only claim full success when nothing failed.
if failed_transfers:
    print(f"Finished with {len(failed_transfers)} failed request(s):")
    for failed_url, error in failed_transfers:
        print(f"  {failed_url}: {error}")
else:
    print("All specified files downloaded and uploaded to S3.")

Accessing https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2000/...
Downloading PatentFullTextAPSDoc_GreenBook.pdf from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2000/PatentFullTextAPSDoc_GreenBook.pdf...
Uploading PatentFullTextAPSDoc_GreenBook.pdf to S3 bucket patent-research-assistant at uspto/fulltext/2000/PatentFullTextAPSDoc_GreenBook.pdf...
Uploaded PatentFullTextAPSDoc_GreenBook.pdf successfully to uspto/fulltext/2000/PatentFullTextAPSDoc_GreenBook.pdf.
Downloading pftaps20000104_wk01.zip from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2000/pftaps20000104_wk01.zip...
Uploading pftaps20000104_wk01.zip to S3 bucket patent-research-assistant at uspto/fulltext/2000/pftaps20000104_wk01.zip...
Uploaded pftaps20000104_wk01.zip successfully to uspto/fulltext/2000/pftaps20000104_wk01.zip.
Downloading pftaps20000111_wk02.zip from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2000/pftaps20000111_wk02.zip...
Uploading pfta