In [None]:
!pip install requests boto3 beautifulsoup4

In [2]:
import requests
import boto3
from bs4 import BeautifulSoup
import re

In [3]:
# Boto3 S3 client used by the upload step
s3_client = boto3.client("s3")

# Source: root of the full-text directory; one sub-directory per year is
# appended to this URL when downloading
base_url_1971_1975 = "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"

# Years to fetch: 1971 through 1975 inclusive (the range end is exclusive)
years_to_download = range(1971, 1976)

# Destination: S3 bucket and the key prefix under which files are stored
bucket_name = "patent-research-assistant"
s3_folder = "uspto/fulltext"

In [4]:
# Mirror the matching .zip / .doc files of every configured year into S3.
#
# For each year: fetch the directory listing, pick the links whose href matches
# one of the patterns below, stream each file from the source and upload it to
# s3://{bucket_name}/{s3_folder}/{year}/{file_name}.
#
# HTTP failures are handled per listing and per file, so one bad file does not
# skip the rest of its year. Errors raised by the S3 upload itself are not
# caught here and will stop the cell.

# Filename patterns for the wanted files. They do not depend on the year, so
# they are defined once outside the loop. re.match anchors at the start of the
# href, so only bare file names (relative links) are selected.
zip_file_pattern = r'US_PATFT_BRS_Full_Text_Extract_\d+_\d+\.zip$'
doc_file_pattern = r'US_PATFT_BRS_form\.doc$'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on a stalled connection. The read timeout applies to each socket
# read, not to the whole download, so large files are still fine.
request_timeout = (10, 120)

# URLs (year listings or individual files) that could not be fetched
failed_urls = []

for year in years_to_download:
    # Constructing the URL for the year directory
    year_url = f"{base_url_1971_1975}{year}/"

    print(f"Accessing {year_url}...")

    try:
        # Sending a GET request to retrieve the directory listing
        response = requests.get(year_url, timeout=request_timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to access {year_url}: {e}")
        failed_urls.append(year_url)
        continue

    # Parsing the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Finding all .zip and .doc links matching the specified patterns
    for link in soup.find_all('a', href=True):
        file_name = link['href']
        if not (re.match(zip_file_pattern, file_name) or re.match(doc_file_pattern, file_name)):
            continue

        file_url = year_url + file_name

        # Setting the S3 key with the target folder structure
        s3_key = f"{s3_folder}/{year}/{file_name}"

        print(f"Downloading {file_name} from {file_url}...")

        try:
            # The context manager releases the connection even if the upload
            # fails part-way through the stream.
            with requests.get(file_url, stream=True, timeout=request_timeout) as file_response:
                file_response.raise_for_status()

                # .raw bypasses requests' content decoding; ask urllib3 to undo
                # any Content-Encoding (gzip/deflate) so S3 receives the real
                # file bytes rather than the transport encoding.
                file_response.raw.decode_content = True

                # Uploading directly to S3 without buffering the file locally
                print(f"Uploading {file_name} to S3 bucket {bucket_name} at {s3_key}...")
                s3_client.upload_fileobj(file_response.raw, bucket_name, s3_key)
        except requests.exceptions.RequestException as e:
            # Report the file that actually failed and keep going with the
            # remaining files of this year.
            print(f"Failed to download {file_url}: {e}")
            failed_urls.append(file_url)
            continue

        print(f"Uploaded {file_name} successfully to {s3_key}.")

# Only claim full success when nothing failed
if failed_urls:
    print(f"Finished with {len(failed_urls)} failure(s):")
    for failed_url in failed_urls:
        print(f"  {failed_url}")
else:
    print("All specified files downloaded and uploaded to S3.")

Accessing https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1971/...
Downloading US_PATFT_BRS_Full_Text_Extract_1971_1.zip from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_1.zip...
Uploading US_PATFT_BRS_Full_Text_Extract_1971_1.zip to S3 bucket patent-research-assistant at uspto/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_1.zip...
Uploaded US_PATFT_BRS_Full_Text_Extract_1971_1.zip successfully to uspto/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_1.zip.
Downloading US_PATFT_BRS_Full_Text_Extract_1971_2.zip from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_2.zip...
Uploading US_PATFT_BRS_Full_Text_Extract_1971_2.zip to S3 bucket patent-research-assistant at uspto/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_2.zip...
Uploaded US_PATFT_BRS_Full_Text_Extract_1971_2.zip successfully to uspto/fulltext/1971/US_PATFT_BRS_Full_Text_Extract_1971_2.zip.