# AWS S3 Bucket Import Function

This notebook contains a function to download all files and folders from an AWS S3 bucket to a local destination directory.

In [1]:
# Use the %pip magic so boto3 is installed into the environment of the running kernel
%pip install boto3



In [2]:
import boto3
import os
from botocore.exceptions import ClientError, NoCredentialsError
from pathlib import Path
import logging

# Notebook-wide logger used by the download function for progress and error reporting
logger = logging.getLogger(__name__)
# Configure the root logger so INFO-level messages are emitted
logging.basicConfig(level=logging.INFO)

In [4]:
def _resolve_within_destination(destination_directory, destination_root, s3_key):
    """
    Map an S3 object key to a local path, refusing keys that escape the destination.

    Args:
        destination_directory (str): Destination directory exactly as given by the caller
        destination_root (str): ``os.path.realpath`` of ``destination_directory``
        s3_key (str): Object key as returned by the bucket listing

    Returns:
        str or None: ``os.path.join(destination_directory, s3_key)`` when the resolved
        path is inside ``destination_root``; None when it is not (e.g. the key starts
        with '/' or climbs out through '..' components or a symlink).
    """
    local_path = os.path.join(destination_directory, s3_key)
    resolved_path = os.path.realpath(local_path)
    try:
        shared_root = os.path.commonpath([destination_root, resolved_path])
    except ValueError:
        # commonpath cannot compare the two paths (e.g. different Windows drives)
        return None
    if os.path.normcase(shared_root) != os.path.normcase(destination_root):
        return None
    return local_path


def import_s3_bucket_to_directory(bucket_name, destination_directory, aws_access_key_id=None, aws_secret_access_key=None, region_name='us-east-1'):
    """
    Download all files and folders from an AWS S3 bucket to a local destination directory.

    Objects are processed on a best-effort basis: a failure on one object is logged
    and the remaining objects are still attempted. Keys whose local path would fall
    outside ``destination_directory`` are skipped and counted as failures.

    Args:
        bucket_name (str): Name of the S3 bucket
        destination_directory (str): Local directory path where files will be downloaded
        aws_access_key_id (str, optional): AWS access key ID. If None, uses default credentials
        aws_secret_access_key (str, optional): AWS secret access key. If None, uses default credentials
        region_name (str): AWS region name (default: 'us-east-1')

    Returns:
        bool: True if the bucket was listed and every object was handled without error
        (an empty bucket also returns True); False if credentials are missing, the
        listing failed, or at least one object was skipped or failed to download.
    """
    try:
        # Explicit credentials are only used when BOTH values are supplied
        if bool(aws_access_key_id) != bool(aws_secret_access_key):
            logger.warning(
                "Only one of aws_access_key_id/aws_secret_access_key was provided; "
                "ignoring it and using default credentials"
            )

        # Create S3 client
        if aws_access_key_id and aws_secret_access_key:
            s3_client = boto3.client(
                's3',
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                region_name=region_name
            )
        else:
            # Use default credentials (from ~/.aws/credentials, IAM role, etc.)
            s3_client = boto3.client('s3', region_name=region_name)

        # Create destination directory if it doesn't exist
        Path(destination_directory).mkdir(parents=True, exist_ok=True)
        # Resolved once; every key is checked against this root
        destination_root = os.path.realpath(destination_directory)

        # List all objects in the bucket
        logger.info(f"Starting download from bucket '{bucket_name}' to '{destination_directory}'")

        paginator = s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name)

        downloaded_count = 0
        failed_count = 0

        for page in page_iterator:
            # A page has no 'Contents' entry when it holds no objects
            for obj in page.get('Contents', []):
                # Get the object key (file path in S3)
                s3_key = obj['Key']

                # Keys come from the bucket and must not be able to write outside the destination
                local_path = _resolve_within_destination(destination_directory, destination_root, s3_key)
                if local_path is None:
                    logger.error(f"Skipping {s3_key}: path resolves outside '{destination_directory}'")
                    failed_count += 1
                    continue

                try:
                    # A key ending with '/' is a folder marker: create the directory, nothing to download
                    if s3_key.endswith('/'):
                        Path(local_path).mkdir(parents=True, exist_ok=True)
                        logger.info(f"Created directory: {local_path}")
                        continue

                    # Create parent directories if they don't exist
                    local_dir = os.path.dirname(local_path)
                    if local_dir:
                        Path(local_dir).mkdir(parents=True, exist_ok=True)

                    # Download the file
                    s3_client.download_file(bucket_name, s3_key, local_path)
                    downloaded_count += 1
                    logger.info(f"Downloaded: {s3_key} -> {local_path}")
                except (ClientError, OSError) as e:
                    # OSError covers local filesystem problems (e.g. a file and a folder
                    # sharing a name) so one bad object does not abort the whole run
                    logger.error(f"Error downloading {s3_key}: {e}")
                    failed_count += 1

        if failed_count:
            logger.error(f"Failed to download {failed_count} object(s) from bucket '{bucket_name}'")
        if downloaded_count:
            logger.info(f"Successfully downloaded {downloaded_count} files from bucket '{bucket_name}'")
        elif not failed_count:
            logger.warning(f"No files found in bucket '{bucket_name}' or bucket is empty")

        return failed_count == 0

    except NoCredentialsError:
        logger.error("AWS credentials not found. Please configure your credentials.")
        return False
    except ClientError as e:
        # Look the code up defensively so a malformed response cannot raise inside the handler
        error_code = e.response.get('Error', {}).get('Code')
        if error_code == 'NoSuchBucket':
            logger.error(f"Bucket '{bucket_name}' does not exist")
        elif error_code == 'AccessDenied':
            logger.error(f"Access denied to bucket '{bucket_name}'. Check your permissions.")
        else:
            logger.error(f"AWS error: {e}")
        return False
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        return False

## Usage Examples

Here are some examples of how to use the `import_s3_bucket_to_directory` function:

In [5]:
# Example 1: rely on the default credential chain (~/.aws/credentials or an IAM role)
bucket_name = "sagemaker-studio-940544301691-64iyil0o5a"
destination_dir = "/home/sagemaker-user/codespace/nlp_assignments"

success = import_s3_bucket_to_directory(
    bucket_name=bucket_name,
    destination_directory=destination_dir,
)

# Report the outcome returned by the import function
print(
    f"Successfully imported files from {bucket_name} to {destination_dir}"
    if success
    else "Import failed"
)

INFO:__main__:Starting download from bucket 'sagemaker-studio-940544301691-64iyil0o5a' to '/home/sagemaker-user/codespace/nlp_assignments'
INFO:__main__:Downloaded: hw3.ipynb -> /home/sagemaker-user/codespace/nlp_assignments/hw3.ipynb
INFO:__main__:Downloaded: hw3.ipynb -> /home/sagemaker-user/codespace/nlp_assignments/hw3.ipynb
INFO:__main__:Downloaded: second-test-data-DIST.json -> /home/sagemaker-user/codespace/nlp_assignments/second-test-data-DIST.json
INFO:__main__:Successfully downloaded 2 files from bucket 'sagemaker-studio-940544301691-64iyil0o5a'
INFO:__main__:Downloaded: second-test-data-DIST.json -> /home/sagemaker-user/codespace/nlp_assignments/second-test-data-DIST.json
INFO:__main__:Successfully downloaded 2 files from bucket 'sagemaker-studio-940544301691-64iyil0o5a'


Successfully imported files from sagemaker-studio-940544301691-64iyil0o5a to /home/sagemaker-user/codespace/nlp_assignments


In [None]:
# Example 2: pass credentials explicitly
# The two key values below are placeholders to be replaced before running.
bucket_name = "my-s3-bucket"
destination_dir = "./downloaded_files"
access_key = "YOUR_ACCESS_KEY_ID"
secret_key = "YOUR_SECRET_ACCESS_KEY"

# region_name is optional; it is set here to show how to target a different region
success = import_s3_bucket_to_directory(
    bucket_name,
    destination_dir,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name='us-west-2',
)

# Report the outcome returned by the import function
print(
    f"Successfully imported files from {bucket_name} to {destination_dir}"
    if success
    else "Import failed"
)

## Prerequisites

Before using this function, make sure you have:

1. **AWS credentials configured** in one of these ways:
   - AWS credentials file (`~/.aws/credentials`)
   - Environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`)
   - IAM role (if running on an AWS compute service such as EC2 or SageMaker)
   - Or pass credentials directly to the function

2. **Required Python packages installed**:
   ```bash
   pip install boto3
   ```

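3. **Appropriate S3 permissions** for the bucket you want to access (at minimum `s3:ListBucket` on the bucket to list its objects and `s3:GetObject` on the objects to download them)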