# Export to S3 Function

This notebook contains a function to export all files and folders from a SageMaker source directory to an S3 bucket folder.

In [6]:
import boto3
import os
from pathlib import Path
import logging
from botocore.exceptions import ClientError, NoCredentialsError

# Set up logging.
# basicConfig attaches a default handler to the root logger and sets its level
# to INFO; per the standard-library docs it does nothing if the root logger
# already has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger used for this notebook's progress and error messages.
logger = logging.getLogger(__name__)

In [7]:
def export_directory_to_s3(bucket_name, s3_folder, source_directory, aws_access_key_id=None, aws_secret_access_key=None, region_name='us-east-1'):
    """
    Recursively upload every file under a local directory to an S3 bucket folder.

    The directory layout below ``source_directory`` is preserved beneath
    ``s3_folder``. Objects that already exist under the same key are
    overwritten. A file that fails to upload is logged and skipped so the
    remaining files are still attempted; the function then reports failure.

    Args:
        bucket_name (str): Name of the S3 bucket
        s3_folder (str): Destination folder (key prefix) in the bucket. Leading
            slashes are removed and a trailing slash is added; an empty value
            uploads to the bucket root.
        source_directory (str): Local source directory path
        aws_access_key_id (str, optional): AWS access key ID. If None, uses default credentials
        aws_secret_access_key (str, optional): AWS secret access key. If None, uses default credentials
        region_name (str): AWS region name (default: 'us-east-1')

    Returns:
        bool: True if every file found was uploaded (including the case where
        the directory contains no files). False if the source path is missing
        or is not a directory, credentials are unavailable, at least one file
        failed to upload, or an unexpected error occurred.
    """
    # upload_file() wraps service-side ClientErrors in S3UploadFailedError, which
    # is NOT a ClientError subclass, so it has to be caught explicitly.
    # Imported locally so this cell does not need an extra top-level import.
    from boto3.exceptions import S3UploadFailedError

    try:
        # Create S3 client
        if aws_access_key_id and aws_secret_access_key:
            s3_client = boto3.client(
                's3',
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                region_name=region_name
            )
        else:
            if aws_access_key_id or aws_secret_access_key:
                # A lone key or secret is unusable; make the fallback visible
                # instead of silently ignoring what the caller passed.
                logger.warning(
                    "Only one of aws_access_key_id/aws_secret_access_key was provided; "
                    "ignoring it and using default credentials"
                )
            # Use default credentials (from ~/.aws/credentials, IAM role, etc.)
            s3_client = boto3.client('s3', region_name=region_name)

        # Ensure the source exists and is a directory; os.walk() silently yields
        # nothing for a regular file, which would look like a successful no-op.
        source_path = Path(source_directory)
        if not source_path.exists():
            logger.error(f"Source directory '{source_directory}' does not exist")
            return False
        if not source_path.is_dir():
            logger.error(f"Source path '{source_directory}' is not a directory")
            return False

        # Normalise the prefix: no leading '/', exactly one trailing '/' when
        # non-empty. An empty prefix stays empty so keys never start with '/'.
        s3_folder = s3_folder.lstrip('/')
        if s3_folder and not s3_folder.endswith('/'):
            s3_folder += '/'

        logger.info(f"Starting upload from '{source_directory}' to 's3://{bucket_name}/{s3_folder}'")

        uploaded_count = 0
        failed_count = 0

        # Walk through all files and directories in the source directory
        for root, _dirs, files in os.walk(source_directory):
            for file in files:
                local_file_path = os.path.join(root, file)

                # Path of the file relative to the directory being exported
                relative_path = os.path.relpath(local_file_path, source_directory)

                # S3 keys always use '/'. Only the platform separator is
                # translated so that a literal backslash inside a POSIX
                # filename is kept as part of the name.
                s3_key = s3_folder + relative_path.replace(os.sep, '/')

                try:
                    s3_client.upload_file(local_file_path, bucket_name, s3_key)
                except (ClientError, S3UploadFailedError) as e:
                    failed_count += 1
                    logger.error(f"Error uploading {relative_path}: {e}")
                except FileNotFoundError:
                    # The file disappeared between the directory walk and the upload.
                    failed_count += 1
                    logger.error(f"File not found: {local_file_path}")
                else:
                    uploaded_count += 1
                    logger.info(f"Uploaded: {relative_path} -> s3://{bucket_name}/{s3_key}")

        if failed_count:
            # Partial or total failure must not be reported as success.
            logger.error(
                f"Uploaded {uploaded_count} files, but {failed_count} files failed "
                f"to upload to s3://{bucket_name}/{s3_folder}"
            )
            return False

        if uploaded_count == 0:
            logger.warning(f"No files found in directory '{source_directory}'")
        else:
            logger.info(f"Successfully uploaded {uploaded_count} files to s3://{bucket_name}/{s3_folder}")

        return True

    except NoCredentialsError:
        logger.error("AWS credentials not found. Please configure your credentials.")
        return False
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'NoSuchBucket':
            logger.error(f"Bucket '{bucket_name}' does not exist")
        elif error_code == 'AccessDenied':
            logger.error(f"Access denied to bucket '{bucket_name}'. Check your permissions.")
        else:
            logger.error(f"AWS error: {e}")
        return False
    except Exception as e:
        # Last-resort guard so the notebook cell reports failure instead of raising.
        logger.error(f"Unexpected error: {e}")
        return False

## Usage Examples

Here's how to use the function with your specific parameters:

In [8]:
# Example 1: Export with the default AWS credential chain (no explicit keys passed)
bucket_name = "sagemaker-studio-940544301691-64iyil0o5a"
s3_folder = "nlp_assignment"
source_directory = "/home/sagemaker-user/codespace/nlp_assignments"

# Run the export and report the outcome in a single message.
success = export_directory_to_s3(
    bucket_name=bucket_name,
    s3_folder=s3_folder,
    source_directory=source_directory,
)
print(
    f"Successfully exported files from {source_directory} to s3://{bucket_name}/{s3_folder}/"
    if success
    else "Export failed"
)

INFO:__main__:Starting upload from '/home/sagemaker-user/codespace/nlp_assignments' to 's3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/'
INFO:__main__:Uploaded: import_from_s3.ipynb -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/import_from_s3.ipynb
INFO:__main__:Uploaded: import_from_s3.ipynb -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/import_from_s3.ipynb
INFO:__main__:Uploaded: hw3.ipynb -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/hw3.ipynb
INFO:__main__:Uploaded: hw3.ipynb -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/hw3.ipynb
INFO:__main__:Uploaded: second-test-data-DIST.json -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/second-test-data-DIST.json
INFO:__main__:Uploaded: second-test-data-DIST.json -> s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/second-test-data-DIST.json
INFO:__main__:Uploaded: test-results.txt -> s3://sagemaker-studio-940544301691-64iyil0o5a

Successfully exported files from /home/sagemaker-user/codespace/nlp_assignments to s3://sagemaker-studio-940544301691-64iyil0o5a/nlp_assignment/


In [9]:
# # Example 2: Using explicit credentials (if needed)
# bucket_name = "sagemaker-studio-940544301691-64iyil0o5a"
# s3_folder = "nlp_assignment"
# source_directory = "/home/sagemaker-user/codespace/nlp_assignments"
# access_key = "YOUR_ACCESS_KEY_ID"  # Replace with your access key
# secret_key = "YOUR_SECRET_ACCESS_KEY"  # Replace with your secret key

# success = export_directory_to_s3(
#     bucket_name=bucket_name,
#     s3_folder=s3_folder,
#     source_directory=source_directory,
#     aws_access_key_id=access_key,
#     aws_secret_access_key=secret_key,
#     region_name='us-east-1'  # Adjust region if needed
# )

# if success:
#     print(f"Successfully exported files from {source_directory} to s3://{bucket_name}/{s3_folder}/")
# else:
#     print("Export failed")

In [10]:
# # Example 3: Quick function for your specific use case
# def export_nlp_assignments():
#     """Quick function to export your NLP assignments to S3"""
#     return export_directory_to_s3(
#         bucket_name="sagemaker-studio-940544301691-64iyil0o5a",
#         s3_folder="nlp_assignment",
#         source_directory="/home/sagemaker-user/codespace/nlp_assignments"
#     )

# # Simply call this function to export your files
# # success = export_nlp_assignments()
# # print("Export completed!" if success else "Export failed!")

## Function Features

- **Complete directory upload**: Uploads all files and subfolders recursively
- **Overwrites existing files**: Files with the same name will be replaced
- **Preserves folder structure**: Maintains the original directory structure in S3
- **Handles large files**: Uses boto3's managed `upload_file` transfer, which automatically switches to multipart uploads for large files
- **Error handling**: Comprehensive error handling and logging
- **Flexible authentication**: Supports both default AWS credentials and explicit credentials

## Prerequisites

1. **AWS credentials configured** in one of these ways:
   - IAM role (recommended for SageMaker)
   - AWS credentials file (`~/.aws/credentials`)
   - Environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`)
   - Or pass credentials directly to the function

2. **Required Python packages** (usually pre-installed in SageMaker):
   ```bash
   pip install boto3
   ```

3. **S3 permissions**: Write access (`s3:PutObject`) to the specified bucket