In [27]:
import os
import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
import time
import re

load_dotenv()

True

In [28]:
# Environment Configuration
# Credentials, region, role ARNs, and the bucket name are read from the
# process environment; each one is None when its variable is not set.
ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
AWS_REGION = os.environ.get("AWS_REGION")
GLUE_ROLE_ARN = os.environ.get("GLUE_ROLE_ARN")
LAMBDA_ROLE_ARN = os.environ.get("LAMBDA_ROLE_ARN")
S3_BUCKET_COMPLETE_PIPELINE = os.environ.get("S3_BUCKET_COMPLETE_PIPELINE")

# Key prefixes ("folders") used inside the pipeline bucket
S3_UPLOAD_FOLDER = "raw/"
S3_PROCESSED_FOLDER = "processed/"
S3_GLUE_OUTPUT_FOLDER = "glue-output/"

# COMPLETE DATA LAKE WORKFLOW

```
+-----------------------------------------------------------------------------------+
|                        COMPLETE DATA LAKE PIPELINE                                |
+-----------------------------------------------------------------------------------+
|                                                                                   |
|  1. INGEST                                                                        |
|     +-------------------+                                                         |
|     | Upload to S3      |  <-- Manual, Lambda, Kinesis Firehose                   |
|     | (raw/ folder)     |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  2. CATALOG                                                                       |
|     +-------------------+                                                         |
|     | Glue Crawler      |  --> Discovers schema, creates tables                   |
|     | (Data Catalog)    |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  3. TRANSFORM                                                                     |
|     +-------------------+                                                         |
|     | Glue ETL Job      |  --> Clean, transform, convert to Parquet               |
|     | (processed/)      |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  4. CATALOG PROCESSED                                                             |
|     +-------------------+                                                         |
|     | Another Crawler   |  --> Update catalog with processed tables               |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  5. ANALYZE                                                                       |
|     +-------------------+                                                         |
|     | Athena SQL        |  --> Fast, serverless analytics                         |
|     | Queries           |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  6. VISUALIZE (Optional)                                                          |
|     +-------------------+                                                         |
|     | QuickSight        |  --> Dashboards, reports                                |
|     +-------------------+                                                         |
|                                                                                   |
|  All serverless, managed, pay-per-use!                                            |
+-----------------------------------------------------------------------------------+
```

# PHASE 1: DATA INGESTION

```
┌─────────────────────────────────────────────────────────────────────────────┐
│  STEP 1: INGEST DATA TO S3                                                  │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   Local Files ──────┐                                                       │
│                     │                                                       │
│   APIs/Streams ─────┼────►  S3 Bucket  ────►  raw/ folder                   │
│                     │      (Landing Zone)                                   │
│   Lambda Events ────┘                                                       │
│                                                                             │
│   Key Operations:                                                           │
│   • Create S3 bucket (landing zone)                                         │
│   • Upload raw data files                                                   │
│   • Verify uploads & read content                                           │
│   • Generate presigned URLs for sharing                                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

## 1.1 Initialize S3 Client

Configure the S3 client with **Signature Version 4** (required for presigned URLs) and a regional endpoint.

In [29]:
# Build the S3 client that the S3 helper functions below rely on
from botocore.config import Config

# Sign requests with Signature Version 4 and talk to the region-specific
# endpoint; the regional endpoint is required for presigned URLs to work
# correctly.
s3_client = boto3.client(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name=AWS_REGION,
    endpoint_url=f'https://s3.{AWS_REGION}.amazonaws.com',
    config=Config(signature_version='s3v4'),
)

## 1.2 Create S3 Bucket (Landing Zone)

The bucket serves as the **landing zone** for all raw data. All incoming files go to the `raw/` prefix.

In [30]:
from botocore.exceptions import ClientError

def create_bucket(bucket_name, region=None):
    """
    Create an S3 bucket in a specified region

    Args:
        bucket_name (str): Name for the bucket (must be globally unique)
        region (str): AWS region. If None, the region configured on the
            module-level ``s3_client`` is used.

    Returns:
        bool: True only if this call created the bucket. False on any
        failure, including when the bucket already exists (whether it is
        owned by you or by someone else).
    """
    if region is None:
        # Fall back to the client's own region so the LocationConstraint
        # matches the regional endpoint the request is sent to. Without
        # this, a client bound to a non-us-east-1 endpoint would send a
        # request with no LocationConstraint.
        region = s3_client.meta.region_name

    try:
        if region is None or region == 'us-east-1':
            # us-east-1 doesn't require LocationConstraint
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            # Other regions require LocationConstraint
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region}
            )
        print(f"SUCCESS: Bucket '{bucket_name}' created successfully")
        return True
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'BucketAlreadyExists':
            print(f"ERROR: Bucket '{bucket_name}' already exists (owned by someone else)")
        elif error_code == 'BucketAlreadyOwnedByYou':
            print(f"INFO: Bucket '{bucket_name}' already exists and is owned by you")
        else:
            print(f"ERROR: Failed to create bucket - {e}")
        return False

# Create the landing-zone bucket in the configured region.
# The call returns False when the bucket already exists.
create_bucket(S3_BUCKET_COMPLETE_PIPELINE, region=AWS_REGION)

INFO: Bucket 's3-complete-pipeline' already exists and is owned by you


False

## 1.3 Upload Raw Data

Upload files to the `raw/` folder. This is where Glue Crawlers will discover and catalog the data schema.

In [31]:
# upload a file to s3
def upload_file(file_name, bucket, object_name=None):
    """
    Upload a file to an S3 bucket

    Args:
        file_name (str): Path to file to upload
        bucket (str): Bucket name
        object_name (str): S3 object name (if None, uses the base name of
            file_name, i.e. the path without its directory part)

    Returns:
        bool: True if upload successful, False otherwise
    """
    # If S3 object_name not specified, use the file's base name
    if object_name is None:
        object_name = os.path.basename(file_name)

    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except FileNotFoundError:
        print(f"ERROR: File '{file_name}' not found")
        return False
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # The managed transfer behind upload_file re-raises service errors
        # as S3UploadFailedError, which is not a ClientError subclass, so
        # both must be caught to honour the "return False" contract.
        print(f"ERROR: Failed to upload file - {e}")
        return False

    print(f"SUCCESS: '{file_name}' uploaded to '{bucket}/{object_name}'")
    return True

# Upload the local hosts dataset under the raw/ prefix of the pipeline bucket.
upload_file('data/hosts.csv', S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')

SUCCESS: 'data/hosts.csv' uploaded to 's3-complete-pipeline/raw/hosts.csv'


True

## 1.4 Verify Upload (Read Content)

Read the uploaded file directly from S3 to confirm the data landed correctly.

In [32]:
def read_object(bucket, object_name):
    """
    Fetch an S3 object and return its body decoded as UTF-8 text

    Args:
        bucket (str): Bucket name
        object_name (str): Key of the object to read

    Returns:
        str: Decoded object content, or None if the read failed for any reason
    """
    try:
        body_stream = s3_client.get_object(Bucket=bucket, Key=object_name)['Body']
        text = body_stream.read().decode('utf-8')
        print(f"SUCCESS: Read {len(text)} characters from '{bucket}/{object_name}'")
        return text
    except ClientError as err:
        # A missing key gets a dedicated message; other service errors are
        # reported verbatim.
        if err.response['Error']['Code'] == 'NoSuchKey':
            message = f"ERROR: Object '{object_name}' not found in bucket '{bucket}'"
        else:
            message = f"ERROR: Failed to read object - {err}"
    except Exception as err:
        # Anything that is not a service error (e.g. a decode failure)
        message = f"ERROR: Unexpected error - {err}"

    print(message)
    return None

# Read the uploaded file back from S3 and preview the start of it.
# The preview is skipped when the read failed (None) or the object is empty.
content = read_object(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')
if content:
    print("Content:")
    print(content[:500])  # Print first 500 characters

SUCCESS: Read 13083 characters from 's3-complete-pipeline/raw/hosts.csv'
Content:
host_id,host_name,host_since,is_superhost,response_rate,created_at
1,Timothy Parker,2018-03-20,False,99,2025-12-26 14:15:54.011160
2,Hannah Evans,2024-01-01,False,95,2025-12-26 14:15:54.011160
3,Crystal Green,2016-08-06,False,74,2025-12-26 14:15:54.011160
4,Kevin Johnson,2020-02-25,False,100,2025-12-26 14:15:54.011160
5,Monica Johnson,2024-11-11,False,77,2025-12-26 14:15:54.011160
6,Nancy Turner,2016-11-03,False,96,2025-12-26 14:15:54.011160
7,Gerald Hunt,2022-01-21,True,99,2025-12-26 14:


## 1.5 List Objects in Bucket

View all objects in the `raw/` folder with detailed metadata (size, last modified, storage class).

In [33]:
from datetime import datetime

def _format_size(size_bytes):
    """Render a byte count as B, KB, or MB (two decimals from 1 KB upward)."""
    if size_bytes < 1024:
        return f"{size_bytes} B"
    if size_bytes < 1024**2:
        return f"{size_bytes/1024:.2f} KB"
    return f"{size_bytes/(1024**2):.2f} MB"


def list_objects_detailed(bucket, prefix=''):
    """
    List all objects in a bucket with detailed metadata

    Results are gathered through the ``list_objects_v2`` paginator so that
    every page of the listing is included, not only the first one.

    Args:
        bucket (str): Bucket name
        prefix (str): Filter objects by prefix (folder path)

    Returns:
        list: List of object metadata dictionaries, or empty list if no
        objects match or an error occurs
    """
    try:
        paginator = s3_client.get_paginator('list_objects_v2')
        # A page has no 'Contents' key when it holds no objects.
        objects = [
            obj
            for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
            for obj in page.get('Contents', [])
        ]
    except ClientError as e:
        print(f"ERROR: Failed to list objects - {e}")
        return []

    if not objects:
        print(f"No objects found in bucket '{bucket}' with prefix '{prefix}'")
        return []

    print(f"Objects in '{bucket}/{prefix}':")
    for obj in objects:
        size_bytes = obj['Size']
        last_modified = obj['LastModified'].strftime('%Y-%m-%d %H:%M:%S')

        print(f"Key: {obj['Key']}")
        print(f"Size: {_format_size(size_bytes)} ({size_bytes:,} bytes)")
        print(f"Last Modified: {last_modified}")
        # Fall back to 'STANDARD' when the entry carries no StorageClass
        print(f"Storage Class: {obj.get('StorageClass', 'STANDARD')}")
        print(f"ETag: {obj['ETag']}")

    print(f"Total: {len(objects)} object(s)")
    return objects

# List everything stored under the raw/ prefix of the pipeline bucket.
list_objects_detailed(S3_BUCKET_COMPLETE_PIPELINE, prefix=f'{S3_UPLOAD_FOLDER}')

Objects in 's3-complete-pipeline/raw/':
Key: raw/hosts.csv
Size: 12.78 KB (13,083 bytes)
Last Modified: 2026-01-31 03:02:32
Storage Class: STANDARD
ETag: "7588197a4f4c485949e7bfc641356122"
Total: 1 object(s)


[{'Key': 'raw/hosts.csv',
  'LastModified': datetime.datetime(2026, 1, 31, 3, 2, 32, tzinfo=tzutc()),
  'ETag': '"7588197a4f4c485949e7bfc641356122"',
  'ChecksumAlgorithm': ['CRC32'],
  'ChecksumType': 'FULL_OBJECT',
  'Size': 13083,
  'StorageClass': 'STANDARD'}]

## 1.6 Get Object Metadata

Retrieve detailed metadata for a specific object: content type, size, last-modified time, ETag, storage class, and any custom user metadata.

In [34]:
def get_object_metadata(bucket, object_name):
    """
    Look up and print the metadata of a single S3 object

    Args:
        bucket (str): Bucket name
        object_name (str): Key of the object to inspect

    Returns:
        dict: The head_object response, or None if the lookup failed
    """
    try:
        head = s3_client.head_object(Bucket=bucket, Key=object_name)
    except ClientError as err:
        # A '404' error code is reported as "not found"; any other service
        # error is printed as-is.
        if err.response['Error']['Code'] == '404':
            print(f"ERROR: Object '{object_name}' not found in bucket '{bucket}'")
        else:
            print(f"ERROR: Failed to get metadata - {err}")
        return None

    print(f"Metadata for '{bucket}/{object_name}':")

    # Fields set by S3 itself
    print("SYSTEM METADATA:")
    print(f"Content-Type: {head.get('ContentType', 'N/A')}")
    print(f"Content-Length: {head.get('ContentLength', 0):,} bytes")
    print(f"Last-Modified: {head.get('LastModified', 'N/A')}")
    print(f"ETag: {head.get('ETag', 'N/A')}")
    print(f"Storage-Class: {head.get('StorageClass', 'STANDARD')}")

    # Key/value pairs found under the response's 'Metadata' entry
    custom = head.get('Metadata', {})
    if not custom:
        print("USER METADATA: None")
    else:
        print("USER METADATA (Custom):")
        for name, val in custom.items():
            print(f"  {name}: {val}")

    return head

# Inspect the metadata of the uploaded hosts.csv object.
get_object_metadata(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')

Metadata for 's3-complete-pipeline/raw/hosts.csv':
SYSTEM METADATA:
Content-Type: binary/octet-stream
Content-Length: 13,083 bytes
Last-Modified: 2026-01-31 03:02:32+00:00
ETag: "7588197a4f4c485949e7bfc641356122"
Storage-Class: STANDARD
USER METADATA: None


{'ResponseMetadata': {'RequestId': 'W1QD945F3JFZGWF0',
  'HostId': 'JDY8bqiBgPQQdc9mhRexUjWGk+L4itRZrVyrD+77LMFXx7seuBMPzgEeN6dAuj4WDzNYLB4zhws=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JDY8bqiBgPQQdc9mhRexUjWGk+L4itRZrVyrD+77LMFXx7seuBMPzgEeN6dAuj4WDzNYLB4zhws=',
   'x-amz-request-id': 'W1QD945F3JFZGWF0',
   'date': 'Sat, 31 Jan 2026 03:03:04 GMT',
   'last-modified': 'Sat, 31 Jan 2026 03:02:32 GMT',
   'etag': '"7588197a4f4c485949e7bfc641356122"',
   'x-amz-server-side-encryption': 'AES256',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '13083',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2026, 1, 31, 3, 2, 32, tzinfo=tzutc()),
 'ContentLength': 13083,
 'ETag': '"7588197a4f4c485949e7bfc641356122"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {}}

## 1.7 Generate Presigned URL (Share Data)

Create time-limited URLs for secure sharing without exposing AWS credentials.

| Expiration | Use Case |
|------------|----------|
| 600s (10 min) | Quick one-time downloads |
| 3600s (1 hour) | Team collaboration |
| 86400s (24 hours) | External sharing |
| 604800s (7 days) | Maximum allowed |

In [35]:
def generate_presigned_download_url(bucket, object_name, expiration=3600):
    """
    Generate a presigned URL for downloading an S3 object

    Args:
        bucket (str): Bucket name
        object_name (str): S3 object name (key)
        expiration (int): URL expiration time in seconds (default 3600 = 1 hour).
            Must be between 1 and 604800 (7 days) inclusive.

    Returns:
        str: Presigned URL, or None if the expiration is out of range or
        URL generation failed

    Common expiration times:
        - 3600 = 1 hour (default)
        - 7200 = 2 hours
        - 86400 = 24 hours
        - 604800 = 7 days (maximum)

    Note: Uses AWS Signature Version 4 (required by S3)
    """
    max_expiration = 604800  # 7 days, the maximum listed above

    # Reject out-of-range lifetimes up front instead of handing back a URL
    # that would not be usable.
    if not 0 < expiration <= max_expiration:
        print(
            f"ERROR: expiration must be between 1 and {max_expiration} seconds "
            f"(got {expiration})"
        )
        return None

    try:
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': bucket,
                'Key': object_name
            },
            ExpiresIn=expiration
        )

        print(f"SUCCESS: Presigned URL generated for '{bucket}/{object_name}'")
        print(f"Expires in: {expiration} seconds ({expiration/3600:.1f} hours)")
        print(f"\nURL (valid for {expiration/3600:.1f} hours):")
        print(url)
        print("\nAnyone with this URL can download the file until it expires.")

        return url

    except ClientError as e:
        print(f"ERROR: Failed to generate presigned URL - {e}")
        return None


# Generate a download link for hosts.csv that expires after 600 seconds (10 minutes).
url = generate_presigned_download_url(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv', expiration=600)

SUCCESS: Presigned URL generated for 's3-complete-pipeline/raw/hosts.csv'
Expires in: 600 seconds (0.2 hours)

URL (valid for 0.2 hours):
https://s3.us-east-2.amazonaws.com/s3-complete-pipeline/raw/hosts.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=<ACCESS_KEY_ID_REDACTED>%2F20260131%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20260131T030321Z&X-Amz-Expires=600&X-Amz-SignedHeaders=host&X-Amz-Signature=<SIGNATURE_REDACTED>

Anyone with this URL can download the file until it expires.
