In [27]:
import os
import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
import time
import re

load_dotenv()

True

In [28]:
# Environment Configuration
ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SECRET_KEY")
AWS_REGION = os.getenv("AWS_REGION")
GLUE_ROLE_ARN = os.getenv("GLUE_ROLE_ARN")
LAMBDA_ROLE_ARN = os.getenv("LAMBDA_ROLE_ARN")
S3_BUCKET_COMPLETE_PIPELINE = os.getenv("S3_BUCKET_COMPLETE_PIPELINE")
S3_UPLOAD_FOLDER = "raw/"
S3_PROCESSED_FOLDER = "processed/"
S3_GLUE_OUTPUT_FOLDER = "glue-output/"

# COMPLETE DATA LAKE WORKFLOW

```
+-----------------------------------------------------------------------------------+
|                        COMPLETE DATA LAKE PIPELINE                                |
+-----------------------------------------------------------------------------------+
|                                                                                   |
|  1. INGEST                                                                        |
|     +-------------------+                                                         |
|     | Upload to S3      |  <-- Manual, Lambda, Kinesis Firehose                   |
|     | (raw/ folder)     |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  2. CATALOG                                                                       |
|     +-------------------+                                                         |
|     | Glue Crawler      |  --> Discovers schema, creates tables                   |
|     | (Data Catalog)    |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  3. TRANSFORM                                                                     |
|     +-------------------+                                                         |
|     | Glue ETL Job      |  --> Clean, transform, convert to Parquet               |
|     | (processed/)      |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  4. CATALOG PROCESSED                                                             |
|     +-------------------+                                                         |
|     | Another Crawler   |  --> Update catalog with processed tables               |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  5. ANALYZE                                                                       |
|     +-------------------+                                                         |
|     | Athena SQL        |  --> Fast, serverless analytics                         |
|     | Queries           |                                                         |
|     +-------------------+                                                         |
|              |                                                                    |
|              v                                                                    |
|  6. VISUALIZE (Optional)                                                          |
|     +-------------------+                                                         |
|     | QuickSight        |  --> Dashboards, reports                                |
|     +-------------------+                                                         |
|                                                                                   |
|  All serverless, managed, pay-per-use!                                            |
+-----------------------------------------------------------------------------------+
```

# PHASE 1: DATA INGESTION

```
┌─────────────────────────────────────────────────────────────────────────────┐
│  STEP 1: INGEST DATA TO S3                                                  │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   Local Files ──────┐                                                       │
│                     │                                                       │
│   APIs/Streams ─────┼────►  S3 Bucket  ────►  raw/ folder                   │
│                     │      (Landing Zone)                                   │
│   Lambda Events ────┘                                                       │
│                                                                             │
│   Key Operations:                                                           │
│   • Create S3 bucket (landing zone)                                         │
│   • Upload raw data files                                                   │
│   • Verify uploads & read content                                           │
│   • Generate presigned URLs for sharing                                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

## 1.1 Initialize S3 Client

Configure the S3 client with **Signature Version 4** (required for presigned URLs) and regional endpoint.

In [29]:
# create an s3 client
from botocore.config import Config

# Configure S3 client with Signature Version 4 and regional endpoint
# Regional endpoint is required for presigned URLs to work correctly

s3_client = boto3.client('s3', 
                         endpoint_url=f'https://s3.{AWS_REGION}.amazonaws.com',
                         config=Config(signature_version='s3v4'),
                         region_name=AWS_REGION,
                         aws_secret_access_key=SECRET_KEY,
                         aws_access_key_id=ACCESS_KEY,
                         )

## 1.2 Create S3 Bucket (Landing Zone)

The bucket serves as the **landing zone** for all raw data. All incoming files go to the `raw/` prefix.

In [30]:
from botocore.exceptions import ClientError

def create_bucket(bucket_name, region=None):
    """
    Create an S3 bucket in a specified region
    
    Args:
        bucket_name (str): Name for the bucket (must be globally unique)
        region (str): AWS region (if None, uses default from client)
    
    Returns:
        bool: True if bucket created, False otherwise
    """
    try:
        if region is None or region == 'us-east-1':
            # us-east-1 doesn't require LocationConstraint
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            # Other regions require LocationConstraint
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region}
            )
        print(f"SUCCESS: Bucket '{bucket_name}' created successfully")
        return True
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'BucketAlreadyExists':
            print(f"ERROR: Bucket '{bucket_name}' already exists (owned by someone else)")
        elif error_code == 'BucketAlreadyOwnedByYou':
            print(f"INFO: Bucket '{bucket_name}' already exists and is owned by you")
        else:
            print(f"ERROR: Failed to create bucket - {e}")
        return False

# Example usage (uncomment to test):
create_bucket(S3_BUCKET_COMPLETE_PIPELINE, region=AWS_REGION)

INFO: Bucket 's3-complete-pipeline' already exists and is owned by you


False

## 1.3 Upload Raw Data

Upload files to the `raw/` folder. This is where Glue Crawlers will discover and catalog the data schema.

In [None]:
# upload a file to s3
def upload_file(file_name, bucket, object_name=None):
    """
    Upload a file to an S3 bucket
    
    Args:
        file_name (str): Path to file to upload
        bucket (str): Bucket name
        object_name (str): S3 object name (if None, uses file_name)
    
    Returns:
        bool: True if upload successful, False otherwise
    """
    # If S3 object_name not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)
    
    try:
        s3_client.upload_file(file_name, bucket, object_name)
        print(f"SUCCESS: '{file_name}' uploaded to '{bucket}/{object_name}'")
        return True
    except FileNotFoundError:
        print(f"ERROR: File '{file_name}' not found")
        return False
    except ClientError as e:
        print(f"ERROR: Failed to upload file - {e}")
        return False

# Example usage (uncomment to test):
upload_file('data/hosts.csv', S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')

SUCCESS: 'data/hosts.csv' uploaded to 's3-complete-pipeline/raw/hosts.csv'


True

## 1.4 Verify Upload (Read Content)

Read the uploaded file directly from S3 to confirm the data landed correctly.

In [32]:
def read_object(bucket, object_name):
    """
    Read S3 object content directly into memory
    
    Args:
        bucket (str): Bucket name
        object_name (str): S3 object name to read
    
    Returns:
        str: File content as string, or None if error
    """
    try:
        response = s3_client.get_object(Bucket=bucket, Key=object_name)
        content = response['Body'].read().decode('utf-8')
        print(f"SUCCESS: Read {len(content)} characters from '{bucket}/{object_name}'")
        return content
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'NoSuchKey':
            print(f"ERROR: Object '{object_name}' not found in bucket '{bucket}'")
        else:
            print(f"ERROR: Failed to read object - {e}")
        return None
    except Exception as e:
        print(f"ERROR: Unexpected error - {e}")
        return None

# Example usage (uncomment to test):
content = read_object(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')
if content:
    print("Content:")
    print(content[:500])  # Print first 500 characters

SUCCESS: Read 13083 characters from 's3-complete-pipeline/raw/hosts.csv'
Content:
host_id,host_name,host_since,is_superhost,response_rate,created_at
1,Timothy Parker,2018-03-20,False,99,2025-12-26 14:15:54.011160
2,Hannah Evans,2024-01-01,False,95,2025-12-26 14:15:54.011160
3,Crystal Green,2016-08-06,False,74,2025-12-26 14:15:54.011160
4,Kevin Johnson,2020-02-25,False,100,2025-12-26 14:15:54.011160
5,Monica Johnson,2024-11-11,False,77,2025-12-26 14:15:54.011160
6,Nancy Turner,2016-11-03,False,96,2025-12-26 14:15:54.011160
7,Gerald Hunt,2022-01-21,True,99,2025-12-26 14:


## 1.5 List Objects in Bucket

View all objects in the `raw/` folder with detailed metadata (size, last modified, storage class).

In [33]:
from datetime import datetime

def list_objects_detailed(bucket, prefix=''):
    """
    List all objects in a bucket with detailed metadata
    
    Args:
        bucket (str): Bucket name
        prefix (str): Filter objects by prefix (folder path)
    
    Returns:
        list: List of object metadata dictionaries, or empty list if error
    """
    try:
        response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        
        if 'Contents' not in response:
            print(f"No objects found in bucket '{bucket}' with prefix '{prefix}'")
            return []
        
        objects = []
        print(f"Objects in '{bucket}/{prefix}':")
        
        for obj in response['Contents']:
            # Convert size to human-readable format
            size_bytes = obj['Size']
            if size_bytes < 1024:
                size_str = f"{size_bytes} B"
            elif size_bytes < 1024**2:
                size_str = f"{size_bytes/1024:.2f} KB"
            else:
                size_str = f"{size_bytes/(1024**2):.2f} MB"
            
            # Format last modified date
            last_modified = obj['LastModified'].strftime('%Y-%m-%d %H:%M:%S')
            
            print(f"Key: {obj['Key']}")
            print(f"Size: {size_str} ({size_bytes:,} bytes)")
            print(f"Last Modified: {last_modified}")
            print(f"Storage Class: {obj.get('StorageClass', 'STANDARD')}")
            print(f"ETag: {obj['ETag']}")
            
            objects.append(obj)
        
        print(f"Total: {len(objects)} object(s)")
        return objects
        
    except ClientError as e:
        print(f"ERROR: Failed to list objects - {e}")
        return []

# Example usage (uncomment to test):
list_objects_detailed(S3_BUCKET_COMPLETE_PIPELINE, prefix=f'{S3_UPLOAD_FOLDER}')

Objects in 's3-complete-pipeline/raw/':
Key: raw/hosts.csv
Size: 12.78 KB (13,083 bytes)
Last Modified: 2026-01-31 03:02:32
Storage Class: STANDARD
ETag: "7588197a4f4c485949e7bfc641356122"
Total: 1 object(s)


[{'Key': 'raw/hosts.csv',
  'LastModified': datetime.datetime(2026, 1, 31, 3, 2, 32, tzinfo=tzutc()),
  'ETag': '"7588197a4f4c485949e7bfc641356122"',
  'ChecksumAlgorithm': ['CRC32'],
  'ChecksumType': 'FULL_OBJECT',
  'Size': 13083,
  'StorageClass': 'STANDARD'}]

## 1.6 Get Object Metadata

Retrieve detailed metadata for a specific object (content type, encryption, custom tags).

In [34]:
def get_object_metadata(bucket, object_name):
    """
    Retrieve metadata for an S3 object
    
    Args:
        bucket (str): Bucket name
        object_name (str): S3 object name (key)
    
    Returns:
        dict: Metadata dictionary, or None if error
    """
    try:
        response = s3_client.head_object(Bucket=bucket, Key=object_name)
        
        print(f"Metadata for '{bucket}/{object_name}':")
        
        # System metadata
        print("SYSTEM METADATA:")
        print(f"Content-Type: {response.get('ContentType', 'N/A')}")
        print(f"Content-Length: {response.get('ContentLength', 0):,} bytes")
        print(f"Last-Modified: {response.get('LastModified', 'N/A')}")
        print(f"ETag: {response.get('ETag', 'N/A')}")
        print(f"Storage-Class: {response.get('StorageClass', 'STANDARD')}")
        
        # User metadata (custom)
        user_metadata = response.get('Metadata', {})
        if user_metadata:
            print("USER METADATA (Custom):")
            for key, value in user_metadata.items():
                print(f"  {key}: {value}")
        else:
            print("USER METADATA: None")
        
        return response
        
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            print(f"ERROR: Object '{object_name}' not found in bucket '{bucket}'")
        else:
            print(f"ERROR: Failed to get metadata - {e}")
        return None

# Example usage (uncomment to test):
get_object_metadata(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv')

Metadata for 's3-complete-pipeline/raw/hosts.csv':
SYSTEM METADATA:
Content-Type: binary/octet-stream
Content-Length: 13,083 bytes
Last-Modified: 2026-01-31 03:02:32+00:00
ETag: "7588197a4f4c485949e7bfc641356122"
Storage-Class: STANDARD
USER METADATA: None


{'ResponseMetadata': {'RequestId': 'W1QD945F3JFZGWF0',
  'HostId': 'JDY8bqiBgPQQdc9mhRexUjWGk+L4itRZrVyrD+77LMFXx7seuBMPzgEeN6dAuj4WDzNYLB4zhws=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JDY8bqiBgPQQdc9mhRexUjWGk+L4itRZrVyrD+77LMFXx7seuBMPzgEeN6dAuj4WDzNYLB4zhws=',
   'x-amz-request-id': 'W1QD945F3JFZGWF0',
   'date': 'Sat, 31 Jan 2026 03:03:04 GMT',
   'last-modified': 'Sat, 31 Jan 2026 03:02:32 GMT',
   'etag': '"7588197a4f4c485949e7bfc641356122"',
   'x-amz-server-side-encryption': 'AES256',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '13083',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2026, 1, 31, 3, 2, 32, tzinfo=tzutc()),
 'ContentLength': 13083,
 'ETag': '"7588197a4f4c485949e7bfc641356122"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {}}

## 1.7 Generate Presigned URL (Share Data)

Create time-limited URLs for secure sharing without exposing AWS credentials.

| Expiration | Use Case |
|------------|----------|
| 600s (10 min) | Quick one-time downloads |
| 3600s (1 hour) | Team collaboration |
| 86400s (24 hours) | External sharing |
| 604800s (7 days) | Maximum allowed |

In [35]:
def generate_presigned_download_url(bucket, object_name, expiration=3600):
    """
    Generate a presigned URL for downloading an S3 object
    
    Args:
        bucket (str): Bucket name
        object_name (str): S3 object name (key)
        expiration (int): URL expiration time in seconds (default 3600 = 1 hour)
    
    Returns:
        str: Presigned URL, or None if error
    
    Common expiration times:
        - 3600 = 1 hour (default)
        - 7200 = 2 hours
        - 86400 = 24 hours
        - 604800 = 7 days (maximum)
    
    Note: Uses AWS Signature Version 4 (required by S3)
    """
    try:
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': bucket,
                'Key': object_name
            },
            ExpiresIn=expiration
        )
        
        print(f"SUCCESS: Presigned URL generated for '{bucket}/{object_name}'")
        print(f"Expires in: {expiration} seconds ({expiration/3600:.1f} hours)")
        print(f"\nURL (valid for {expiration/3600:.1f} hours):")
        print(url)
        print("\nAnyone with this URL can download the file until it expires.")
        
        return url
        
    except ClientError as e:
        print(f"ERROR: Failed to generate presigned URL - {e}")
        return None


# Example usage (uncomment to test):
url = generate_presigned_download_url(S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}hosts.csv', expiration=600)

SUCCESS: Presigned URL generated for 's3-complete-pipeline/raw/hosts.csv'
Expires in: 600 seconds (0.2 hours)

URL (valid for 0.2 hours):
https://s3.us-east-2.amazonaws.com/s3-complete-pipeline/raw/hosts.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAWPVGU3OO6BQJAVE3%2F20260131%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20260131T030321Z&X-Amz-Expires=600&X-Amz-SignedHeaders=host&X-Amz-Signature=32c0c87db9fa04e5c387a2c5e06e1ff6d693ac6fa3b0b96553197a457e9e52ae

Anyone with this URL can download the file until it expires.


# PHASE 2: DATA CATALOG

```
┌─────────────────────────────────────────────────────────────────────────────┐
│  STEP 2: CATALOG DATA WITH GLUE CRAWLER                                    │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   S3 (raw/)  ────►  Glue Crawler  ────►  Data Catalog                       │
│                     (auto-schema)        (database + tables)                │
│                                                                             │
│   Key Operations:                                                           │
│   • Create Glue database (metadata container)                               │
│   • Create & run crawler (discovers schema from S3)                         │
│   • List tables (verify catalog entries)                                    │
│   • Query-ready tables for Athena/ETL                                       │
│                                                                             │
│   Crawler detects:                                                          │
│   • File format (CSV, JSON, Parquet, etc.)                                  │
│   • Column names and data types                                             │
│   • Partition structure                                                     │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

## 2.1 Initialize Glue Client

In [41]:
# Initialize Glue Client
glue_client = boto3.client(
    'glue',
    region_name=AWS_REGION,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY
)

# Helper function for security
def redact_account_id(arn):
    """Redact AWS account ID from ARN"""
    return re.sub(r':\d{12}:', ':************:', str(arn))

print("Glue client initialized")

Glue client initialized


## 2.2 Create Glue Database

The database is a **logical container** for tables in the Data Catalog. Tables discovered by crawlers are stored here.

In [42]:
def create_glue_database(database_name, description=''):
    """
    Create a database in the AWS Glue Data Catalog
    
    Args:
        database_name (str): Name (lowercase, no spaces)
        description (str): Optional description
    
    Returns:
        bool: True if created successfully
    """
    try:
        print(f"Creating Glue database '{database_name}'...")
        
        glue_client.create_database(
            DatabaseInput={
                'Name': database_name,
                'Description': description
            }
        )
        
        print(f"SUCCESS: Database '{database_name}' created")
        return True
        
    except ClientError as e:
        if e.response['Error']['Code'] == 'AlreadyExistsException':
            print(f"Database '{database_name}' already exists")
            return True
        print(f"ERROR: {e}")
        return False

# Test
create_glue_database('aws_full_pipeline_db', 'AWS Glue database for complete pipeline')

Creating Glue database 'aws_full_pipeline_db'...
Database 'aws_full_pipeline_db' already exists


True

## 2.3 List Databases

View all databases in the Glue Data Catalog.

In [43]:
def list_glue_databases():
    """
    List all databases in the Glue Data Catalog
    
    Returns:
        list: Database names
    """
    try:
        print("Listing Glue databases...")
        print("-" * 60)
        
        response = glue_client.get_databases()
        databases = response.get('DatabaseList', [])
        
        if not databases:
            print("No databases found")
            return []
        
        print(f"Found {len(databases)} database(s):\n")
        
        for db in databases:
            print(f"Database: {db['Name']}")
            print(f"  Description: {db.get('Description', 'N/A')}")
            print()
        
        return [db['Name'] for db in databases]
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return []

# Test
list_glue_databases()

Listing Glue databases...
------------------------------------------------------------
Found 4 database(s):

Database: aws_full_pipeline_db
  Description: AWS Glue database for complete pipeline

Database: data_engineering_db
  Description: Learning database

Database: default
  Description: Default Hive database

Database: glue_db
  Description: A glue databse



['aws_full_pipeline_db', 'data_engineering_db', 'default', 'glue_db']

## 2.4 Create Crawler

Crawlers automatically discover schema from S3 data and populate the Data Catalog with table definitions.

In [44]:
def create_glue_crawler(crawler_name, database_name, s3_path, description=''):
    """
    Create a Glue crawler
    
    Args:
        crawler_name (str): Crawler name
        database_name (str): Target database
        s3_path (str): S3 path to crawl (e.g., 's3://bucket/path/')
    """
    try:
        print(f"Creating crawler '{crawler_name}'...")
        print(f"Target: {database_name}")
        print(f"Path: {s3_path}")
        
        glue_client.create_crawler(
            Name=crawler_name,
            Role=GLUE_ROLE_ARN,
            DatabaseName=database_name,
            Description=description,
            Targets={'S3Targets': [{'Path': s3_path}]},
            SchemaChangePolicy={
                'UpdateBehavior': 'UPDATE_IN_DATABASE',
                'DeleteBehavior': 'LOG'
            }
        )
        
        print(f"SUCCESS: Crawler '{crawler_name}' created")
        return True
        
    except ClientError as e:
        if e.response['Error']['Code'] == 'AlreadyExistsException':
            print(f"Crawler '{crawler_name}' already exists")
            return True
        print(f"ERROR: {e}")
        return False

# Example
create_glue_crawler('full_pipeline_crawler', 'aws_full_pipeline_db', f's3://{S3_BUCKET_COMPLETE_PIPELINE}/raw/')

Creating crawler 'full_pipeline_crawler'...
Target: aws_full_pipeline_db
Path: s3://s3-complete-pipeline/raw/
Crawler 'full_pipeline_crawler' already exists


True

## 2.5 Run Crawler

Execute the crawler to scan S3 and update the Data Catalog. Use `wait=True` to block until completion.

In [45]:
def run_crawler(crawler_name, wait=False):
    """
    Start a Glue crawler
    
    Args:
        crawler_name (str): Crawler name
        wait (bool): Wait for completion
    """
    try:
        print(f"Starting crawler '{crawler_name}'...")
        glue_client.start_crawler(Name=crawler_name)
        print(f"Crawler started")
        
        if wait:
            print("Waiting for completion...")
            while True:
                response = glue_client.get_crawler(Name=crawler_name)
                state = response['Crawler']['State']
                
                if state == 'READY':
                    # Wait for stats to update, then re-fetch
                    time.sleep(2)
                    response = glue_client.get_crawler(Name=crawler_name)
                    last = response['Crawler'].get('LastCrawl', {})
                    print(f"Completed: {last.get('Status', 'Unknown')}")
                    break
                print(f"  State: {state}")
                time.sleep(10)
        
        return True
        
    except ClientError as e:
        if e.response['Error']['Code'] == 'CrawlerRunningException':
            print("Crawler already running")
            return True
        print(f"ERROR: {e}")
        return False

# Example
run_crawler('full_pipeline_crawler', wait=True)

Starting crawler 'full_pipeline_crawler'...
Crawler started
Waiting for completion...
  State: RUNNING
  State: RUNNING
  State: RUNNING
  State: RUNNING
  State: RUNNING
Completed: SUCCEEDED


True

## 2.6 List Tables (Verify Catalog)

**Critical step** - Verify the crawler created tables in the Data Catalog. These tables are now queryable by Athena.

In [46]:
def list_tables(database_name):
    """
    List all tables in a Glue database
    
    Args:
        database_name (str): Database name
    
    Returns:
        list: Table information
    """
    try:
        print(f"Tables in database '{database_name}':")
        print("-" * 60)
        
        response = glue_client.get_tables(DatabaseName=database_name)
        tables = response.get('TableList', [])
        
        if not tables:
            print("No tables found")
            return []
        
        for table in tables:
            print(f"\nTable: {table['Name']}")
            print(f"  Location: {table.get('StorageDescriptor', {}).get('Location', 'N/A')}")
            print(f"  Format: {table.get('Parameters', {}).get('classification', 'N/A')}")
            
            # Show columns
            columns = table.get('StorageDescriptor', {}).get('Columns', [])
            if columns:
                print(f"  Columns ({len(columns)}):")
                for col in columns[:5]:  # Show first 5
                    print(f"    - {col['Name']}: {col['Type']}")
                if len(columns) > 5:
                    print(f"    ... and {len(columns) - 5} more")
        
        print(f"\nTotal: {len(tables)} table(s)")
        return tables
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return []

# Verify crawler created tables
list_tables('aws_full_pipeline_db')

Tables in database 'aws_full_pipeline_db':
------------------------------------------------------------

Table: raw
  Location: s3://s3-complete-pipeline/raw/
  Format: csv
  Columns (6):
    - host_id: bigint
    - host_name: string
    - host_since: string
    - is_superhost: boolean
    - response_rate: bigint
    ... and 1 more

Total: 1 table(s)


[{'Name': 'raw',
  'DatabaseName': 'aws_full_pipeline_db',
  'Owner': 'owner',
  'CreateTime': datetime.datetime(2026, 1, 30, 22, 12, 26, tzinfo=tzlocal()),
  'UpdateTime': datetime.datetime(2026, 1, 30, 22, 12, 26, tzinfo=tzlocal()),
  'LastAccessTime': datetime.datetime(2026, 1, 30, 22, 12, 26, tzinfo=tzlocal()),
  'Retention': 0,
  'StorageDescriptor': {'Columns': [{'Name': 'host_id', 'Type': 'bigint'},
    {'Name': 'host_name', 'Type': 'string'},
    {'Name': 'host_since', 'Type': 'string'},
    {'Name': 'is_superhost', 'Type': 'boolean'},
    {'Name': 'response_rate', 'Type': 'bigint'},
    {'Name': 'created_at', 'Type': 'string'}],
   'Location': 's3://s3-complete-pipeline/raw/',
   'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
   'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
   'Compressed': False,
   'NumberOfBuckets': -1,
   'SerdeInfo': {'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
    'Parameters':

## 2.7 List Crawlers

View all crawlers in your account with their current state.

In [47]:
def list_crawlers():
    """
    List all Glue crawlers
    
    Returns:
        list: Crawler names and states
    """
    try:
        print("Glue Crawlers:")
        print("-" * 60)
        
        response = glue_client.get_crawlers()
        crawlers = response.get('Crawlers', [])
        
        if not crawlers:
            print("No crawlers found")
            return []
        
        for crawler in crawlers:
            print(f"\nCrawler: {crawler['Name']}")
            print(f"  State: {crawler['State']}")
            print(f"  Database: {crawler.get('DatabaseName', 'N/A')}")
            
            # Last crawl info
            last = crawler.get('LastCrawl', {})
            if last:
                print(f"  Last Run: {last.get('Status', 'N/A')}")
                print(f"  Tables Created: {last.get('TablesCreated', 0)}")
                print(f"  Tables Updated: {last.get('TablesUpdated', 0)}")
        
        print(f"\nTotal: {len(crawlers)} crawler(s)")
        return crawlers
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return []

# List all crawlers
list_crawlers()

Glue Crawlers:
------------------------------------------------------------

Crawler: db_s3_crawler
  State: READY
  Database: glue_db
  Last Run: SUCCEEDED
  Tables Created: 0
  Tables Updated: 0

Crawler: full_pipeline_crawler
  State: READY
  Database: aws_full_pipeline_db
  Last Run: SUCCEEDED
  Tables Created: 0
  Tables Updated: 0

Crawler: my-crawler
  State: READY
  Database: data_engineering_db
  Last Run: SUCCEEDED
  Tables Created: 0
  Tables Updated: 0

Total: 3 crawler(s)


[{'Name': 'db_s3_crawler',
  'Role': 'glue-access-s3',
  'Targets': {'S3Targets': [{'Path': 's3://real-learn-s3/processed/',
     'Exclusions': []}],
   'JdbcTargets': [],
   'MongoDBTargets': [],
   'DynamoDBTargets': [],
   'CatalogTargets': [],
   'DeltaTargets': [],
   'IcebergTargets': [],
   'HudiTargets': []},
  'DatabaseName': 'glue_db',
  'Classifiers': [],
  'RecrawlPolicy': {'RecrawlBehavior': 'CRAWL_EVERYTHING'},
  'SchemaChangePolicy': {'UpdateBehavior': 'UPDATE_IN_DATABASE',
   'DeleteBehavior': 'DEPRECATE_IN_DATABASE'},
  'LineageConfiguration': {'CrawlerLineageSettings': 'DISABLE'},
  'State': 'READY',
  'CrawlElapsedTime': 0,
  'CreationTime': datetime.datetime(2026, 1, 30, 0, 15, 47, tzinfo=tzlocal()),
  'LastUpdated': datetime.datetime(2026, 1, 30, 0, 15, 47, tzinfo=tzlocal()),
  'LastCrawl': {'Status': 'SUCCEEDED',
   'ErrorMessage': 'Service Principal: glue.amazonaws.com is not authorized to perform: logs:PutLogEvents on resource: arn:aws:logs:us-east-2:44595235113

## 2.8 Cleanup Functions

Delete crawlers and databases when no longer needed. **Use with caution!**

In [48]:
def delete_crawler(crawler_name):
    """
    Delete a Glue crawler
    
    Args:
        crawler_name (str): Crawler name
    """
    try:
        print(f"Deleting crawler '{crawler_name}'...")
        glue_client.delete_crawler(Name=crawler_name)
        print(f"SUCCESS: Crawler '{crawler_name}' deleted")
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            print(f"Crawler '{crawler_name}' not found")
        else:
            print(f"ERROR: {e}")
        return False


def delete_database(database_name):
    """
    Delete a Glue database and all its tables
    
    Args:
        database_name (str): Database name
    """
    try:
        print(f"Deleting database '{database_name}'...")
        glue_client.delete_database(Name=database_name)
        print(f"SUCCESS: Database '{database_name}' deleted")
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            print(f"Database '{database_name}' not found")
        else:
            print(f"ERROR: {e}")
        return False


# Example (uncomment to use):
# delete_crawler('full_pipeline_crawler')
# delete_database('aws_full_pipeline_db')

# PHASE 3: DATA TRANSFORMATION (ETL)

```
┌─────────────────────────────────────────────────────────────────────────────┐
│  STEP 3: TRANSFORM DATA WITH GLUE ETL JOB                                  │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│   S3 (raw/)  ────►  Glue ETL Job  ────►  S3 (processed/)                    │
│   (CSV)             (Spark/Python)       (Parquet)                          │
│                                                                             │
│   Key Operations:                                                           │
│   • Upload ETL script to S3                                                 │
│   • Create Glue job pointing to script                                      │
│   • Run job with parameters                                                 │
│   • Monitor job execution                                                   │
│                                                                             │
│   Transformations in aws_glue_etl.py:                                       │
│   • Null value checking & rejection                                         │
│   • Duplicate detection & handling                                          │
│   • Calculated columns:                                                     │
│     - total_booking_amount = nights_booked * booking_amount                 │
│     - additional_cost = cleaning_fee + service_fee                          │
│     - total_cost = total_booking_amount + additional_cost                   │
│   • Output as Parquet (compressed, columnar)                                │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```

## 3.1 Upload Raw Data (bookings.csv)

In [49]:
upload_file('data/bookings.csv', S3_BUCKET_COMPLETE_PIPELINE, f'{S3_UPLOAD_FOLDER}bookings.csv')

SUCCESS: 'data/bookings.csv' uploaded to 's3-complete-pipeline/raw/bookings.csv'


True

## 3.2 Upload ETL Script to S3

Glue jobs require the Python script to be stored in S3.

In [55]:
# upload the ETL script to S3
upload_file('scripts/aws_glue_etl.py', S3_BUCKET_COMPLETE_PIPELINE, 'scripts/aws_glue_etl.py')

SUCCESS: 'scripts/aws_glue_etl.py' uploaded to 's3-complete-pipeline/scripts/aws_glue_etl.py'


True

## 3.3 Create Glue Job

Create an ETL job that points to the script in S3. Job types:

| Type | Use Case | Workers |
|------|----------|---------|
| `glueetl` | Spark-based, large datasets | 2+ DPUs |
| `pythonshell` | Simple Python, small data | 0.0625-1 DPU |

In [56]:
def create_glue_job(job_name, script_location, description='', job_type='glueetl', 
                    worker_type='G.1X', num_workers=2, timeout=60, max_retries=0):
    """
    Create a Glue ETL job
    
    Args:
        job_name (str): Job name
        script_location (str): S3 path to ETL script (s3://bucket/path/script.py)
        description (str): Job description
        job_type (str): 'glueetl' (Spark) or 'pythonshell'
        worker_type (str): 'G.1X', 'G.2X', 'G.025X' (for pythonshell)
        num_workers (int): Number of workers (min 2 for glueetl)
        timeout (int): Job timeout in minutes
        max_retries (int): Number of retries on failure
    
    Returns:
        bool: True if created successfully
    """
    try:
        print(f"Creating Glue job '{job_name}'...")
        print(f"  Script: {script_location}")
        print(f"  Type: {job_type}")
        
        job_config = {
            'Name': job_name,
            'Description': description,
            'Role': GLUE_ROLE_ARN,
            'Command': {
                'Name': job_type,
                'ScriptLocation': script_location,
                'PythonVersion': '3'
            },
            'DefaultArguments': {
                '--job-language': 'python',
                '--enable-metrics': 'true',
                '--enable-continuous-cloudwatch-log': 'true'
            },
            'Timeout': timeout,
            'MaxRetries': max_retries,
            'GlueVersion': '4.0'
        }
        
        # Add worker config for glueetl jobs
        if job_type == 'glueetl':
            job_config['WorkerType'] = worker_type
            job_config['NumberOfWorkers'] = num_workers
        
        glue_client.create_job(**job_config)
        
        print(f"SUCCESS: Job '{job_name}' created")
        return True
        
    except ClientError as e:
        if e.response['Error']['Code'] == 'AlreadyExistsException':
            print(f"Job '{job_name}' already exists")
            return True
        print(f"ERROR: {e}")
        return False


# Create the bookings ETL job
script_path = f's3://{S3_BUCKET_COMPLETE_PIPELINE}/scripts/aws_glue_etl.py'
create_glue_job(
    job_name='bookings-etl-job',
    script_location=script_path,
    description='Transform bookings data: null check, dedup, calculate totals'
)

Creating Glue job 'bookings-etl-job'...
  Script: s3://s3-complete-pipeline/scripts/aws_glue_etl.py
  Type: glueetl
SUCCESS: Job 'bookings-etl-job' created


True

## 3.4 Run Glue Job

Execute the job with parameters. The job reads from `raw/bookings.csv` and writes to `processed/`.

In [57]:
def run_glue_job(job_name, arguments=None, wait=False):
    """
    Start a Glue job run
    
    Args:
        job_name (str): Job name
        arguments (dict): Job arguments (e.g., {'--S3_BUCKET': 'my-bucket'})
        wait (bool): Wait for completion
    
    Returns:
        str: Job run ID, or None if error
    """
    try:
        print(f"Starting job '{job_name}'...")
        
        run_config = {'JobName': job_name}
        if arguments:
            run_config['Arguments'] = arguments
            print(f"  Arguments: {arguments}")
        
        response = glue_client.start_job_run(**run_config)
        run_id = response['JobRunId']
        
        print(f"Job started. Run ID: {run_id}")
        
        if wait:
            print("Waiting for completion...")
            while True:
                status_response = glue_client.get_job_run(JobName=job_name, RunId=run_id)
                state = status_response['JobRun']['JobRunState']
                
                if state in ['SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT']:
                    print(f"Job {state}")
                    if state == 'FAILED':
                        error = status_response['JobRun'].get('ErrorMessage', 'Unknown')
                        print(f"Error: {error}")
                    elif state == 'SUCCEEDED':
                        duration = status_response['JobRun'].get('ExecutionTime', 0)
                        print(f"Duration: {duration} seconds")
                    break
                    
                print(f"  State: {state}")
                time.sleep(30)
        
        return run_id
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return None


# Run the bookings ETL job
run_glue_job(
    job_name='bookings-etl-job',
    arguments={
        '--S3_BUCKET': S3_BUCKET_COMPLETE_PIPELINE,
        '--SOURCE_PREFIX': 'raw',
        '--TARGET_PREFIX': 'processed',
        '--DUPLICATE_HANDLING': 'keep_first'
    },
    wait=True
)

Starting job 'bookings-etl-job'...
  Arguments: {'--S3_BUCKET': 's3-complete-pipeline', '--SOURCE_PREFIX': 'raw', '--TARGET_PREFIX': 'processed', '--DUPLICATE_HANDLING': 'keep_first'}
Job started. Run ID: jr_96843101dee5f3eecd5d11029c418eea5ebb7b886849f9024d76c2ca5a312d2b
Waiting for completion...
  State: RUNNING
  State: RUNNING
  State: RUNNING
Job SUCCEEDED
Duration: 74 seconds


'jr_96843101dee5f3eecd5d11029c418eea5ebb7b886849f9024d76c2ca5a312d2b'

## 3.5 Get Job Run Status

Check the status of a specific job run.

In [58]:
def get_job_run_status(job_name, run_id=None):
    """
    Get status of a job run (latest if run_id not specified)
    
    Args:
        job_name (str): Job name
        run_id (str): Specific run ID (optional)
    
    Returns:
        dict: Job run details
    """
    try:
        if run_id:
            response = glue_client.get_job_run(JobName=job_name, RunId=run_id)
            runs = [response['JobRun']]
        else:
            response = glue_client.get_job_runs(JobName=job_name, MaxResults=1)
            runs = response.get('JobRuns', [])
        
        if not runs:
            print(f"No runs found for job '{job_name}'")
            return None
        
        run = runs[0]
        print(f"Job Run Status for '{job_name}':")
        print("-" * 50)
        print(f"  Run ID: {run['Id']}")
        print(f"  State: {run['JobRunState']}")
        print(f"  Started: {run.get('StartedOn', 'N/A')}")
        print(f"  Completed: {run.get('CompletedOn', 'N/A')}")
        print(f"  Duration: {run.get('ExecutionTime', 0)} seconds")
        
        if run['JobRunState'] == 'FAILED':
            print(f"  Error: {run.get('ErrorMessage', 'Unknown')}")
        
        return run
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return None


# Check latest run status
get_job_run_status('bookings-etl-job')

Job Run Status for 'bookings-etl-job':
--------------------------------------------------
  Run ID: jr_96843101dee5f3eecd5d11029c418eea5ebb7b886849f9024d76c2ca5a312d2b
  State: SUCCEEDED
  Started: 2026-01-30 23:06:44.458000-05:00
  Completed: 2026-01-30 23:08:05.023000-05:00
  Duration: 74 seconds


{'Id': 'jr_96843101dee5f3eecd5d11029c418eea5ebb7b886849f9024d76c2ca5a312d2b',
 'Attempt': 0,
 'JobName': 'bookings-etl-job',
 'JobMode': 'SCRIPT',
 'JobRunQueuingEnabled': False,
 'StartedOn': datetime.datetime(2026, 1, 30, 23, 6, 44, 458000, tzinfo=tzlocal()),
 'LastModifiedOn': datetime.datetime(2026, 1, 30, 23, 8, 5, 23000, tzinfo=tzlocal()),
 'CompletedOn': datetime.datetime(2026, 1, 30, 23, 8, 5, 23000, tzinfo=tzlocal()),
 'JobRunState': 'SUCCEEDED',
 'Arguments': {'--TARGET_PREFIX': 'processed',
  '--DUPLICATE_HANDLING': 'keep_first',
  '--S3_BUCKET': 's3-complete-pipeline',
  '--SOURCE_PREFIX': 'raw'},
 'PredecessorRuns': [],
 'AllocatedCapacity': 2,
 'ExecutionTime': 74,
 'Timeout': 60,
 'MaxCapacity': 2.0,
 'WorkerType': 'G.1X',
 'NumberOfWorkers': 2,
 'LogGroupName': '/aws-glue/jobs',
 'GlueVersion': '4.0'}

## 3.6 List Jobs

View all Glue jobs in the account.

In [59]:
def list_jobs():
    """
    List all Glue jobs
    
    Returns:
        list: Job details
    """
    try:
        print("Glue Jobs:")
        print("-" * 60)
        
        response = glue_client.get_jobs()
        jobs = response.get('Jobs', [])
        
        if not jobs:
            print("No jobs found")
            return []
        
        for job in jobs:
            print(f"\nJob: {job['Name']}")
            print(f"  Description: {job.get('Description', 'N/A')}")
            print(f"  Type: {job['Command']['Name']}")
            print(f"  Glue Version: {job.get('GlueVersion', 'N/A')}")
            print(f"  Workers: {job.get('NumberOfWorkers', 'N/A')}")
            print(f"  Timeout: {job.get('Timeout', 'N/A')} min")
        
        print(f"\nTotal: {len(jobs)} job(s)")
        return jobs
        
    except ClientError as e:
        print(f"ERROR: {e}")
        return []


# List all jobs
list_jobs()

Glue Jobs:
------------------------------------------------------------

Job: aws-glue-pipeline
  Description: 
  Type: glueetl
  Glue Version: 5.0
  Workers: 10
  Timeout: 480 min

Job: bookings-etl-job
  Description: Transform bookings data: null check, dedup, calculate totals
  Type: glueetl
  Glue Version: 4.0
  Workers: 2
  Timeout: 60 min

Job: my-etl-job
  Description: Test ETL job
  Type: pythonshell
  Glue Version: 3.0
  Workers: N/A
  Timeout: 60 min

Total: 3 job(s)


[{'Name': 'aws-glue-pipeline',
  'JobMode': 'VISUAL',
  'JobRunQueuingEnabled': False,
  'Description': '',
  'Role': 'arn:aws:iam::445952351133:role/glue-access-s3',
  'CreatedOn': datetime.datetime(2026, 1, 30, 0, 28, 24, 730000, tzinfo=tzlocal()),
  'LastModifiedOn': datetime.datetime(2026, 1, 30, 0, 43, 37, 889000, tzinfo=tzlocal()),
  'ExecutionProperty': {'MaxConcurrentRuns': 1},
  'Command': {'Name': 'glueetl',
   'ScriptLocation': 's3://aws-glue-assets-445952351133-us-east-2/scripts/aws-glue-pipeline.py',
   'PythonVersion': '3'},
  'DefaultArguments': {'--enable-metrics': 'true',
   '--enable-spark-ui': 'true',
   '--extra-py-files': 's3://aws-glue-studio-transforms-251189692203-prod-us-east-2/gs_common.py,s3://aws-glue-studio-transforms-251189692203-prod-us-east-2/gs_derived.py',
   '--spark-event-logs-path': 's3://aws-glue-assets-445952351133-us-east-2/sparkHistoryLogs/',
   '--enable-job-insights': 'true',
   '--enable-observability-metrics': 'true',
   '--conf': 'spark.sql

## 3.7 Verify Output

Check the processed folder to confirm Parquet files were created.

In [60]:
# Verify processed data was created
list_objects_detailed(S3_BUCKET_COMPLETE_PIPELINE, prefix='processed/')

Objects in 's3-complete-pipeline/processed/':
Key: processed/bookings/part-00000-42d4a9f1-0e64-465d-8bdb-129f63975069-c000.snappy.parquet
Size: 262.24 KB (268,529 bytes)
Last Modified: 2026-01-31 04:07:54
Storage Class: STANDARD
ETag: "99d0150fe63b8500442e47b32d495d0e-1"
Total: 1 object(s)


[{'Key': 'processed/bookings/part-00000-42d4a9f1-0e64-465d-8bdb-129f63975069-c000.snappy.parquet',
  'LastModified': datetime.datetime(2026, 1, 31, 4, 7, 54, tzinfo=tzutc()),
  'ETag': '"99d0150fe63b8500442e47b32d495d0e-1"',
  'ChecksumAlgorithm': ['CRC64NVME'],
  'ChecksumType': 'FULL_OBJECT',
  'Size': 268529,
  'StorageClass': 'STANDARD'}]

## 3.8 Cleanup Functions

Delete jobs when no longer needed. **Use with caution!**

In [62]:
def delete_job(job_name):
    """
    Delete a Glue job
    
    Args:
        job_name (str): Job name
    """
    try:
        print(f"Deleting job '{job_name}'...")
        glue_client.delete_job(JobName=job_name)
        print(f"SUCCESS: Job '{job_name}' deleted")
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            print(f"Job '{job_name}' not found")
        else:
            print(f"ERROR: {e}")
        return False


# Example (uncomment to use):
delete_job('bookings-etl-job')
delete_job('my-etl-job')
delete_job('aws-glue-pipeline')

Deleting job 'bookings-etl-job'...
SUCCESS: Job 'bookings-etl-job' deleted
Deleting job 'my-etl-job'...
SUCCESS: Job 'my-etl-job' deleted
Deleting job 'aws-glue-pipeline'...
SUCCESS: Job 'aws-glue-pipeline' deleted


True