In [19]:
!pip install boto3



In [20]:
import boto3
import re

# AWS Region
AWS_REGION = "us-east-1"

# Initialize AWS Clients
s3 = boto3.client("s3", region_name=AWS_REGION)
sns = boto3.client("sns", region_name=AWS_REGION)

# ✅ Hardcoded SNS Topic Name
SNS_TOPIC_NAME = "email-path-distribution-topic"

In [21]:
# ✅ Input: Full S3 URL
S3_URL = "s3://email-sec-datasets-bronze/email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/"


In [22]:
# ✅ Function to Extract Bucket Name & Prefix from S3 URL
def parse_s3_url(s3_url):
    """Extracts bucket name and prefix from an S3 URL."""
    match = re.match(r"s3://([^/]+)/(.+)", s3_url)
    if match:
        return match.group(1), match.group(2)
    return None, None

# ✅ List All File Paths in S3 Directory
def list_s3_files(bucket_name, prefix):
    """Lists all files inside the given S3 directory (prefix)."""
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    files = [obj["Key"] for obj in response.get("Contents", [])] if "Contents" in response else []
    
    if not files:
        print("⚠️ No files found in the directory.")
    else:
        print(f"✅ Found {len(files)} files in '{prefix}':\n")
        for file in files[:10]:  # Display only first 10 files for brevity
            print(f"- {file} (and more...)")
    
    return files

# ✅ Fetch SNS Topic ARN by Name
def get_sns_topic_arn(topic_name):
    """Fetch SNS Topic ARN by topic name."""
    response = sns.list_topics()
    for topic in response.get("Topics", []):
        if topic_name in topic["TopicArn"]:
            return topic["TopicArn"]
    return None

# ✅ Publish File Paths to SNS
def publish_to_sns(topic_arn, file_path):
    """Publishes file path to SNS topic."""
    try:
        sns.publish(TopicArn=topic_arn, Message=file_path)
        print(f"✅ Sent to SNS: {file_path}")
    except Exception as e:
        print(f"❌ Failed to send {file_path} to SNS: {str(e)}")

# ✅ Extract Bucket & Prefix
S3_BUCKET_NAME, DIRECTORY_PREFIX = parse_s3_url(S3_URL)

if not S3_BUCKET_NAME or not DIRECTORY_PREFIX:
    print("❌ Invalid S3 URL. Please enter a valid S3 URL.")
else:
    print(f"✅ Extracted S3 Bucket: {S3_BUCKET_NAME}")
    print(f"✅ Extracted S3 Directory Prefix: {DIRECTORY_PREFIX}")

    # ✅ Fetch SNS Topic ARN
    SNS_TOPIC_ARN = get_sns_topic_arn(SNS_TOPIC_NAME)

    if not SNS_TOPIC_ARN:
        print(f"❌ SNS Topic '{SNS_TOPIC_NAME}' not found in {AWS_REGION}.")
    else:
        print(f"✅ Fetched SNS Topic ARN: {SNS_TOPIC_ARN}")

        # ✅ List Files in S3 Directory
        file_paths = list_s3_files(S3_BUCKET_NAME, DIRECTORY_PREFIX)

        # ✅ Publish Each File Path to SNS
        for file_path in file_paths:
            publish_to_sns(SNS_TOPIC_ARN, file_path)


✅ Extracted S3 Bucket: email-sec-datasets-bronze
✅ Extracted S3 Directory Prefix: email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/
✅ Fetched SNS Topic ARN: arn:aws:sns:us-east-1:221082192243:email-path-distribution-topic
✅ Found 66 files in 'email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/':

- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/1. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/10. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/11. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/12. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/13. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/maildir/allen-p/inbox/14. (and more...)
- email-sec-datasets-bronze/extracted/enron_mail_20150507/m