In [None]:
# Lambda function code: starts a run of the Glue job

In [None]:
import boto3

# Name of the Glue job to launch. Kept in one place so the value cannot drift
# between call sites.
GLUE_JOB_NAME = 'your_glue_job_name'

glue_client = boto3.client('glue')

# Start a single run of the job and hand it the S3 locations it works with.
# `response` holds the raw StartJobRun result returned by the Glue API.
response = glue_client.start_job_run(
    JobName=GLUE_JOB_NAME,
    Arguments={
        # NOTE: '--JOB_NAME' is deliberately not passed here. It is an internal
        # Glue parameter that the service supplies to the script on its own,
        # and the AWS documentation says it must not be set by callers.
        '--INPUT_PATH': 's3://your-bucket/path/to/your/file.csv',
        '--OUTPUT_PATH': 's3://your-bucket/path/to/output/',
        '--ARCHIVE_PATH': 's3://your-bucket/path/to/archive/',
        '--ERROR_PATH': 's3://your-bucket/path/to/error/'
    }
)


In [None]:
# Glue job script: filters the input CSV by make and writes the result as Parquet

In [None]:
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Retrieve parameters.
# INPUT_PATH is the s3:// URI of the CSV file to process; OUTPUT_PATH, ARCHIVE_PATH
# and ERROR_PATH are s3:// prefixes for the Parquet output, for successfully
# processed inputs, and for inputs whose processing failed, respectively.
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'INPUT_PATH', 'OUTPUT_PATH', 'ARCHIVE_PATH', 'ERROR_PATH'])

# Initialize clients and context
s3_client = boto3.client('s3')
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)


def _split_s3_uri(uri):
    """Split an ``s3://bucket/some/key`` URI into ``(bucket, key)``.

    The key is returned without a leading slash and is an empty string when
    the URI names only a bucket.
    """
    without_scheme = uri.split('://', 1)[-1]
    bucket, _, key = without_scheme.partition('/')
    return bucket, key


def _move_input_file(destination_prefix):
    """Move the input CSV under ``destination_prefix``, keeping its file name.

    The object named by ``args['INPUT_PATH']`` is copied to
    ``<destination_prefix>/<file name>`` and the original is deleted only
    after the copy call has returned without raising, so a failed copy never
    loses the input.

    Raises whatever ``copy_object`` / ``delete_object`` raise.
    """
    source_bucket, source_key = _split_s3_uri(args['INPUT_PATH'])
    dest_bucket, dest_prefix = _split_s3_uri(destination_prefix)

    # Keep the original file name so several inputs can share one prefix
    # without overwriting each other.
    file_name = source_key.rsplit('/', 1)[-1]
    dest_prefix = dest_prefix.strip('/')
    dest_key = f"{dest_prefix}/{file_name}" if dest_prefix else file_name

    # NOTE(review): copy_object is a single-request copy; S3 documents a 5 GB
    # limit for it. Larger inputs would need a multipart copy.
    s3_client.copy_object(
        Bucket=dest_bucket,
        CopySource={'Bucket': source_bucket, 'Key': source_key},
        Key=dest_key
    )
    s3_client.delete_object(Bucket=source_bucket, Key=source_key)


try:
    # Read CSV into DataFrame
    df = spark.read.options(header=True, inferSchema=True).csv(args['INPUT_PATH'])

    # Filter DataFrame for specific make
    df_by_make = df.filter(df['make'] == 'PORSCHE')

    # Write filtered DataFrame to Parquet
    df_by_make.write.mode('overwrite').parquet(args['OUTPUT_PATH'])

    # Archive the original CSV file (copy to the archive prefix, then delete)
    _move_input_file(args['ARCHIVE_PATH'])

except Exception as e:
    # Log the error
    print(f"An error occurred: {str(e)}")

    # Quarantine the original CSV file under the error prefix. A failure here
    # is logged but must not mask the original error, which is re-raised below.
    try:
        _move_input_file(args['ERROR_PATH'])
    except Exception as move_error:
        print(f"Could not move the input file to the error path: {str(move_error)}")

    # Re-raise so the job run is reported as failed instead of succeeded.
    raise

else:
    # Commit the Glue job only when every step above succeeded
    job.commit()
