In [None]:
import math
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import boto3
from pydantic import BaseModel, Field, ValidationError
from datetime import datetime
from typing import Optional

#### Set up and start your interactive session


In [None]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the SparkContext for this session (getOrCreate reuses an existing one if present).
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext so Glue-specific APIs are available.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to the GlueContext.
# NOTE(review): no job.init(...) / job.commit() call is visible in this notebook — confirm whether one is needed.
job = Job(glueContext)

#### Get parameters from previous job

In [None]:
# Resolve the parameters handed over on the command line and echo each one
# so the values are visible in the session output.
expected_params = ['bucket_name', 'object']
args = getResolvedOptions(sys.argv, expected_params)
for param in expected_params:
    print(args[param])

#### Schema Enforcement using Pydantic

In [None]:
# Define the schema for Chicago Crime data
class ChicagoCrimeSchema(BaseModel):
    """Pydantic model describing one record of the Chicago Crime data.

    Field names are the column names with spaces/hyphens replaced by
    underscores, which is how the validation loop in this notebook builds the
    keyword arguments it passes to this model. Every field is annotated
    ``Optional[...]``, so ``None`` is an accepted value for each of them.

    NOTE(review): the fields declare no explicit default. Whether a key that
    is absent altogether is accepted or rejected presumably depends on the
    installed Pydantic major version — confirm against the runtime.
    """

    # Record identifiers
    ID: Optional[int]
    Case_Number: Optional[str]
    # Timestamp column 'Date'
    Date: Optional[datetime]
    Block: Optional[str]
    # Classification columns
    IUCR: Optional[str]
    Primary_Type: Optional[str]
    Description: Optional[str]
    Location_Description: Optional[str]
    # Boolean flags
    Arrest: Optional[bool]
    Domestic: Optional[bool]
    # Integer-typed area columns
    Beat: Optional[int]
    District: Optional[int]
    Ward: Optional[int]
    Community_Area: Optional[int]
    FBI_Code: Optional[str]
    # Coordinate columns
    X_Coordinate: Optional[int]
    Y_Coordinate: Optional[int]
    Year: Optional[int]
    # Timestamp column 'Updated On'
    Updated_On: Optional[datetime]
    Latitude: Optional[float]
    Longitude: Optional[float]
    Location: Optional[str]

#### Download raw data, address some data inconsistencies, and send it back to S3

In [None]:
# Settings for this step, gathered in one place so they are easy to find and tune.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
DATE_COLUMNS = ('Date', 'Updated On')
LOCAL_PARQUET_PATH = '/tmp/preprocessed_chicago_crime_data.parquet'
OUTPUT_KEY = 'data/chicago_crime_database/chicago_crime_parquet/preprocessed_chicago_crime_data.parquet'

s3 = boto3.client('s3')

# Derive the file name and bucket from the resolved job arguments
file_name = args['object'].split('/')[-1]
bucket_name = args['bucket_name']

# Download the raw CSV from S3 into /tmp, next to the Parquet output written below
local_csv_path = f'/tmp/{file_name}'
s3.download_file(bucket_name, args['object'], local_csv_path)

# Read the CSV. The date columns are deliberately NOT parsed by read_csv:
# they are parsed exactly once below with the explicit format, so values that
# do not match it become NaT instead of being guessed at by format inference.
df = pd.read_csv(local_csv_path, low_memory=False)
df.replace({float('nan'): None}, inplace=True)
for date_column in DATE_COLUMNS:
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce', format=DATE_FORMAT)

# Map each column label to its schema field name once, rather than renaming
# the row Series again on every iteration of the validation loop.
field_names = {column: column.replace(' ', '_').replace('-', '_') for column in df.columns}

# Validate each row against the schema
invalid_rows = []
for index, row in df.iterrows():
    row_dict = {field_names[column]: value for column, value in row.to_dict().items()}
    try:
        ChicagoCrimeSchema(**row_dict)
    except ValidationError as error:
        # Report the offending row together with the reason it was rejected
        print(row_dict)
        print(error)
        invalid_rows.append(index)

print(f'{len(invalid_rows)} of {len(df)} rows failed schema validation and will be dropped')

# Remove invalid rows
df.drop(index=invalid_rows, inplace=True)

# Standardize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Convert DataFrame to PyArrow Table.
# preserve_index=False: after rows are dropped the index is no longer a plain
# RangeIndex, and pyarrow would otherwise write it as an extra
# '__index_level_0__' column, making the Parquet schema depend on whether any
# row happened to be invalid.
table = pa.Table.from_pandas(df, preserve_index=False)

# Save the table as a Parquet file
pq.write_table(table, LOCAL_PARQUET_PATH)

# Upload the Parquet file back to S3
s3.upload_file(LOCAL_PARQUET_PATH, bucket_name, OUTPUT_KEY)