In [20]:
# Sanity check: confirm the notebook kernel is running and executing cells
print("Hello, World!")

Hello, World!


In [21]:
# Set the working directory to the project root so later cells can use relative paths.
# The root is taken from the PROJECT_ROOT environment variable when it is set, so the
# notebook is not tied to a single machine; otherwise it falls back to the original
# local checkout location.
import os

PROJECT_ROOT = os.environ.get(
    "PROJECT_ROOT",
    "/Users/jovita.brundziene/Python/airflow-de-intro-project-jbru",
)
os.chdir(PROJECT_ROOT)

In [22]:
# Confirm the working directory is the project root, so the relative paths used in later cells resolve
os.getcwd()

'/Users/jovita.brundziene/Python/airflow-de-intro-project-jbru'

To do:
- Go through repo steps
- Include dev/prod environment parameters
- Add parameters to a config file
- Create a Docker image
- Create a GitHub Action to run the pipeline automatically
- Create unit tests
- Modularise code into at least config, functions and run
- Update the requirements file and build it into the script
- Requirements lint?
- Nice to have: package it up as a Python package?

### Extract data from local to S3

In [23]:
import boto3
import os
import logging
from botocore.exceptions import ClientError

# Set up logging configuration: show INFO-level records and above,
# each formatted as "<timestamp> - <level name> - <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def _s3_object_exists(s3_client, bucket_name, s3_key):
    """Return True if the key exists in the bucket, False if S3 answers 404; re-raise other ClientErrors."""
    try:
        s3_client.head_object(Bucket=bucket_name, Key=s3_key)
    except ClientError as e:
        # A 404 error code means the object is absent; anything else is a genuine failure
        if e.response['Error']['Code'] == '404':
            return False
        raise
    return True


def upload_parquet_files_to_s3(bucket_name, local_directory, s3_prefix, dry_run=True, s3_client=None):
    """
    Uploads .parquet files from a local directory to an S3 bucket under a specified prefix.

    Files whose key already exists in the bucket are skipped. Failures for one file are
    logged and do not stop the remaining files from being processed.

    Parameters:
        bucket_name (str): Name of the S3 bucket.
        local_directory (str): Path to the local directory containing .parquet files.
        s3_prefix (str): Path prefix within the S3 bucket. Leading/trailing slashes are
            ignored, and an empty prefix places files at the bucket root.
        dry_run (bool): If True, simulates the upload without actually uploading files.
            The existence check against S3 is still performed.
        s3_client: Optional object exposing ``head_object`` and ``upload_file``
            (e.g. a pre-configured boto3 S3 client, or a stub in unit tests).
            Defaults to ``boto3.client('s3')``.

    Returns:
        None
    """
    # Create an S3 client using boto3 unless the caller supplied one
    s3 = s3_client if s3_client is not None else boto3.client('s3')

    # Normalise the prefix so 'a/b' and 'a/b/' produce the same keys (no 'a/b//file')
    prefix = s3_prefix.strip('/')

    # Sorted so the processing order (and therefore the log output) is deterministic
    for file in sorted(os.listdir(local_directory)):
        local_path = os.path.join(local_directory, file)

        # Only process regular files with a .parquet extension
        if not file.endswith('.parquet') or not os.path.isfile(local_path):
            continue

        # Define the S3 object key (i.e., path within the bucket)
        s3_key = f'{prefix}/{file}' if prefix else file

        try:
            already_in_s3 = _s3_object_exists(s3, bucket_name, s3_key)
        except ClientError as e:
            # Unexpected error during the existence check: report it and move on
            logging.error(f"Error checking existence of {s3_key}: {e}")
            continue

        if already_in_s3:
            logging.info(f"File already exists in S3: s3://{bucket_name}/{s3_key} — skipping upload.")
            continue

        if dry_run:
            # Simulate the upload in dry run mode
            print(f"[DRY RUN] Would upload: {local_path} to s3://{bucket_name}/{s3_key}")
            continue

        # Attempt the upload; deliberately best-effort so one bad file does not abort the batch
        try:
            s3.upload_file(local_path, bucket_name, s3_key)
            logging.info(f"Successfully uploaded: {local_path} to s3://{bucket_name}/{s3_key}")
        except Exception as upload_error:
            logging.error(f"Failed to upload: {local_path}. Error: {upload_error}")


In [24]:

# TODO: move these parameters into a config file
# Dry run only: nothing is uploaded, but the function still checks S3 for existing
# objects, so AWS credentials must be available for this cell to succeed.
upload_parquet_files_to_s3(
    bucket_name='alpha-hmcts-de-testing-sandbox',
    local_directory='data/example-data',
    s3_prefix='de-intro-project-jb/dev',
    dry_run=True
)

NoCredentialsError: Unable to locate credentials

### Load data

In [33]:
import s3fs

def list_parquet_files_from_s3(bucket_name: str, s3_prefix: str, fs=None) -> list:
    """
    Lists all Parquet files in a given S3 prefix using s3fs.

    Parameters:
        bucket_name (str): S3 bucket name.
        s3_prefix (str): Prefix (folder path) in the bucket.
        fs: Optional filesystem object exposing ``ls(path)`` (e.g. a configured
            ``s3fs.S3FileSystem``, or a stub in unit tests). Defaults to a new
            ``s3fs.S3FileSystem()``.

    Returns:
        list: Sorted list of full ``s3://`` paths to the Parquet files directly
        under the prefix.
    """
    if fs is None:
        fs = s3fs.S3FileSystem()

    s3_path = f"s3://{bucket_name}/{s3_prefix}"

    parquet_paths = []
    for entry in fs.ls(s3_path):
        if not entry.endswith('.parquet'):
            continue
        # s3fs lists entries as "bucket/key" with the protocol stripped; restore the
        # scheme so downstream readers recognise the result as an S3 location.
        parquet_paths.append(entry if entry.startswith("s3://") else f"s3://{entry}")

    # Sorted so callers get a stable order regardless of listing order
    return sorted(parquet_paths)

In [34]:
from arrow_pd_parser import reader
import pandas as pd

def load_parquet_files_from_s3(bucket_name, s3_prefix):
    """
    Reads every Parquet file found under an S3 prefix and stacks them into one DataFrame.

    The files are discovered with ``list_parquet_files_from_s3`` and each one is
    read with ``arrow_pd_parser``'s ``reader.read``.

    Parameters:
        bucket_name (str): S3 bucket name.
        s3_prefix (str): Prefix (folder path) in the bucket.

    Returns:
        pd.DataFrame: Row-wise concatenation of all files with a fresh index,
        or an empty DataFrame when no Parquet files are found.
    """
    # One DataFrame per file; the path is handed to the reader as-is
    frames = [
        reader.read(s3_file)
        for s3_file in list_parquet_files_from_s3(bucket_name, s3_prefix)
    ]

    # pd.concat rejects an empty list, so return an empty frame explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [35]:
# Load every Parquet file under the dev prefix into a single DataFrame and preview it
bucket = "alpha-hmcts-de-testing-sandbox"
prefix = "de-intro-project-jb/dev"

df = load_parquet_files_from_s3(bucket, prefix)
df.head()



NoCredentialsError: Unable to locate credentials

### Metadata

In [36]:
#libraries
import pandas as pd
from arrow_pd_parser import reader

# Load a Parquet file against its metadata, then coerce the chosen columns to datetime
def load_and_fix_parquet_with_metadata(
    parquet_path: str,
    metadata_path: str,
    datetime_columns: list
) -> pd.DataFrame:
    """
    Read a Parquet file with a metadata schema applied and convert selected columns to datetime.

    Parameters:
    - parquet_path: Path to the Parquet file
    - metadata_path: Path to the JSON metadata file
    - datetime_columns: List of column names to convert to datetime

    Returns:
    - Pandas DataFrame with the requested columns converted; values that cannot be
      parsed become NaT, and requested columns that are absent only trigger a
      printed warning
    """
    # Read the file with the metadata applied; a partial schema match is accepted
    frame = reader.read(
        input_path=parquet_path,
        metadata=metadata_path,
        parquet_expect_full_schema=False,
    )

    for col in datetime_columns:
        # Report and skip columns that were requested but are not in the data
        if col not in frame.columns:
            print(f"Warning: Column '{col}' not found in DataFrame.")
            continue
        frame[col] = pd.to_datetime(frame[col], errors="coerce")

    return frame

In [37]:
# Load people-part1 from the local example data with the project metadata applied,
# coercing the two date columns to datetime, then preview the first rows
df1 = load_and_fix_parquet_with_metadata(
    parquet_path = "data/example-data/people-part1.parquet",
    metadata_path = "data/metadata/intro-project-metadata.json",
    datetime_columns = ["Source extraction date", "Date of birth"]
)

df1.head()

Unnamed: 0,User Id,First Name,Last Name,Email,Phone,Date of birth,Job Title,Source extraction date,Index
0,e09c4f4cbfEFaFd,Dawn,Trevino,clintongood@example.org,360-423-5286,1972-01-17,"Teacher, primary school",2024-02-29 12:30:10,1
1,D781D28b845Ab9D,Dale,Mcknight,clairebradshaw@example.org,9062423229,1931-01-31,"Development worker, community",2024-02-29 12:30:10,2
2,eda7EcaF87b2D80,Herbert,Bean,johnnybooker@example.org,001-149-154-0679x1617,2018-02-10,Ceramics designer,2024-02-29 12:30:10,3
3,E75ACea5D7AeC3e,Karen,Everett,wkhan@example.org,870.294.7563x20939,1938-06-14,"Civil engineer, consulting",2024-02-29 12:30:10,4
4,9C4Df1246ddf543,Angela,Shea,reginaldgarner@example.com,242.442.2978,1971-11-22,Health and safety adviser,2024-02-29 12:30:10,5


In [43]:
# Check the column data types of the loaded frame (the two date columns should be datetime)
print(df1.dtypes)

User Id                   string[python]
First Name                string[python]
Last Name                 string[python]
Email                     string[python]
Phone                     string[python]
Date of birth             datetime64[ns]
Job Title                 string[python]
Source extraction date    datetime64[ns]
Index                              Int64
dtype: object


In [38]:

from mojap_metadata import Metadata

# Path to the project metadata JSON file (relative to the project root)
metadata_path = "data/metadata/intro-project-metadata.json"

# Create a mojap-metadata Metadata object from the JSON file
metadata_obj = Metadata.from_json(metadata_path)

# Sanity check that the file loaded: print the name stored in the metadata
print(metadata_obj.name)          # Project name


Intro Project


In [40]:
# Inspect the column definitions held in the metadata (one dict per column)
metadata_obj.columns      # List of column definitions

[{'name': 'User ID', 'type': 'string', 'description': ''},
 {'name': 'First Name', 'type': 'string', 'description': 'A Users first name'},
 {'name': 'Last Name', 'type': 'string', 'description': 'A Users last name'},
 {'name': 'Email', 'type': 'string', 'description': 'A Users email address'},
 {'name': 'Phone', 'type': 'string', 'description': 'A Users phone number'},
 {'name': 'Date of birth',
  'type': 'timestamp(s)',
  'datetime_format': '%Y-%m-%dT%H:%M:%S',
  'description': 'A Users date of birth'},
 {'name': 'Job Title', 'type': 'string', 'description': 'A Users job title'},
 {'name': 'Source extraction date',
  'type': 'timestamp(s)',
  'datetime_format': '%Y-%m-%dT%H:%M:%S',
  'description': "Timestamp for start of record's presence in database"}]

In [41]:
# Validate the metadata object against its schema
metadata_obj.validate()

In [None]:
#need a function to cast columns to correct types - this is needed because mojap metadata does not do casting, just validates what the schema should be
#then apply that function to the dataset

def enforce_metadata_types(df, metadata_obj):
    """
    Enforce column data types in a DataFrame based on mojap-metadata schema.

    Columns typed "string" are cast to the pandas "string" dtype; columns whose type
    starts with "timestamp" are parsed with ``pd.to_datetime`` using the column's
    ``datetime_format`` when one is given. Other types are left untouched.

    The input DataFrame is not modified: the casts are applied to a copy, so
    re-running the call always starts from the same input.

    Args:
        df (pd.DataFrame): The DataFrame to enforce types on.
        metadata_obj (Metadata): Metadata object from mojap-metadata; only its
            ``columns`` attribute (a list of column dicts) is read.

    Returns:
        pd.DataFrame: New DataFrame with enforced types.
    """
    # Local import so the function still works when lifted into a module on its own
    import logging

    typed_df = df.copy()

    for col in metadata_obj.columns:
        col_name = col["name"]
        # Tolerate column entries without a "type" key instead of raising KeyError
        col_type = col.get("type") or ""

        if col_name not in typed_df.columns:
            logging.warning(f"⚠️ Column '{col_name}' not found in DataFrame.")
            continue

        if col_type == "string":
            typed_df[col_name] = typed_df[col_name].astype("string")

        elif col_type.startswith("timestamp"):
            fmt = col.get("datetime_format", None)
            nulls_before = int(typed_df[col_name].isna().sum())
            converted = pd.to_datetime(typed_df[col_name], format=fmt, errors="coerce")

            # errors="coerce" turns unparseable values into NaT; report how many were lost
            # so a wrong datetime_format does not silently blank out the column
            values_lost = int(converted.isna().sum()) - nulls_before
            if values_lost > 0:
                logging.warning(
                    f"Column '{col_name}': {values_lost} value(s) could not be parsed "
                    f"as datetime (format={fmt!r}) and were set to NaT."
                )

            typed_df[col_name] = converted

        # Add more type handling if needed (e.g., int, float)

    return typed_df
