In [1]:
#check ipynb is running
print("Hello, World!")

Hello, World!


In [2]:
#set working directory
import os
os.chdir("/Users/jovita.brundziene/Python/airflow-de-intro-project-jbru")

In [None]:
#check working directory set to project root to use relative pathways later
#pwd()

To do:
- check if files exist before extracting and dumping to S3
- adapt load step to call data from S3
-  Go through repo steps
- Include dev/prod environment parameters
- add parameters to config file
- create a docker image
- create a github action to run pipeline automatically
- create unit tests
- modularise code into at least config, functions and run
- Update requirements file and build it into the script
- Requirements lint?
- Nice to have: package it up as a python package?
- add logging

### Extract data from local to S3

In [3]:
import boto3
import os
import logging


# this sets up the logging system to display messages with timestamps and severity levels.
logging.basicConfig(
    level=logging.INFO,  # Set to DEBUG for more detailed output
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# config
s3 = boto3.client('s3') # initialise S3 client
bucket = 'intro-project-jb' # name of your S3 bucket
local_dir = 'data/example-data' # local directory containing Parquet files
dry_run = True  # set to False to actually upload files

#  iterate through files in the local directory
for file in os.listdir(local_dir):
    if file.endswith('.parquet'):
        # script tells what would happen if true
        local_path = os.path.join(local_dir, file)
        s3_key = f'path/{file}'
        
        if dry_run:
            # simulate the upload without actually performing it
            print(f"[DRY RUN] Would upload: {local_path} to s3://{bucket}/{s3_key}")
        else:
            try:
                # attempt to upload the file to S3
                s3.upload_file(local_path, bucket, s3_key)
                logging.info(f"Successfully uploaded: {local_path} to s3://{bucket}/{s3_key}")
            except Exception as e:
                logging.error(f"Failed to upload: {local_path} to s3://{bucket}/{s3_key}. Error: {e}")

[DRY RUN] Would upload: data/example-data/people-part1.parquet to s3://intro-project-jb/path/people-part1.parquet
[DRY RUN] Would upload: data/example-data/people-part2.parquet to s3://intro-project-jb/path/people-part2.parquet
[DRY RUN] Would upload: data/example-data/people-part3.parquet to s3://intro-project-jb/path/people-part3.parquet


### Load data

In [4]:
#libraries
import pandas as pd
from arrow_pd_parser import reader

#function to load and coerse parquet files to datetime format as per metadata
def load_and_fix_parquet_with_metadata(
    parquet_path: str,
    metadata_path: str,
    datetime_columns: list
) -> pd.DataFrame:
    """
    Parameters:
    - parquet_path: Path to the Parquet file
    - metadata_path: Path to the JSON metadata file
    - datetime_columns: List of column names to convert to datetime

    Returns:
    - Cleaned Pandas DataFrame
    """
    #load parquet with metadata
    df = reader.read(
        input_path = parquet_path,
        metadata = metadata_path,
        parquet_expect_full_schema = False  # Allows partial schema match
    )

    # Coerce datetime columns
    for col in datetime_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
        else:
            print(f"Warning: Column '{col}' not found in DataFrame.")

    return df

In [5]:
#people-part1 df
df1 = load_and_fix_parquet_with_metadata(
    parquet_path = "data/example-data/people-part1.parquet",
    metadata_path = "data/metadata/intro-project-metadata.json",
    datetime_columns = ["Source extraction date", "Date of birth"]
)

df1.head()


Unnamed: 0,User Id,First Name,Last Name,Email,Phone,Date of birth,Job Title,Source extraction date,Index
0,e09c4f4cbfEFaFd,Dawn,Trevino,clintongood@example.org,360-423-5286,1972-01-17,"Teacher, primary school",2024-02-29 12:30:10,1
1,D781D28b845Ab9D,Dale,Mcknight,clairebradshaw@example.org,9062423229,1931-01-31,"Development worker, community",2024-02-29 12:30:10,2
2,eda7EcaF87b2D80,Herbert,Bean,johnnybooker@example.org,001-149-154-0679x1617,2018-02-10,Ceramics designer,2024-02-29 12:30:10,3
3,E75ACea5D7AeC3e,Karen,Everett,wkhan@example.org,870.294.7563x20939,1938-06-14,"Civil engineer, consulting",2024-02-29 12:30:10,4
4,9C4Df1246ddf543,Angela,Shea,reginaldgarner@example.com,242.442.2978,1971-11-22,Health and safety adviser,2024-02-29 12:30:10,5


In [6]:
#people-part1 df
df2 = load_and_fix_parquet_with_metadata(
    parquet_path = "data/example-data/people-part1.parquet",
    metadata_path = "data/metadata/intro-project-metadata.json",
    datetime_columns = ["Source extraction date", "Date of birth"]
)

df2.head()

Unnamed: 0,User Id,First Name,Last Name,Email,Phone,Date of birth,Job Title,Source extraction date,Index
0,e09c4f4cbfEFaFd,Dawn,Trevino,clintongood@example.org,360-423-5286,1972-01-17,"Teacher, primary school",2024-02-29 12:30:10,1
1,D781D28b845Ab9D,Dale,Mcknight,clairebradshaw@example.org,9062423229,1931-01-31,"Development worker, community",2024-02-29 12:30:10,2
2,eda7EcaF87b2D80,Herbert,Bean,johnnybooker@example.org,001-149-154-0679x1617,2018-02-10,Ceramics designer,2024-02-29 12:30:10,3
3,E75ACea5D7AeC3e,Karen,Everett,wkhan@example.org,870.294.7563x20939,1938-06-14,"Civil engineer, consulting",2024-02-29 12:30:10,4
4,9C4Df1246ddf543,Angela,Shea,reginaldgarner@example.com,242.442.2978,1971-11-22,Health and safety adviser,2024-02-29 12:30:10,5


In [7]:
#people-part1 df
df3 = load_and_fix_parquet_with_metadata(
    parquet_path = "data/example-data/people-part1.parquet",
    metadata_path = "data/metadata/intro-project-metadata.json",
    datetime_columns = ["Source extraction date", "Date of birth"]
)

df3.head()

Unnamed: 0,User Id,First Name,Last Name,Email,Phone,Date of birth,Job Title,Source extraction date,Index
0,e09c4f4cbfEFaFd,Dawn,Trevino,clintongood@example.org,360-423-5286,1972-01-17,"Teacher, primary school",2024-02-29 12:30:10,1
1,D781D28b845Ab9D,Dale,Mcknight,clairebradshaw@example.org,9062423229,1931-01-31,"Development worker, community",2024-02-29 12:30:10,2
2,eda7EcaF87b2D80,Herbert,Bean,johnnybooker@example.org,001-149-154-0679x1617,2018-02-10,Ceramics designer,2024-02-29 12:30:10,3
3,E75ACea5D7AeC3e,Karen,Everett,wkhan@example.org,870.294.7563x20939,1938-06-14,"Civil engineer, consulting",2024-02-29 12:30:10,4
4,9C4Df1246ddf543,Angela,Shea,reginaldgarner@example.com,242.442.2978,1971-11-22,Health and safety adviser,2024-02-29 12:30:10,5
