In [None]:
# Sanity check: confirm the notebook kernel is running and executes cells.
greeting = "Hello, World!"
print(greeting)

In [None]:
# Set the working directory to the project root.
# `os` must actually be imported here: with the import commented out, the
# os.chdir call below raises NameError on a fresh kernel (Restart & Run All).
import os

# Project root that the relative paths in later cells are resolved against.
# Defaults to the original hard-coded checkout location; set the
# AIRFLOW_DE_INTRO_PROJECT_ROOT environment variable to run the notebook from
# a different machine or checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    "AIRFLOW_DE_INTRO_PROJECT_ROOT",
    "/Users/jovita.brundziene/Python/airflow-de-intro-project-jbru",
)
os.chdir(PROJECT_ROOT)

In [None]:
#check the working directory is set to the project root so relative paths can be used later
#pwd()

### Extract data from local to S3

### Load data

In [None]:
#libraries
import pandas as pd
from arrow_pd_parser import reader

#function to load a parquet file against its metadata and coerce the given columns to datetime
def load_and_fix_parquet_with_metadata(
    parquet_path: str,
    metadata_path: str,
    datetime_columns: list
) -> pd.DataFrame:
    """
    Read a Parquet file using a metadata file, then convert selected columns
    to datetime.

    Parameters:
    - parquet_path: Path to the Parquet file
    - metadata_path: Path to the JSON metadata file
    - datetime_columns: List of column names to convert to datetime

    Returns:
    - The loaded Pandas DataFrame, with every listed column that exists
      converted via pd.to_datetime(errors="coerce"). Listed columns that are
      absent are reported with a printed warning and otherwise skipped.
    """
    # Read the file through arrow_pd_parser, passing the metadata path along.
    frame = reader.read(
        input_path=parquet_path,
        metadata=metadata_path,
        parquet_expect_full_schema=False,  # Allows partial schema match
    )

    for col in datetime_columns:
        # Guard first: a missing column is only warned about, never an error.
        if col not in frame.columns:
            print(f"Warning: Column '{col}' not found in DataFrame.")
            continue
        # errors="coerce" turns values that cannot be parsed into NaT.
        frame[col] = pd.to_datetime(frame[col], errors="coerce")

    return frame

In [None]:
# Load people-part1.parquet into df1, coercing its two date columns to datetime.
df1 = load_and_fix_parquet_with_metadata(
    parquet_path="data/example-data/people-part1.parquet",
    metadata_path="data/metadata/intro-project-metadata.json",
    datetime_columns=["Source extraction date", "Date of birth"],
)

# Preview the first rows.
df1.head()


In [None]:
# Load people-part1.parquet into df2, coercing its two date columns to datetime.
# NOTE(review): this reads the same people-part1.parquet file as the df1 cell;
# if a different part file was intended here, update parquet_path — TODO confirm.
df2 = load_and_fix_parquet_with_metadata(
    parquet_path="data/example-data/people-part1.parquet",
    metadata_path="data/metadata/intro-project-metadata.json",
    datetime_columns=["Source extraction date", "Date of birth"],
)

# Preview the first rows.
df2.head()

In [None]:
# Load people-part1.parquet into df3, coercing its two date columns to datetime.
# NOTE(review): this reads the same people-part1.parquet file as the df1 cell;
# if a different part file was intended here, update parquet_path — TODO confirm.
df3 = load_and_fix_parquet_with_metadata(
    parquet_path="data/example-data/people-part1.parquet",
    metadata_path="data/metadata/intro-project-metadata.json",
    datetime_columns=["Source extraction date", "Date of birth"],
)

# Preview the first rows.
df3.head()

In [None]:
# Example metadata: a "users" table described by three fields, each giving a
# name, a type label and whether the field is nullable.
mojap_schema = {
    "name": "users",
    "fields": [
        {"name": "user_id", "type": "integer", "nullable": False},
        {"name": "email", "type": "string", "nullable": False},
        {"name": "signup_date", "type": "date", "nullable": True},
    ],
}
mojap_schema

In [None]:
from pydantic import BaseModel
from typing import Optional
from datetime import date, datetime
from dlt.common.libs.pydantic import pydantic_to_table_schema_columns

def mojap_to_pydantic_model(schema: dict):
    """
    Build a model class (subclass of BaseModel) from a schema dictionary.

    Parameters:
    - schema: dict with a "name" string and a "fields" list. Each field is a
      dict with "name" and "type" keys and an optional "nullable" key
      (treated as True when absent).

    Returns:
    - A new class named "<Schema name capitalized>Model". Nullable fields are
      annotated Optional[...] and default to None; non-nullable fields get the
      plain type and no default. Type labels that are not in the mapping below
      fall back to str.
    """
    # Schema type label -> Python type used in the annotation.
    python_types = {
        "string": str,
        "integer": int,
        "float": float,
        "boolean": bool,
        "date": date,
        "datetime": datetime
    }

    field_annotations = {}
    field_defaults = {}
    for spec in schema["fields"]:
        name = spec["name"]
        py_type = python_types.get(spec["type"], str)
        if not spec.get("nullable", True):
            # Non-nullable: plain type, no default value.
            field_annotations[name] = py_type
            continue
        field_annotations[name] = Optional[py_type]
        field_defaults[name] = None

    # Class namespace: the annotations first, then one None default per
    # nullable field.
    class_name = schema["name"].capitalize() + "Model"
    namespace = {"__annotations__": field_annotations, **field_defaults}
    return type(class_name, (BaseModel,), namespace)

def convert_to_dlt_schema(pydantic_model):
    """
    Wrap a model class in a dict with a table name and its column schema.

    Parameters:
    - pydantic_model: model class to convert

    Returns:
    - dict with "name" (the class name lower-cased) and "columns" (the result
      of pydantic_to_table_schema_columns for the class)
    """
    table_name = pydantic_model.__name__.lower()
    columns = pydantic_to_table_schema_columns(pydantic_model)
    return {"name": table_name, "columns": columns}