In [10]:
import pandas as pd
import pathlib
import os
from dotenv import load_dotenv
from typing import Dict, Any, Literal, Union


In [3]:

def create_sample_transaction_data() -> pd.DataFrame:
    """
    Build a small, hard-coded DataFrame of commercial credit card transactions.

    Returns:
        pd.DataFrame: Six rows and eleven columns describing each transaction's
        identity, cardholder, merchant, amount, accounting codes and approval flag.
    """
    # Parse the timestamps up front so the column holds datetimes rather than strings.
    transaction_dates = pd.to_datetime([
        '2025-08-15 09:30:00', '2025-08-16 14:15:00', '2025-08-17 11:00:00',
        '2025-08-18 18:45:00', '2025-08-19 08:00:00', '2025-08-20 10:20:00',
    ])

    # Columns are inserted in the order they should appear in the resulting frame.
    columns = {}
    columns['TransactionID'] = list(range(1001, 1007))
    columns['CardholderName'] = [
        'John Doe', 'Jane Smith', 'Peter Jones',
        'John Doe', 'Jane Smith', 'Sarah Lee',
    ]
    columns['CardNumber'] = [
        'XXXX-1234', 'XXXX-5678', 'XXXX-9012',
        'XXXX-1234', 'XXXX-5678', 'XXXX-3456',
    ]
    columns['TransactionDate'] = transaction_dates
    columns['MerchantName'] = [
        'Office Depot', 'United Airlines', 'Hilton Garden Inn',
        'Uber Eats', 'Staples', 'Amazon Web Services',
    ]
    columns['MerchantCategoryCode'] = [5044, 4511, 7011, 5812, 5111, 4814]
    columns['TransactionAmount'] = [152.75, 850.50, 210.00, 45.20, 89.99, 500.00]
    # Every sample transaction is denominated in the same currency.
    columns['Currency'] = ['USD'] * 6
    columns['GLCode'] = [6050, 5010, 5020, 6060, 6050, 6080]
    columns['ProjectCode'] = ['P-101', 'P-102', 'P-102', 'P-101', 'P-103', 'P-104']
    columns['IsApproved'] = [True, True, True, True, False, True]

    return pd.DataFrame(columns)

# Load variables from a .env file into the process environment. Keep the .env file in
# the same directory as this notebook; it should contain:
# DATA_DIR_RAW=data/raw
# DATA_DIR_PROCESSED=data/processed
# As the last expression in the cell, the call's return value is echoed as the cell output.
# NOTE(review): a False result presumably means no variables were loaded, in which case
# the os.getenv(...) defaults used later in the notebook apply — confirm against the
# python-dotenv documentation.
load_dotenv()

False

In [4]:
if __name__ == "__main__":
    # Resolve the raw and processed data directories: an environment variable wins,
    # otherwise the project-relative default is used.
    DATA_DIR_RAW = pathlib.Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
    DATA_DIR_PROCESSED = pathlib.Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))

    # Both output locations follow directly from the directories above.
    csv_file_path = DATA_DIR_RAW.joinpath("commercial_credit_card_transactions.csv")
    parquet_file_path = DATA_DIR_PROCESSED.joinpath("commercial_credit_card_transactions.parquet")

    # Make sure both directories exist (an already-existing directory is fine).
    print("Creating directories...")
    DATA_DIR_RAW.mkdir(exist_ok=True, parents=True)
    DATA_DIR_PROCESSED.mkdir(exist_ok=True, parents=True)
    print("Directories created/verified.")

    # Build the sample transactions and preview the first rows.
    print("\nGenerating sample commercial credit card transaction data...")
    df = create_sample_transaction_data()
    print("Sample DataFrame created successfully.")
    print(df.head(5))

    # Raw copy: CSV without the index column.
    print(f"\nSaving DataFrame to CSV at: {csv_file_path}")
    df.to_csv(csv_file_path, index=False)
    print("CSV file saved successfully!")

    # Processed copy: Parquet with pandas' default settings.
    print(f"Saving DataFrame to Parquet at: {parquet_file_path}")
    df.to_parquet(parquet_file_path)
    print("Parquet file saved successfully!")

    print("\nScript finished.")

Creating directories...
Directories created/verified.

Generating sample commercial credit card transaction data...
Sample DataFrame created successfully.
   TransactionID CardholderName CardNumber     TransactionDate  \
0           1001       John Doe  XXXX-1234 2025-08-15 09:30:00   
1           1002     Jane Smith  XXXX-5678 2025-08-16 14:15:00   
2           1003    Peter Jones  XXXX-9012 2025-08-17 11:00:00   
3           1004       John Doe  XXXX-1234 2025-08-18 18:45:00   
4           1005     Jane Smith  XXXX-5678 2025-08-19 08:00:00   

        MerchantName  MerchantCategoryCode  TransactionAmount Currency  \
0       Office Depot                  5044             152.75      USD   
1    United Airlines                  4511             850.50      USD   
2  Hilton Garden Inn                  7011             210.00      USD   
3          Uber Eats                  5812              45.20      USD   
4            Staples                  5111              89.99      USD   

   

In [7]:
import pandas as pd
from typing import Dict, Any

def _dtype_matches(actual_dtype: Any, expected_dtype: Any) -> bool:
    """Return True when `expected_dtype` names the same dtype as `actual_dtype`."""
    # Exact textual match first, so every spec that matched by name keeps matching.
    if str(actual_dtype) == str(expected_dtype):
        return True
    # Otherwise let pandas normalise both sides, so equivalent spellings of one dtype
    # (e.g. 'float64', float, np.float64) compare equal.
    try:
        return bool(pd.api.types.is_dtype_equal(actual_dtype, expected_dtype))
    except (TypeError, ValueError):
        # A spec pandas cannot interpret is simply a mismatch, not a crash.
        return False


def validate_dataframe(df: pd.DataFrame, expected_shape: tuple, expected_dtypes: Dict[str, Any]) -> Dict[str, str]:
    """
    Performs validation checks on a pandas DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to validate.
        expected_shape (tuple): The `(rows, columns)` tuple that `df.shape` must equal.
        expected_dtypes (Dict[str, Any]): Maps each critical column name to its expected
            dtype. A dtype may be given by name ('int64', 'datetime64[ns]') or as any
            equivalent dtype specification pandas understands (float, bool, np.float64).

    Returns:
        Dict[str, str]: Two entries, 'Shape Check' and 'Dtype Check', each holding a
        human-readable PASSED/FAILED message. The dtype message lists every missing
        column and every mismatching column.
    """
    validation_results = {}

    # Check shape
    if df.shape == expected_shape:
        validation_results['Shape Check'] = f"✅ PASSED. Shape is {df.shape}."
    else:
        validation_results['Shape Check'] = f"❌ FAILED. Expected {expected_shape}, but got {df.shape}."

    # Check critical column dtypes; columns not listed in expected_dtypes are ignored.
    dtype_mismatches = []
    for col, expected_dtype in expected_dtypes.items():
        if col not in df.columns:
            dtype_mismatches.append(f"Column '{col}' is missing.")
        elif not _dtype_matches(df[col].dtype, expected_dtype):
            dtype_mismatches.append(f"Column '{col}': Expected '{expected_dtype}', but got '{df[col].dtype}'.")

    if not dtype_mismatches:
        validation_results['Dtype Check'] = "✅ PASSED. All critical column dtypes are correct."
    else:
        validation_results['Dtype Check'] = "❌ FAILED. Dtype mismatches found:\n- " + "\n- ".join(dtype_mismatches)

    return validation_results

# --- Demo: build a small DataFrame and validate its structure ---

# Five-column sample of commercial credit card transaction data
data = dict(
    TransactionID=[1001, 1002, 1003, 1004, 1005, 1006],
    CardholderName=['John Doe', 'Jane Smith', 'Peter Jones', 'John Doe', 'Jane Smith', 'Sarah Lee'],
    # Six consecutive days, 2025-08-15 through 2025-08-20
    TransactionDate=pd.to_datetime([f'2025-08-{day}' for day in range(15, 21)]),
    TransactionAmount=[152.75, 850.50, 210.00, 45.20, 89.99, 500.00],
    IsApproved=[True, True, True, True, False, True],
)
df = pd.DataFrame(data)

# --- Expected structure: row/column counts and the dtypes of the critical columns ---
expected_rows, expected_cols = 6, 5
expected_shape = (expected_rows, expected_cols)
expected_dtypes = dict(
    TransactionID='int64',
    TransactionDate='datetime64[ns]',
    TransactionAmount='float64',
    IsApproved='bool',
)

# --- Run the validation and keep the returned report ---
print("Running validation on the DataFrame...\n")
validation_report = validate_dataframe(df, expected_shape, expected_dtypes)

# --- Show each check's outcome on its own line ---
for check, result in validation_report.items():
    print(check, result, sep=": ")

Running validation on the DataFrame...

Shape Check: ✅ PASSED. Shape is (6, 5).
Dtype Check: ✅ PASSED. All critical column dtypes are correct.


In [11]:
def write_df(df: pd.DataFrame, file_path: Union[str, pathlib.Path], file_type: Literal['csv', 'parquet']) -> None:
    """
    Saves a DataFrame to a specified file path based on file type.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        file_path (Union[str, pathlib.Path]): The full path to the file, including the
            filename. Missing parent directories are created.
        file_type (Literal['csv', 'parquet']): The format to save in.

    Raises:
        ValueError: If `file_type` is neither 'csv' nor 'parquet'. Nothing is created
            on disk in that case.
        ImportError: If the library needed for the requested format is not installed.
        Exception: Any other error raised while writing is reported and re-raised.
    """
    # Reject unknown formats before touching the filesystem, so a bad request does not
    # leave an empty directory behind. The message matches the other write errors.
    if file_type not in ('csv', 'parquet'):
        error = ValueError("Unsupported file type. Use 'csv' or 'parquet'.")
        print(f"❌ An error occurred while writing the DataFrame: {error}")
        raise error

    # Accept plain strings as well as Path objects.
    file_path = pathlib.Path(file_path)

    # Create the parent directory if it doesn't exist
    file_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        if file_type == 'csv':
            df.to_csv(file_path, index=False)
            print(f"✅ DataFrame successfully saved to CSV at: {file_path}")
        else:
            df.to_parquet(file_path, index=False)
            print(f"✅ DataFrame successfully saved to Parquet at: {file_path}")
    except ImportError:
        print(f"❌ Error: Required library for '{file_type}' is not installed.")
        if file_type == 'parquet':
            print("Please install pyarrow or fastparquet: `pip install pyarrow` or `pip install fastparquet`.")
        raise
    except Exception as e:
        print(f"❌ An error occurred while writing the DataFrame: {e}")
        raise

def read_df(file_path: Union[str, pathlib.Path]) -> Union[pd.DataFrame, None]:
    """
    Reads a DataFrame from a file based on its suffix (.csv or .parquet).

    The suffix is matched case-insensitively, so `data.CSV` is read as CSV.

    Args:
        file_path (Union[str, pathlib.Path]): The full path to the file to read.

    Returns:
        pd.DataFrame: The loaded DataFrame.
        None: If the file does not exist, its suffix is unsupported, or an error
            occurs while reading. The reason is printed; no exception is raised.
    """
    # Accept plain strings as well as Path objects.
    file_path = pathlib.Path(file_path)

    if not file_path.exists():
        print(f"❌ Error: File not found at {file_path}")
        return None

    # Dispatch on a lower-cased suffix; messages still show the suffix as given.
    suffix = file_path.suffix.lower()

    try:
        if suffix == '.csv':
            df = pd.read_csv(file_path)
            print(f"✅ DataFrame successfully read from CSV: {file_path}")
        elif suffix == '.parquet':
            df = pd.read_parquet(file_path)
            print(f"✅ DataFrame successfully read from Parquet: {file_path}")
        else:
            print(f"❌ Error: Unsupported file suffix '{file_path.suffix}'. Only .csv and .parquet are supported.")
            return None
        return df
    except ImportError:
        print(f"❌ Error: Required library for '{file_path.suffix}' is not installed.")
        if suffix == '.parquet':
            print("Please install pyarrow or fastparquet: `pip install pyarrow` or `pip install fastparquet`.")
        return None
    except Exception as e:
        # Deliberate best-effort: read failures are reported and signalled with None.
        print(f"❌ An error occurred while reading the DataFrame: {e}")
        return None

# --- Demo: write a small frame in both formats, then read it back ---
if __name__ == "__main__":
    # Tiny two-column frame, just enough to exercise the helpers
    data = dict(id=[1, 2, 3], value=['a', 'b', 'c'])
    sample_df = pd.DataFrame(data)

    print("--- Writing DataFrames ---")

    # Target files sit inside the raw/processed directories configured earlier
    csv_path = DATA_DIR_RAW.joinpath("sample_data.csv")
    parquet_path = DATA_DIR_PROCESSED.joinpath("sample_data.parquet")

    # Write the same frame once per supported format
    write_df(sample_df, csv_path, file_type='csv')
    write_df(sample_df, parquet_path, file_type='parquet')

    print("\n--- Reading DataFrames ---")

    # Round-trip through CSV; read_df returns None on failure, so guard the preview
    df_from_csv = read_df(csv_path)
    if df_from_csv is not None:
        print("\nDataFrame read from CSV head:")
        print(df_from_csv.head())

    # Round-trip through Parquet, with the same guard
    df_from_parquet = read_df(parquet_path)
    if df_from_parquet is not None:
        print("\nDataFrame read from Parquet head:")
        print(df_from_parquet.head())

    # A path that does not exist shows the not-found message instead of raising
    print("\n--- Demonstrating File Not Found Error ---")
    read_df(DATA_DIR_RAW.joinpath("non_existent_file.csv"))

--- Writing DataFrames ---
✅ DataFrame successfully saved to CSV at: data/raw/sample_data.csv
✅ DataFrame successfully saved to Parquet at: data/processed/sample_data.parquet

--- Reading DataFrames ---
✅ DataFrame successfully read from CSV: data/raw/sample_data.csv

DataFrame read from CSV head:
   id value
0   1     a
1   2     b
2   3     c
✅ DataFrame successfully read from Parquet: data/processed/sample_data.parquet

DataFrame read from Parquet head:
   id value
0   1     a
1   2     b
2   3     c

--- Demonstrating File Not Found Error ---
❌ Error: File not found at data/raw/non_existent_file.csv
