# Experiments

### imports

In [None]:
import duckdb 
import pandas as pd
import pyarrow.parquet as pq
from datetime import datetime
import os
import sys
from pathlib import Path

### DuckDB Partition

In [None]:
# Load every raw parquet file into an in-memory DuckDB table and re-export it
# as a partitioned parquet dataset under data/processed/<table>/.
cwd = Path.cwd()
processed_dir = cwd / 'data' / 'processed'
raw_dir = cwd / 'data' / 'raw'

print(f'processed_dir: {processed_dir}')
print(f'raw_dir: {raw_dir}')

# Connect to duckdb in memory
con = duckdb.connect(database=':memory:')

# Dynamically get file types from raw directory: one table per parquet file,
# named after the file stem. Sorted so the processing order is deterministic.
file_types = sorted(file_path.stem for file_path in raw_dir.glob('*.parquet'))

print(f'Found files: {file_types}')

# Partition column used when exporting each known table.
partition_columns = {
    'customers': 'AccountCreationMonth',
    'merchants': 'ingestion_date',
    'transactions': 'TimestampMonth',
    'login_attempts': 'LoginTimestampMonth',
}

for file_type in file_types:
    # The table name is quoted so stems that are not bare SQL identifiers
    # (e.g. containing '-') do not break the statement.
    con.execute(f"""
    CREATE TABLE IF NOT EXISTS "{file_type}" AS
    SELECT
        *
    FROM read_parquet('{raw_dir}/{file_type}.parquet')
    """)

    # Look the partition column up per table. Previously the value leaked from
    # the prior iteration, so an unknown table was either partitioned by another
    # table's column or produced an empty option (invalid SQL).
    partition_column = partition_columns.get(file_type)
    if partition_column is None:
        print(f'No partition column configured for {file_type}; skipping export')
        continue

    partition_statement = f"PARTITION_BY ({partition_column})"

    statement = f"""
        COPY "{file_type}" TO '{processed_dir}/{file_type}'
        (FORMAT parquet, {partition_statement}, OVERWRITE_OR_IGNORE)
    """

    # print(statement)

    # Write partitions
    con.execute(statement)

### Test MinIO Upload Paths

In [None]:
# Dry run of the MinIO upload: walk the processed-data directory and print the
# object name each parquet file would be uploaded under. Nothing is uploaded.

# Resolve the directory relative to the working directory (the same
# cwd/data/processed location the partitioning step writes to) instead of a
# machine-specific absolute path.
data_path = str(Path.cwd() / 'data' / 'processed')
data_path_obj = Path(data_path)

# Check that the data directory exists and really is a directory; a plain
# file at this path would otherwise fail later inside iterdir().
if not data_path_obj.is_dir():
    raise FileNotFoundError(f"Data directory not found: {data_path}")

# Each immediate subdirectory corresponds to one table. Sorted for stable output.
subdirs = sorted(d for d in data_path_obj.iterdir() if d.is_dir())
print(f"Found {len(subdirs)} data subdirectories: {[d.name for d in subdirs]}")

# Process each table directory
for subdir in subdirs:
    table_name = subdir.name
    print(f"Processing {table_name} directory")

    # Use recursive glob to find all parquet files in subdirectories
    # This will find files in partition subdirectories like AccountCreationMonth=1/
    parquet_files = sorted(subdir.glob('**/*.parquet'))
    print(f"Found {len(parquet_files)} parquet files in {table_name}")

    for file_path in parquet_files:
        # Object name is the path relative to the data root, always with
        # forward slashes so keys are identical regardless of host OS.
        rel_path = file_path.relative_to(data_path_obj)
        object_name = rel_path.as_posix()

        print(f"Uploading {file_path} to MinIO as {object_name}")