In [1]:
import os
from pathlib import Path
import re
from typing import List
from pydantic import ValidationError

import toml
from dotenv import find_dotenv, load_dotenv
from datawagon.objects.csv_file_info_override import CsvFileInfoOverride
from datawagon.objects.file_utils import FileUtils
from datawagon.objects.source_config import (
    Source,
    SourceConfig,
    SourceFileAttributes,
)
from datawagon.objects.source_file_scanner import SourceFiles


# Load variables from the .env file located by find_dotenv(); the helpers below
# read DW_CSV_SOURCE_DIR and DW_CSV_SOURCE_TOML through os.getenv.
load_dotenv(find_dotenv(), verbose=True)


def source_base_path() -> Path:
    """Return the directory that holds the source CSV files.

    The location is read from the ``DW_CSV_SOURCE_DIR`` environment variable.

    Returns:
        The configured source directory as a ``Path``.

    Raises:
        ValueError: If ``DW_CSV_SOURCE_DIR`` is not set, or if the path it
            names does not exist.
    """
    configured_dir = os.getenv("DW_CSV_SOURCE_DIR")

    # Check for an unset variable explicitly: str(None) would otherwise become
    # the literal relative path "None", which produces a misleading error
    # message (or a false success if something named "None" exists in the
    # current working directory).
    if configured_dir is None:
        raise ValueError("Environment variable DW_CSV_SOURCE_DIR is not set")

    base_path = Path(configured_dir)
    if not base_path.exists():
        raise ValueError(f"Source directory does not exist: {base_path}")
    return base_path


def load_config() -> SourceConfig:
    """Load the source configuration TOML file and validate it.

    The file location is read from the ``DW_CSV_SOURCE_TOML`` environment
    variable, parsed with ``toml.load`` and passed as keyword arguments to
    ``SourceConfig``.

    Returns:
        The validated ``SourceConfig`` instance.

    Raises:
        FileNotFoundError: If ``DW_CSV_SOURCE_TOML`` is not set.
        ValueError: If ``SourceConfig`` rejects the parsed content with a
            ``ValidationError``.
    """
    config_file = os.getenv("DW_CSV_SOURCE_TOML")

    # Previously str(None) turned an unset variable into the path "None", so the
    # failure surfaced as an attempt to open a file literally named "None".
    # FileNotFoundError is used so the unset case stays a "config file missing"
    # style error (presumably what callers already saw - confirm).
    if config_file is None:
        raise FileNotFoundError(
            "Environment variable DW_CSV_SOURCE_TOML is not set"
        )

    raw_config = toml.load(config_file)

    try:
        return SourceConfig(**raw_config)
    except ValidationError as e:
        # Chain explicitly so the validation details stay attached as __cause__.
        raise ValueError(e) from e


def source_file_attrs(file_path: Path, file_source: Source) -> SourceFileAttributes:
    """Build the attribute record for a single source file.

    The record always carries the file's name and path. When the source
    defines both a regex pattern and regex group names, the file name is
    matched against the pattern and each captured group is stored under the
    group name at the same position.

    Args:
        file_path: Path of the file being described.
        file_source: Source definition providing ``regex_pattern`` and
            ``regex_group_names``.

    Returns:
        A ``SourceFileAttributes`` built from the collected key/value pairs.

    Raises:
        ValueError: If the file name does not match the pattern, or if the
            number of captured groups differs from the number of group names.
    """
    attrs = {"file_name": file_path.name, "file_path": file_path}

    # Nothing to extract unless both the pattern and the group names are set.
    if not (file_source.regex_pattern and file_source.regex_group_names):
        return SourceFileAttributes(**attrs)

    r_pattern = file_source.regex_pattern
    r_groups = file_source.regex_group_names

    matched = re.match(r_pattern, file_path.name)
    if matched is None:
        raise ValueError(f"Invalid file name format: {file_path.name}")

    captured = matched.groups()
    if len(captured) != len(r_groups):
        raise ValueError(
            "File regex config generated mismatched groups."
            + f"\nFile name: {file_path.name}, regex: {r_pattern}, groups: {r_groups}"
            ""
        )

    # Pair each group name with the capture at the same position.
    attrs.update(zip(r_groups, captured))

    return SourceFileAttributes(**attrs)


def available_source_files() -> List[SourceFiles]:
    """Collect the available CSV files for every enabled source.

    Loads the configuration via ``load_config`` and, for each entry in
    ``source`` whose ``is_enabled`` flag is truthy, calls
    ``FileUtils().scan_for_csv_files_with_name`` with the base directory from
    ``source_base_path()`` and the source's ``file_name_base``. Every file
    returned is converted with ``source_file_attrs`` and
    ``CsvFileInfoOverride.build_data_item`` and grouped into one
    ``SourceFiles`` per source.

    Returns:
        One ``SourceFiles`` entry per enabled source that has at least one
        file. Sources without files are reported on stdout and omitted.

    Raises:
        ValueError: Propagated from ``load_config``, ``source_base_path`` or
            ``source_file_attrs``.
    """
    all_available_files = []

    valid_config = load_config()

    for file_id in valid_config.source:
        file_source = valid_config.source[file_id]
        if not file_source.is_enabled:
            continue

        file_list = FileUtils().scan_for_csv_files_with_name(
            source_base_path(), file_source.file_name_base
        )

        if len(file_list) == 0:
            print(f"Did not find any files with name: {file_source.file_name_base}")
            # Skip only this source. The previous `break` stopped the whole
            # scan here, silently dropping every source configured after the
            # first one that had no files.
            continue

        table_mapper = SourceFiles(
            table_name=file_source.destination_table,
            append_or_replace=file_source.append_or_replace,
        )

        for file_path in file_list:
            source_file = source_file_attrs(file_path, file_source)
            source_file_info = CsvFileInfoOverride.build_data_item(
                source_file, file_source.destination_table
            )
            table_mapper.files.append(source_file_info)

        all_available_files.append(table_mapper)

    return all_available_files


all_available_files = available_source_files()
# Preview the first entry only. The list is empty when no enabled source
# produced any files, and indexing [0] would then raise IndexError.
if all_available_files:
    print(all_available_files[0])
else:
    print("No source files available")

table_name='claim_raw' append_or_replace='append' files=[CsvFileInfoOverride(file_path=PosixPath('/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1/YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1.csv.gz'), file_dir='/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1', file_name='YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1.csv.gz', file_name_without_extension='YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1', file_version='v1-1', table_name='claim_raw', file_size_in_bytes=32868194, file_size='31.35 MB'), CsvFileInfoOverride(file_path=PosixPath('/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1/YouTube_CaravanInc_M_20230101_claim_raw_v1-1.csv.gz'), file_dir='/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1', file_name='YouTube_CaravanInc_M_20230101_claim_raw_v1-1.csv.gz', file_name_without_extension='YouTube_CaravanInc_M_20230101_claim_raw_v1-1', file_version='v1-1', table_name

In [2]:
from datawagon.objects.source_file_scanner import SourceFilesToDatabase

file_loader = SourceFilesToDatabase()

all_available_files = file_loader.available_source_files()
# Preview the first entry only. Guard the index so an empty result does not
# raise IndexError (presumably the result is a list and can be empty when
# nothing is found - confirm against SourceFilesToDatabase).
if all_available_files:
    print(all_available_files[0])
else:
    print("No source files available")

table_name='claim_raw' append_or_replace='append' files=[CsvFileInfoOverride(file_path=PosixPath('/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1/YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1.csv.gz'), file_dir='/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1', file_name='YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1.csv.gz', file_name_without_extension='YouTube_CaravanAffiliates_M_20230101_claim_raw_v1-1', file_version='v1-1', table_name='claim_raw', file_size_in_bytes=32868194, file_size='31.35 MB'), CsvFileInfoOverride(file_path=PosixPath('/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1/YouTube_CaravanInc_M_20230101_claim_raw_v1-1.csv.gz'), file_dir='/Users/jm/temp/caravan/Caravan Historical Data/2023/1.23/0_claim_raw_v1-1', file_name='YouTube_CaravanInc_M_20230101_claim_raw_v1-1.csv.gz', file_name_without_extension='YouTube_CaravanInc_M_20230101_claim_raw_v1-1', file_version='v1-1', table_name