# 08 - Fluxo de dados no Hugging Face (upload de datasets processados)

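Este notebook envia os arquivos processados (df_clean, df_log, df_capped, os splits de treino/teste gerados a partir do fold 0, os folds estratificados e, se existir, selected_features.csv) para um repositório de dataset no HF Hub.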



In [11]:
# Config e imports
import os
from pathlib import Path
from huggingface_hub import HfApi, create_repo, HfFolder

# Carrega variáveis do .env
from dotenv import load_dotenv
load_dotenv(override=True)

def get_env_strip_quotes(key: str) -> str:
    """Return the environment variable ``key`` without surrounding quotes.

    Any run of single or double quote characters at either end of the value
    is removed; quotes in the middle are kept. An unset variable yields an
    empty string.
    """
    raw_value = os.environ.get(key, '')
    return raw_value.strip("'\"")

# Token is read through the quote-stripping helper so a value written as
# HF_TOKEN="hf_..." in the .env file still authenticates.
HF_TOKEN = get_env_strip_quotes('HF_TOKEN')
# Same normalisation for the repo id; an unset or empty value falls back to
# the default dataset repo instead of producing an empty repo id.
HF_PROCESSED_REPO = get_env_strip_quotes('HF_PROCESSED_REPO').strip() or 'henriquebap/wine-ml-processed'

# HF_PRIVATE keeps accepting integers (0 -> public, non-zero -> private) and
# additionally understands common boolean spellings. Unset or empty means
# private, which is the safer default for an upload.
_private_raw = get_env_strip_quotes('HF_PRIVATE').strip().lower()
if _private_raw in ('', 'true', 'yes', 'on'):
    HF_PRIVATE = True
elif _private_raw in ('false', 'no', 'off'):
    HF_PRIVATE = False
else:
    try:
        HF_PRIVATE = bool(int(_private_raw))
    except ValueError as exc:
        raise ValueError(
            f'HF_PRIVATE inválido: {_private_raw!r}. Use 1/0 ou true/false.'
        ) from exc

if not HF_TOKEN:
    raise ValueError('Defina HF_TOKEN no .env para autenticar no HF Hub.')

# Save the token locally (kept from the original flow) and build the API
# client with the token passed explicitly, so the client does not depend on
# the on-disk copy.
HfFolder.save_token(HF_TOKEN)
api = HfApi(token=HF_TOKEN)

# Create the dataset repo, or reuse it when it already exists (exist_ok=True).
create_repo(repo_id=HF_PROCESSED_REPO, private=HF_PRIVATE, exist_ok=True, token=HF_TOKEN, repo_type='dataset')
print('Repo criado/reusado:', HF_PROCESSED_REPO)

# Two places where the processed data may live, both relative to the current
# working directory: the parent folder (../data/processed) and the working
# directory itself (./data/processed).
root_base = Path.cwd().parent / 'data' / 'processed'
nb_base = Path.cwd() / 'data' / 'processed'
# The working-directory location is tried first. The first candidate holding
# df_capped.csv or stratified_folds.json wins.
candidates = [nb_base, root_base]
base = next(
    (
        cand
        for cand in candidates
        if any((cand / marker).exists() for marker in ('df_capped.csv', 'stratified_folds.json'))
    ),
    None,
)
if base is None:
    # No candidate has the marker files: use whichever directory exists,
    # defaulting to the parent-folder location.
    base = nb_base if nb_base.exists() else root_base

# Files expected inside the chosen base directory.
files = [
    base / filename
    for filename in (
        'df_clean.csv',
        'df_log.csv',
        'df_capped.csv',
        'df_capped_train.csv',
        'df_capped_test.csv',
        'stratified_folds.json',
    )
]

# Optional EDA artifact: look under the parent folder first, then under the
# working directory. It is appended only when it is actually present.
_features_rel = Path('reports') / 'eda' / 'selected_features.csv'
extra = Path.cwd().parent / _features_rel
if not extra.exists():
    extra = Path.cwd() / _features_rel
if extra.exists():
    files.append(extra)

print('Base local:', base)
print('Repo alvo:', HF_PROCESSED_REPO)
for local_file in files:
    print('local file:', local_file, 'exists:', local_file.exists())


Repo criado/reusado: henriquebap/wine-ml-processed
Base local: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed
Repo alvo: henriquebap/wine-ml-processed
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed/df_clean.csv exists: True
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed/df_log.csv exists: True
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed/df_capped.csv exists: True
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed/df_capped_train.csv exists: True
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data/processed/df_capped_test.csv exists: True
local file: /Users/henriquebap/Pessoal/Personal - Projects/Wine_MLProject/wine-ml-app/notebooks/data

In [12]:
# Gera CSVs de train/test a partir dos folds
import json
import pandas as pd
# Build train/test CSVs for the capped dataset from the stored folds.
# The cell is best-effort: when an input is missing, or the folds file holds
# no folds, it prints a message and skips instead of raising.
folds_path = base / 'stratified_folds.json'
df_capped_path = base / 'df_capped.csv'
if folds_path.exists() and df_capped_path.exists():
    df = pd.read_csv(df_capped_path)
    # Read the JSON with an explicit encoding rather than the platform default.
    with open(folds_path, 'r', encoding='utf-8') as f:
        folds = json.load(f)
    if folds:
        # Fold 0 is used as a didactic example split.
        fold0 = folds[0]
        tr_idx, te_idx = fold0['train_idx'], fold0['test_idx']
        # The stored indices are applied positionally (iloc) to df_capped.csv.
        df.iloc[tr_idx].to_csv(base / 'df_capped_train.csv', index=False)
        df.iloc[te_idx].to_csv(base / 'df_capped_test.csv', index=False)
        print('Splits salvos: df_capped_train.csv, df_capped_test.csv')
    else:
        # An empty folds file used to fail with IndexError on folds[0].
        print('Pulando geração de splits: stratified_folds.json não contém folds')
else:
    print('Pulando geração de splits: arquivos não encontrados')


Splits salvos: df_capped_train.csv, df_capped_test.csv


In [13]:
# Upload batch com mapeamento para subpastas (raw/log/processed/folds/eda)
from huggingface_hub import CommitOperationAdd

# Which subset of the processed files to upload: 'all' (default), 'full',
# 'train' or 'test'. Surrounding whitespace and stray quotes are removed so a
# value such as HF_SPLIT="train " still selects the train split instead of
# being treated as an unrecognised value.
HF_SPLIT = (os.getenv('HF_SPLIT') or '').strip().strip('\'"').strip().lower() or 'all'

def map_path_in_repo(p: str) -> str:
    """Map a local file path to its destination path inside the dataset repo.

    Only the file name is looked at. Known names are routed to a fixed
    location; any other name is placed under ``other/``.
    """
    destinations = {
        'df_clean.csv': 'raw/df_clean.csv',
        'df_log.csv': 'log/df_log.csv',
        'df_capped.csv': 'processed/full.csv',
        'df_capped_train.csv': 'processed/train.csv',
        'df_capped_test.csv': 'processed/test.csv',
        'stratified_folds.json': 'folds/stratified_folds.json',
        'selected_features.csv': 'eda/selected_features.csv',
    }
    filename = Path(p).name
    return destinations.get(filename, f'other/{filename}')

def include_by_split(path: Path) -> bool:
    """Tell whether ``path`` should be uploaded under the current ``HF_SPLIT``.

    For 'full', 'train' and 'test' only the matching capped CSV is accepted.
    For 'all', and for any value not listed here, every file is accepted.
    """
    filename = path.name
    required_name = {
        'full': 'df_capped.csv',
        'train': 'df_capped_train.csv',
        'test': 'df_capped_test.csv',
    }.get(HF_SPLIT)
    if required_name is None:
        # 'all' or an unrecognised split value: nothing is filtered out.
        return True
    return filename == required_name

# Stage one add-operation for every local file that exists and passes the
# split filter; everything else is reported as skipped.
ops = []
for local_file in files:
    if not (local_file.exists() and include_by_split(local_file)):
        print('skip (missing or filtered):', local_file.name)
        continue
    repo_path = map_path_in_repo(str(local_file))
    ops.append(CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=str(local_file)))
    print('staged:', local_file.name, '->', repo_path)

# Send all staged files in a single commit (no pull request).
if not ops:
    print('Nenhum arquivo para commit')
else:
    api.create_commit(
        repo_id=HF_PROCESSED_REPO,
        repo_type='dataset',
        operations=ops,
        commit_message=f"Add {len(ops)} processed files (split={HF_SPLIT})",
        token=HF_TOKEN,
        create_pr=False,
    )
    print(f'✅ Commit enviado: {len(ops)} arquivos')

print('Concluído. Abra:', f'https://huggingface.co/datasets/{HF_PROCESSED_REPO}')


staged: df_clean.csv -> raw/df_clean.csv
staged: df_log.csv -> log/df_log.csv
staged: df_capped.csv -> processed/full.csv
staged: df_capped_train.csv -> processed/train.csv
staged: df_capped_test.csv -> processed/test.csv
staged: stratified_folds.json -> folds/stratified_folds.json
✅ Commit enviado: 6 arquivos
Concluído. Abra: https://huggingface.co/datasets/henriquebap/wine-ml-processed
