In [34]:
import os
import sys
import re

from unidecode import unidecode

import pandas as pd
from pydantic import BaseModel
from enum import Enum
from datetime import datetime

from tdd import validate_and_convert_to_df # import do tdd (contrato de dados)

# Resolve project locations relative to the notebook's working directory.
current_dir = os.getcwd()
# Three levels up: added to sys.path so the `backend` import below can resolve.
parent_dir = os.path.abspath(os.path.join(current_dir, "..", "..", ".."))
# Four levels up, then into `data`: folder holding the "bronze"/"silver" layers.
data_dir = os.path.abspath(os.path.join(current_dir, "..", "..", "..", "..", "data"))
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from backend.utils.functions import read_json

In [57]:
def metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Stamp processing metadata onto ``df``.

    Adds three columns, each filled with a single constant value:

    * ``date``   - processing date formatted as ``YYYY-MM-DD``
    * ``hour``   - processing time formatted as ``HH:MM:SS``
    * ``source`` - the literal ``'openbrewerydb'``

    Args:
        df: Frame to annotate. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the extra columns.
    """
    # Read the clock once so `date` and `hour` describe the same instant;
    # two separate now() calls could straddle midnight and disagree.
    processed_at = datetime.now()
    df['date'] = processed_at.strftime('%Y-%m-%d')
    df['hour'] = processed_at.strftime('%H:%M:%S')
    df['source'] = 'openbrewerydb'
    return df

def clean_partition_value(s, default: str = ""):
    """Normalise a value into a token usable as a partition key.

    Steps applied to non-missing input:
      1. convert to ``str``
      2. transliterate accented characters with ``unidecode``
      3. trim, then collapse each run of whitespace into a single ``_``
      4. drop every character that is not ``a-z``, ``A-Z``, ``0-9`` or ``_``
      5. lower-case the result

    Args:
        s: Raw value (typically a cell of a DataFrame column).
        default: Value returned when ``s`` is missing or nothing survives
            the cleaning. It is returned as given, without being cleaned.
            Defaults to the empty string.

    Returns:
        The cleaned string, or ``default``.
    """
    # Treat every missing scalar (None, float NaN, pd.NA, NaT) the same way.
    # Without this check str(float('nan')) would yield the literal "nan".
    if s is None or (
        not isinstance(s, str) and pd.api.types.is_scalar(s) and pd.isna(s)
    ):
        return default

    text = unidecode(str(s))  # strip accents
    text = re.sub(r'\s+', '_', text.strip())  # whitespace runs -> '_'
    text = re.sub(r'[^a-zA-Z0-9_]', '', text)  # keep letters, digits, underscore
    text = text.lower()
    return text or default


In [58]:
class BreweryType(str, Enum): # Categorises the 'brewery_type' column
    """Enumeration of values for the ``brewery_type`` column.

    Mixing in ``str`` makes every member a string equal to its value.

    NOTE(review): the ``Brewery`` model in this notebook annotates
    ``brewery_type`` as plain ``str``, so this enum is not enforced there.
    """
    micro = "micro"
    large = "large"
    closed = "closed"
    brewpub = "brewpub"
    proprietor = "proprietor"
    contract = "contract"
    regional = "regional"
    planning = "planning"
    taproom = "taproom"
    bar = "bar"
    nano = "nano"
    beergarden = "beergarden"
    location = "location"


class Brewery(BaseModel):
    """
    Schema of a single brewery record as returned by the API.

    Fields annotated ``str | None`` / ``float | None`` may hold ``None``.
    """

    id: str
    name: str
    # Kept as plain `str`; the stricter `BreweryType` enum is not applied here.
    brewery_type: str
    address_1: str | None
    address_2: str | None
    address_3: str | None
    city: str
    state: str
    postal_code: str | None
    country: str
    longitude: float | None
    latitude: float | None
    phone: str | None
    website_url: str | None
    # `state` used to be declared a second time here; the repeated annotation
    # was redundant and has been removed (field order is unchanged).
    street: str | None

In [59]:
# Read the downloaded JSON content from the "bronze" folder.
bronze_dir = os.path.join(data_dir, "bronze")
data = read_json(file_path=bronze_dir, name_file="breweries_all")

In [60]:
# Run the validation helper over the raw records using the Brewery model;
# the result is kept in `df`.
df = validate_and_convert_to_df(data=data, model=Brewery)

In [61]:
# Remove duplicates: keep the first row seen for each `id`.
df = df.drop_duplicates(subset='id', keep='first')

In [62]:
# Normalise the columns that are later used as partition keys.
for partition_col in ('country', 'state'):
    df[partition_col] = df[partition_col].apply(clean_partition_value)

In [63]:
# Attach the processing metadata columns.
df = df.pipe(metadata)

In [64]:
# Quick sanity check: frame dimensions, then a preview of the first five rows.
print(df.shape)
df.head(n=5)

(8408, 18)


Unnamed: 0,id,name,brewery_type,address_1,address_2,address_3,city,state,postal_code,country,longitude,latitude,phone,website_url,street,date,hour,source
0,5128df48-79fc-4f0f-8b52-d06be54d0cec,(405) Brewing Co,micro,1716 Topeka St,,,Norman,oklahoma,73069-8224,united_states,-97.468182,35.257389,4058160490,http://www.405brewing.com,1716 Topeka St,2025-07-20,12:15:26,openbrewerydb
1,9c5a66c8-cc13-416f-a5d9-0a769c87d318,(512) Brewing Co,micro,407 Radam Ln Ste F200,,,Austin,texas,78745-1197,united_states,,,5129211545,http://www.512brewing.com,407 Radam Ln Ste F200,2025-07-20,12:15:26,openbrewerydb
2,34e8c68b-6146-453f-a4b9-1f6cd99a5ada,1 of Us Brewing Company,micro,8100 Washington Ave,,,Mount Pleasant,wisconsin,53406-3920,united_states,-87.883364,42.720108,2624847553,https://www.1ofusbrewing.com,8100 Washington Ave,2025-07-20,12:15:26,openbrewerydb
3,6d14b220-8926-4521-8d19-b98a2d6ec3db,10 Barrel Brewing Co,large,62970 18th St,,,Bend,oregon,97701-9847,united_states,-121.281706,44.086835,5415851007,http://www.10barrel.com,62970 18th St,2025-07-20,12:15:26,openbrewerydb
4,e2e78bd8-80ff-4a61-a65c-3bfbd9d76ce2,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,,,Bend,oregon,97703-2465,united_states,-121.328802,44.057565,5415851007,,1135 NW Galveston Ave Ste B,2025-07-20,12:15:26,openbrewerydb


In [67]:
# Write the frame as a parquet dataset partitioned by country and state.
silver_path = os.path.join(data_dir, "silver", "breweries.parquet")
try:
    df.to_parquet(
        path=silver_path,
        engine='pyarrow',
        partition_cols=['country', 'state'],
        index=False,
        # Forwarded by pandas to pyarrow's write_to_dataset. Without it, each
        # re-run adds new uniquely named files to the existing partitions and
        # the dataset ends up with duplicated rows. 'delete_matching' clears
        # every partition that is about to be written first.
        # NOTE(review): needs a pyarrow version whose write_to_dataset accepts
        # `existing_data_behavior` - confirm against the pinned version.
        existing_data_behavior='delete_matching',
    )
except Exception as exc:
    # Re-raise (never swallow) with the target path so the failure is traceable.
    raise RuntimeError(f"Failed to write parquet dataset to {silver_path!r}") from exc