In [35]:
import os
import sys

import pandas as pd
from pydantic import BaseModel
from enum import Enum
from datetime import datetime

from tdd import validate_and_convert_to_df # import do tdd (contrato de dados)

# Directories are located relative to the current working directory:
# the project root is three levels up, the data folder is four levels up
# and then inside "data".
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, *([os.pardir] * 3)))
data_dir = os.path.abspath(
    os.path.join(current_dir, *([os.pardir] * 4), "data")
)
# Put the project root on the import path (needed for the `backend` import
# that follows).
sys.path.append(parent_dir)

from backend.utils.functions import read_json

In [None]:
def metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Attach processing metadata columns to ``df``.

    Three columns are added (or overwritten), each holding the same value on
    every row:

    * ``date``   -- processing date formatted as ``YYYY-MM-DD``
    * ``hour``   -- processing time formatted as ``HH:MM:SS``
    * ``source`` -- the constant ``'openbrewerydb'``

    Args:
        df: Frame to annotate. It is modified in place.

    Returns:
        The same ``df`` object, with the metadata columns set.
    """
    # Read the clock once so that `date` and `hour` always describe the same
    # instant; two separate reads could straddle midnight and disagree.
    processed_at = datetime.now()
    df['date'] = processed_at.strftime('%Y-%m-%d')
    df['hour'] = processed_at.strftime('%H:%M:%S')
    df['source'] = 'openbrewerydb'
    return df

In [None]:
# Categories for the 'brewery_type' column. Every member's value is equal to
# its name, and members are also `str` instances (str mixin).
BreweryType = Enum(
    "BreweryType",
    [
        (kind, kind)
        for kind in (
            "micro",
            "large",
            "closed",
            "brewpub",
            "proprietor",
            "contract",
            "regional",
            "planning",
            "taproom",
            "bar",
            "nano",
            "beergarden",
            "location",
        )
    ],
    type=str,
)


class Brewery(BaseModel):
    """
    Schema of a brewery record as returned by the API.

    Fields annotated with ``| None`` may hold ``None``. No field declares a
    default value.
    """

    id: str
    name: str
    # NOTE(review): the union with plain `str` presumably lets values outside
    # BreweryType through -- confirm whether strict enum validation is wanted.
    brewery_type: str | BreweryType
    address_1: str | None
    address_2: str | None
    address_3: str | None
    city: str
    # `state` used to be declared a second time further down; the redundant
    # declaration was removed (field order is unchanged).
    state: str
    postal_code: str | None
    country: str
    longitude: float | None
    latitude: float | None
    phone: str | None
    website_url: str | None
    street: str | None

In [None]:
# Read the previously downloaded JSON content from the bronze layer.
data = read_json(name_file="breweries_all", file_path=os.path.join(data_dir, "bronze"))

In [None]:
# Run the data-contract helper from `tdd` on the raw records, using the
# Brewery model as the schema; the result is bound to `df`.
df = validate_and_convert_to_df(model=Brewery, data=data)

In [None]:
# Remove duplicated records: rows sharing an 'id' keep only their first
# occurrence.
df = df.drop_duplicates(subset="id", keep="first")

In [None]:
# Add the processing metadata columns (date, hour, source).
df = metadata(df)

In [None]:
# Write the silver-layer dataset as Parquet, partitioned by country and state.
output_path = os.path.join(data_dir, "silver", "breweries.parquet")
try:
    df.to_parquet(
        path=output_path,
        engine='pyarrow',
        partition_cols=['country', 'state'],
        index=False
    )
except Exception as err:
    # Top-level boundary of the script: report which output failed, then
    # re-raise so the original exception type and traceback are preserved.
    print(
        f"Failed to write parquet dataset to {output_path}: {err}",
        file=sys.stderr,
    )
    raise