In [2]:
import pandas as pd
import requests
import os
import time


In [None]:
# URL template for the yearly CSV files served from dadosabertos.camara.leg.br.
# The {item} and {year} placeholders are filled in by download_item() through
# str.format(item=..., year=...).
url = 'https://dadosabertos.camara.leg.br/arquivos/{item}/csv/{item}-{year}.csv'


def download_item(year, item, overwrite_csv=False, overwrite_parquet=True, timeout=60):
    """Download one yearly CSV for ``item`` and convert it to Parquet.

    The CSV is stored under ``../data/{item}/csv`` and the Parquet file under
    ``../data/{item}/parquet`` (both relative to the current working
    directory); the directories are created when missing.

    Args:
        year: Year used to fill the ``{year}`` placeholder of the URL and of
            the file names.
        item: Dataset name used to fill the ``{item}`` placeholder of the URL,
            the directory names and the file names.
        overwrite_csv: When True, download the CSV even if it already exists
            locally.
        overwrite_parquet: When True, rebuild the Parquet file even if it
            already exists locally.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection fails instead of blocking forever.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
        Exception: Anything raised by ``requests.get`` (for example a timeout)
            or by pandas while reading/writing propagates unchanged.
    """
    source_dir = os.path.abspath(f'../data/{item}/csv')
    sink_dir = os.path.abspath(f'../data/{item}/parquet')
    url_formatted = url.format(item=item, year=year)

    os.makedirs(source_dir, exist_ok=True)
    os.makedirs(sink_dir, exist_ok=True)
    print(f"Directories ensured: {source_dir}, {sink_dir}")

    file_path = os.path.join(source_dir, f'{item}-{year}.csv')
    if overwrite_csv or not os.path.exists(file_path):
        print(f'Downloading {item}-{year}.csv...')
        response = requests.get(url_formatted, timeout=timeout)
        if response.status_code != 200:
            print(f'Failed to download file for year {year}: {response.status_code}')
            raise RuntimeError(
                f'Error downloading file for year {year}: {response.status_code}'
            )
        # Write to a temporary name and rename afterwards: an interrupted
        # write must not leave a truncated CSV that later runs would treat as
        # an already-downloaded file.
        tmp_path = f'{file_path}.part'
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, file_path)
        print(f'File saved to {file_path}')
    else:
        print(f'File for year {year} already exists.')

    # Convert to parquet and save to the sink directory.
    parquet_file_path = os.path.join(sink_dir, f'{item}-{year}.parquet')
    if overwrite_parquet or not os.path.exists(parquet_file_path):
        print(f'Converting {file_path} to Parquet format...')
        # low_memory=False makes pandas infer each column's dtype from the
        # whole file instead of chunk by chunk, avoiding mixed-type object
        # columns that cannot be serialised to Parquet.
        df = pd.read_csv(file_path, sep=";", low_memory=False).drop_duplicates()
        # Column names containing dots are rewritten with underscores.
        df.columns = [col.replace('.', '_') for col in df.columns]
        df.to_parquet(parquet_file_path, index=False)
        print(f'File saved to {parquet_file_path}')
    else:
        print(f'Parquet file for year {year} already exists.')
        
# Every dataset name this script knows about. Kept for reference: assign it to
# `items` below to process all of them.
all_items = [
    'proposicoes', 'proposicoesTemas', 'proposicoesAutores', 'eventos', 'eventosOrgaos',
    'eventosPresencaDeputados', 'eventosRequerimentos', 'votacoes', 'votacoesVotos',
    'votacoesOrientacoes', 'votacoesObjetos', 'votacoesProposicoes', 'licitacoes',
    'licitacoesContratos', 'licitacoesItens', 'licitacoesPedidos', 'licitacoesPropostas',
]
# Datasets actually processed by this run.
items = ['eventos']

# Walk the years from 2025 down to 2001 (the range end, 2000, is exclusive).
for year in range(2025, 2000, -1):
    for item in items:
        try:
            download_item(year, item, overwrite_csv=False, overwrite_parquet=True)
        except Exception as e:
            # Best effort: one failing year/dataset must not stop the rest,
            # but the failure is reported instead of being dropped silently.
            print(f'Skipping {item}-{year}: {e}')
        # Sleep after every attempt, failed ones included, to avoid
        # overwhelming the server.
        time.sleep(1)
    

Directories ensured: d:\projetos\pipeline-deputados\data\eventos\csv, d:\projetos\pipeline-deputados\data\eventos\parquet
File for year 2025 already exists.
Converting d:\projetos\pipeline-deputados\data\eventos\csv\eventos-2025.csv to Parquet format...
File saved to d:\projetos\pipeline-deputados\data\eventos\parquet\eventos-2025.parquet
Directories ensured: d:\projetos\pipeline-deputados\data\eventos\csv, d:\projetos\pipeline-deputados\data\eventos\parquet
File for year 2024 already exists.
Converting d:\projetos\pipeline-deputados\data\eventos\csv\eventos-2024.csv to Parquet format...
File saved to d:\projetos\pipeline-deputados\data\eventos\parquet\eventos-2024.parquet
Directories ensured: d:\projetos\pipeline-deputados\data\eventos\csv, d:\projetos\pipeline-deputados\data\eventos\parquet
File for year 2023 already exists.
Converting d:\projetos\pipeline-deputados\data\eventos\csv\eventos-2023.csv to Parquet format...
File saved to d:\projetos\pipeline-deputados\data\eventos\parque