### BAIXAR E CONSOLIDAR ARQUIVOS DA TISS HOSPITALAR

In [31]:
# Importar as bibliotecas necessárias
import requests
from bs4 import BeautifulSoup
import zipfile
import os
import pandas as pd


In [2]:
# Values used to build the standard directory layout, both remote and local.
# BASE_URL is the root that year/state sub-folders are appended to; YEARS and
# STATES are iterated together to form one '<year>/<state>/' folder per pair.
BASE_URL = 'https://dadosabertos.ans.gov.br/FTP/PDA/TISS/HOSPITALAR/'
YEARS = ['2020', '2021', '2022']
STATES = ['AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO']

In [5]:
# Criação das funções necessárias
def get_links_from_page(url, timeout=60):
    """Return the ``href`` value of every anchor found on the page at *url*.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        A list of href strings in document order. Anchors that carry no
        ``href`` attribute are skipped, so the list never contains ``None``.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    # href=True restricts the search to <a> tags that actually have an href.
    return [link['href'] for link in soup.find_all('a', href=True)]

def download_file(url, filename, timeout=60):
    """Stream the resource at *url* into the local file *filename*.

    Args:
        url: Address of the file to download.
        filename: Local path to write; opened in binary mode and overwritten.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under the target file name.
    """
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before creating the local file.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)


In [6]:
# Download the files (for this case it takes about 9 minutes).
# For every year/state pair: list the remote folder, download each archive
# whose name ends in 'HOSP_CONS.zip' into '<year>/<state>/', extract it there
# and remember the path of every CSV that was extracted.
all_files = []
for year in YEARS:
    for state in STATES:
        folder_url = BASE_URL + year + '/' + state + '/'
        local_dir = os.path.join(year, state)

        # Create the local directory; exist_ok makes re-running harmless.
        os.makedirs(local_dir, exist_ok=True)

        links = get_links_from_page(folder_url)
        for link in links:
            # Skip entries with no href (None/empty) and anything that is
            # not a HOSP_CONS.zip archive.
            if not link or not link.endswith('HOSP_CONS.zip'):
                continue

            zip_name = os.path.join(local_dir, link)
            full_link = folder_url + link
            download_file(full_link, zip_name)

            with zipfile.ZipFile(zip_name, 'r') as zip_ref:
                for member in zip_ref.namelist():
                    # extract() returns the path it actually wrote, so the
                    # recorded CSV paths come from the archive contents
                    # instead of being guessed from the zip file name.
                    extracted_path = zip_ref.extract(member, local_dir)
                    if extracted_path.lower().endswith('.csv'):
                        all_files.append(extracted_path)

In [7]:
# Append the CSVs: load every path collected in all_files, stack the frames
# into a single table with a fresh index, and write it to combined_data.csv.
# NOTE(review): read_csv runs with its default separator and encoding --
# confirm these match the downloaded files.
dfs = list(map(pd.read_csv, all_files))
final_df = pd.concat(objs=dfs, ignore_index=True)
final_df.to_csv('combined_data.csv', index=False)




