In [None]:
import requests
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm, trange
from bs4 import BeautifulSoup as bs
import os
import sqlalchemy as sa
import zipfile
import dask.dataframe as dd

In [None]:
# Configuration

# Base URL of the remote directory listing the files are downloaded from.
urlbase = "https://dadosabertos.rfb.gov.br/CNPJ/"

# Static list of file names, kept for reference only (currently disabled):
# filelist = ["Cnaes.zip","LAYOUT_DADOS_ABERTOS_CNPJ.pdf","Paises.zip","Qualificacoes.zip","Simples.zip",
# "Empresas0.zip","Empresas1.zip","Empresas2.zip","Empresas3.zip","Empresas4.zip",
# "Empresas5.zip","Empresas6.zip","Empresas7.zip","Empresas8.zip","Empresas9.zip",
# "Estabelecimentos0.zip","Estabelecimentos1.zip","Estabelecimentos2.zip",
# "Estabelecimentos3.zip","Estabelecimentos4.zip","Estabelecimentos5.zip",
# "Estabelecimentos6.zip","Estabelecimentos7.zip","Estabelecimentos8.zip",
# "Estabelecimentos9.zip","Motivos.zip","Municipios.zip","Naturezas.zip",
# "Socios0.zip","Socios1.zip","Socios2.zip","Socios3.zip","Socios4.zip","Socios5.zip",
# "Socios6.zip","Socios7.zip","Socios8.zip","Socios9.zip"]

# Local working directory. The trailing slash matters: paths are built by
# plain string concatenation (dirpath + name).
dirpath = "downloads/"

# Database connection URIs; the PostgreSQL alternative is disabled.
#uri_postgres = 'postgresql+psycopg2://postgres:postgres@192.168.15.2/cnpj'
uri_sqlite = "sqlite:///base_receitaYYYYMMDD.db"

In [None]:
def mapp(fn, data, workers=8):
    """Apply ``fn`` to every item of ``data`` using a pool of threads.

    Args:
        fn: Callable taking a single item.
        data: Iterable of items to process.
        workers: Maximum number of worker threads.

    Returns:
        list: One result per item, in the order ``as_completed`` yields the
        futures (not necessarily the order of ``data``).
    """
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(fn, item) for item in data]
    # Leaving the ``with`` block waits for the pool to shut down, so every
    # future has already finished by the time the results are gathered.
    return [future.result() for future in as_completed(futures)]

In [None]:
def downloader(url, timeout=60):
    """Download ``url`` into the ``dirpath`` directory, showing a progress bar.

    The file name comes from the ``Content-Disposition`` header when it
    carries a ``filename=`` entry, otherwise from the last segment of the
    URL. Failures are reported on stdout instead of being raised, so a
    single bad URL does not abort a batch of downloads.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        list: ``[status, url]`` where ``status`` is the HTTP status code as a
        string, or the exception class name when no response was received.
    """
    status = None
    try:
        # The target directory may not exist on a fresh run.
        os.makedirs(dirpath, exist_ok=True)
        # The context manager releases the connection even on failure.
        with requests.get(url, stream=True, timeout=timeout) as r:
            status = str(r.status_code)
            if not r.ok:
                # Do not store an HTTP error page under the file's name.
                print(url, "-> HTTP", status)
                return [status, url]
            total = int(r.headers.get('content-length', 0))
            names = re.findall("filename=(.+)", r.headers.get("Content-Disposition", ""))
            if names:
                # Drop trailing header parameters and surrounding quotes.
                fname = names[0].split(";")[0].strip().strip("\"'")
            else:
                fname = url.split("/")[-1]
            # Keep only the final path component so the file stays in dirpath.
            fname = os.path.basename(fname)
            with open(dirpath+fname, 'wb') as file, tqdm(desc=fname, total=total, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
                # 64 KiB chunks keep the progress bar responsive without
                # paying per-kilobyte loop overhead on large files.
                for data in r.iter_content(chunk_size=64 * 1024):
                    bar.update(file.write(data))
    except Exception as inst:
        print(type(inst))    # exception class
        print(inst)          # exception message
        if status is None:
            # No response was received; report the error kind instead.
            status = type(inst).__name__
    return [status, url]

In [None]:
# Fetch the directory listing at ``urlbase`` and keep the link targets that
# contain a "." after their first character.
listing_response = requests.get(urlbase, timeout=60)
listing_response.raise_for_status()  # fail loudly instead of parsing an error page
filelist = [
    tag['href']
    # An explicit parser avoids bs4's "guessed parser" warning and gives the
    # same result on every machine; href=True skips anchors without a target.
    for tag in bs(listing_response.content, "html.parser").find_all('a', href=True)
    if tag['href'].find(".") > 0
]

In [None]:
# Absolute download URL for every file name found in the listing.
urllist = [f"{urlbase}{name}" for name in filelist]

In [None]:
# Ask for confirmation before starting the bulk download; ``res`` holds the
# per-URL results, or the string "Cancelado" when the user declines.
if input("Tem certeza? S/N").upper() == "S":
    res = mapp(downloader, urllist)
else:
    res = "Cancelado"

In [None]:
# If any download fails, just call the downloader again for that URL, e.g.:
# downloader('https://dadosabertos.rfb.gov.br/CNPJ/Socios3.zip')

In [None]:
# Snapshot of the file names currently present in the download directory.
dirlist = os.listdir(dirpath)

engine = sa.create_engine(uri_sqlite)
# Smoke test: open a connection and close it again right away.
with engine.connect():
    pass

In [None]:
def upload(engine, local, table, colunas):
    """Recreate ``table`` and fill it from the ';'-separated file(s) at ``local``.

    Any existing table with that name is dropped first. The data is then read
    with dask (no header row, latin1 encoding, every column as string) and
    appended to the table one partition at a time.

    Args:
        engine: SQLAlchemy engine of the target database.
        local: Path or glob pattern handed to ``dd.read_csv``.
        table: Name of the destination table.
        colunas: Column names, in file order.
    """
    # Double any embedded quote so the name stays a single quoted identifier.
    quoted_table = '"' + table.replace('"', '""') + '"'
    # engine.begin() commits the DROP on success, and sa.text() is required
    # because SQLAlchemy 2.x no longer accepts raw SQL strings in execute().
    with engine.begin() as c:
        c.execute(sa.text("DROP TABLE IF EXISTS " + quoted_table))

    read_options = dict(header=None, encoding='latin1', sep=";", names=colunas, dtype=str)
    if local.find('.zip') >= 0:
        # Zipped input is read with block splitting disabled.
        read_options['blocksize'] = None
    df = dd.read_csv(local, **read_options)

    # Load partition by partition so only one partition is in memory at once.
    for n in trange(df.npartitions, desc=table):
        df.get_partition(n).compute().to_sql(name=table, con=engine, if_exists='append', chunksize=1000, index=False)
    # Optional index on the first column (disabled):
    # with engine.connect() as c:
    #     c.execute("CREATE INDEX IF NOT EXISTS idx_" + table + " ON \"" + table + "\" (\"" + df.columns[0] + "\")")

In [None]:
# Small lookup tables, loaded straight from their .zip archives.
# Each entry is (archive name, destination table, column names).
lookup_tables = [
    ('Municipios.zip', 'MUNICIPIO', ["CD_MUNICIPIO", "MUNICIPIO"]),
    ('Cnaes.zip', 'CNAE', ["CNAE", "NM_CNAE"]),
    ('Paises.zip', 'PAIS', ["CD_PAIS", "PAIS"]),
    ('Naturezas.zip', 'NATJU', ["CD_NAT_JURIDICA", "NAT_JURIDICA"]),
    ('Motivos.zip', 'MOTIVOSIT', ["CD_MOTIVO_SIT_CADASTRO", "MOTIVO_SIT_CADASTRO"]),
    ('Qualificacoes.zip', 'QUALS', ["CD_QUALS", "NM_QUALS"]),
]

# One loop replaces six copy-pasted cells; the calls and their order are the same.
for arquivo, table, colunas in lookup_tables:
    local = dirpath + arquivo
    upload(engine, local, table, colunas)

In [None]:
def extract(flist, filtro):
    """Unpack every archive in ``flist`` whose name starts with ``filtro``.

    Archives are read from, and extracted into, the ``dirpath`` directory.

    Args:
        flist: File names (relative to ``dirpath``) to choose from.
        filtro: Prefix a file name must start with to be selected.

    Returns:
        list: ``dirpath``-prefixed paths of every file member of the selected
        archives, so that all extracted files can be deleted afterwards.
    """
    filenames = []
    for filename in flist:
        if not filename.startswith(filtro):
            continue
        # The context manager closes the archive handle once it is unpacked.
        with zipfile.ZipFile(dirpath + filename) as archive:
            # Report every file member, not just the first, so none is left
            # behind on cleanup; directory entries are skipped.
            filenames.extend(
                dirpath + member.filename
                for member in archive.infolist()
                if not member.is_dir()
            )
            archive.extractall(path=dirpath)
    return filenames

def remove(flist):
    """Delete each path listed in ``flist`` from disk, in order."""
    for path in flist:
        os.unlink(path)

In [None]:
# SIMPLES table (original note: takes about 9 minutes)
lista = extract(dirlist, "Simples")
local = dirpath + '*.SIMPLES.*'
colunas = ["CNPJ_BASICO","OP_SIMPLES","DT_OP_SIMPLES","DT_EXC_SIMPLES","OP_MEI","DT_OP_MEI","DT_EXC_MEI"]
table = 'SIMPLES'
try:
    upload(engine, local, table, colunas)
finally:
    # Delete the extracted files even when the load fails, so they are not
    # left behind on disk.
    remove(lista)

In [None]:
%%time
lista= extract(dirlist,"Empre")
#empresa
colunas = ["CNPJ_BASICO", "RAZAO_SOCIAL", "NAT_JURIDICA", "QUAL_RESP", "CAPITAL_SOCIAL", "PORTE_EMPRESA", "ENTE_FED_RESP"]
local=dirpath+'*.EMPRECSV'
table='EMPRESA'
upload(engine, local, table, colunas)
remove(lista)

In [None]:
# SOCIO table
lista = extract(dirlist, "Socios")
colunas = ["CNPJ_BASICO","ID_TIPO_SOCIO","NOME_OU_RAZAO_SOCIAL","CNPJ_CPF","QUALIF_SOCIO",
           "DT_ENTRADA","CD_PAIS","REPR_LEGAL","NM_REPR","CD_QUALIF_REPR","FAIXA_ETARIA"]
local = dirpath + '*.SOCIOCSV'
table = 'SOCIO'
try:
    upload(engine, local, table, colunas)
finally:
    # Delete the extracted files even when the load fails, so they are not
    # left behind on disk.
    remove(lista)

In [None]:
# ESTABELE table
lista = extract(dirlist, "Estabelecimentos")
colunas = ["CNPJ_BASICO", "CNPJ_ORDEM", "CNPJ_DV", "CD_MATRIZ_FILIAL", "NM_FANTASIA",
           "CD_SIT_CADASTRO", "DT_SIT_CADASTRO","MOTIVO_SIT_CADASTRO","NM_CIDADE_EXT", "CD_PAIS",
           "DT_INI", "CNAE_PRINCIPAL","CNAE_SECUNDARIO",
           "TIP_LOGRADOURO","LOGRADOURO","NUMERO","COMPLEMENTO","BAIRRO","CEP","UF","MUNICIPIO",
           "DDD_1","TEL_1","DDD_2","TEL_2","DDD_FAX","FAX","EMAIL","SIT_ESP","DT_SIT_ESP"]
local = dirpath + '*.ESTABELE'
table = 'ESTABELE'
try:
    upload(engine, local, table, colunas)
finally:
    # Delete the extracted files even when the load fails, so they are not
    # left behind on disk.
    remove(lista)

In [None]:
# After confirmation, delete the files from the ``dirlist`` snapshot whose
# name contains ".zip". Kept as a bare expression so the notebook still
# displays "Cancelado" when the user declines.
(
    remove([dirpath + name for name in dirlist if name.find(".zip") > 0])
    if input("Tem certeza? S/N").upper() == "S"
    else "Cancelado"
)