# Download

In [2]:
import requests
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm, trange
from bs4 import BeautifulSoup as bs
import os
import sqlalchemy as sa
import zipfile
import dask.dataframe as dd

In [3]:
# Configuration
urlbase = "https://dadosabertos.rfb.gov.br/CNPJ/"  # index page whose links are downloaded
dirpath = "downloads/"  # this directory must already exist
uri_sqlite = 'sqlite:///base_receitaYYYYMMDD.db'  # SQLAlchemy URI of the target database

In [4]:
def mapp(fn, data, workers=8):
    """Apply ``fn`` to every item of ``data`` on a thread pool and collect the results.

    Results are gathered with ``as_completed``, so their order is not guaranteed
    to match the order of ``data``. An exception raised by ``fn`` propagates
    from here when its result is collected.
    """
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(fn, item) for item in data]
    # Leaving the ``with`` block waits for every submitted call to finish.
    return [future.result() for future in as_completed(futures)]

In [5]:
def downloader(url):
    """Download ``url`` into ``dirpath``, showing a progress bar.

    The file name is taken from the ``Content-Disposition`` header when it
    carries a ``filename=`` value, otherwise from the last path segment of
    the URL.

    Errors (subclasses of ``Exception``) are printed instead of raised, so one
    failed file does not abort a batch run. ``KeyboardInterrupt`` is allowed
    to propagate.

    Returns:
        ``[status, url]`` where ``status`` is the HTTP status code as a string,
        or ``"ERR"`` when the request failed before any response was received.
    """
    # Reported when requests.get() itself fails and there is no response object.
    status = "ERR"
    try:
        # The context manager releases the connection even if the transfer breaks off.
        with requests.get(url, stream=True) as r:
            status = str(r.status_code)
            total = int(r.headers.get('content-length', 0))
            found = re.findall("filename=(.+)", r.headers.get("Content-Disposition", ""))
            # Drop optional quotes around the header value; fall back to the URL's last segment.
            fname = found[0].strip().strip('"') if found else url.split("/")[-1]
            with open(dirpath+fname, 'wb') as file, tqdm(desc=fname, total=total, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
                for data in r.iter_content(chunk_size=1024):
                    size = file.write(data)
                    bar.update(size)
    except Exception as inst:
        print(inst)
    return [status, url]

In [6]:
# Scrape the index page at `urlbase` and keep the link targets that contain a "."
# after their first character (presumably file names rather than directory or
# sort links -- confirm against the live listing).
# - 'html.parser' is named explicitly so the result does not depend on which
#   parsers happen to be installed.
# - href=True skips <a> tags without an href attribute instead of raising KeyError.
filelist = [
    tag['href']
    for tag in bs(requests.get(urlbase).content, 'html.parser').find_all('a', href=True)
    if tag['href'].find(".") > 0
]

In [7]:
# Absolute download URL for every file name found on the index page.
urllist = list(map(lambda name: urlbase + name, filelist))

In [None]:
# Ask for confirmation before the bulk download; any answer other than "S"/"s" cancels.
if input("Tem certeza? S/N").upper() == "S":
    res = mapp(downloader, urllist)
else:
    res = "Cancelado"

In [43]:
# if any download fails, just call downloader again on that URL
# downloader(urllist[0])

# Upload

## Funções

In [8]:
# Snapshot of the file names currently in the download directory.
dirlist = os.listdir(dirpath)
# Engine for the target database.
engine = sa.create_engine(uri_sqlite)
# Smoke test: open and immediately close one connection so connection problems surface here.
with engine.connect():
    pass

In [9]:
def upload(local, table, dcol):
    """Load delimited text file(s) into SQL table ``table``, replacing any existing table.

    Args:
        local: path or glob pattern handed to ``dask.dataframe.read_csv``.
        table: destination table name; it is dropped and re-created. The name is
            interpolated into the SQL text, so pass trusted names only.
        dcol: mapping of column name -> dtype. The key order supplies the column
            names, because the files are read with ``header=None``.

    After loading, an index named ``idx_<table>`` is created on the first column.
    """
    colunas = list(dcol.keys())
    # Paths containing '.zip' are read with blocksize=None; everything else uses
    # dask's default block size.
    block = None if '.zip' in local else 'default'
    # Build the dataframe before touching the database, so a bad path does not
    # leave the existing table dropped.
    df = dd.read_csv(local, header=None, encoding='latin1', sep=";", decimal=",", names=colunas, dtype=dcol, blocksize=block)
    # sa.text() + engine.begin(): SQLAlchemy 2.x rejects plain SQL strings in
    # execute() and no longer autocommits; begin() commits when the block exits.
    with engine.begin() as c:
        c.execute(sa.text('DROP TABLE IF EXISTS "' + table + '"'))
    # Write one partition at a time so only a single partition is materialised in memory.
    for n in trange(df.npartitions, desc=table):
        df.get_partition(n).compute().to_sql(name=table, con=engine, if_exists='append', chunksize=1000, index=False)
    with engine.begin() as c:
        c.execute(sa.text('CREATE INDEX IF NOT EXISTS idx_' + table + ' ON "' + table + '" ("' + df.columns[0] + '")'))

In [13]:
def extract(flist,filtro):
    """Unzip every archive in ``flist`` whose name starts with ``filtro`` into ``dirpath``.

    Args:
        flist: file names (relative to ``dirpath``) to choose from.
        filtro: prefix an archive name must start with to be extracted.

    Returns:
        Paths (``dirpath`` + member name) of every file extracted, so the
        caller can delete them afterwards.
    """
    filenames = []
    for filename in flist:
        if not filename.startswith(filtro):
            continue
        # The context manager closes the archive handle once extraction is done.
        with zipfile.ZipFile(dirpath + filename) as archive:
            archive.extractall(path=dirpath)
            # Report every extracted file (not just the first member) so none is left behind.
            filenames.extend(dirpath + member.filename
                             for member in archive.infolist()
                             if not member.is_dir())
    return filenames

def remove(flist):
    """Delete each path in ``flist`` from the filesystem, in order.

    Stops at the first path that cannot be deleted and lets the ``OSError`` propagate.
    """
    for path in flist:
        os.unlink(path)

## Começo do Upload

In [53]:
# Load Municipios.zip into table MUNICIPIO (columns: CD_MUNICIPIO, MUNICIPIO).
table = 'MUNICIPIO'
dcol = {"CD_MUNICIPIO": "int", "MUNICIPIO": "str"}
local = dirpath + 'Municipios.zip'
upload(local, table, dcol)

MUNICIPIO:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load Cnaes.zip into table CNAE (columns: CNAE, NM_CNAE).
table = 'CNAE'
dcol = {"CNAE": "int", "NM_CNAE": "str"}
local = dirpath + 'Cnaes.zip'
upload(local, table, dcol)

In [66]:
# Load Paises.zip into table PAIS (columns: CD_PAIS, PAIS).
table = 'PAIS'
dcol = {"CD_PAIS": "int", "PAIS": "str"}
local = dirpath + 'Paises.zip'
upload(local, table, dcol)

PAIS:   0%|          | 0/1 [00:00<?, ?it/s]

In [70]:
# Load Naturezas.zip into table NATJU (columns: CD_NAT_JURIDICA, NAT_JURIDICA).
table = 'NATJU'
dcol = {"CD_NAT_JURIDICA": "int", "NAT_JURIDICA": "str"}
local = dirpath + 'Naturezas.zip'
upload(local, table, dcol)

NATJU:   0%|          | 0/1 [00:00<?, ?it/s]

In [73]:
# Load Motivos.zip into table MOTIVOSIT (columns: CD_MOTIVO_SIT_CADASTRO, MOTIVO_SIT_CADASTRO).
# The schema must be bound to `dcol` -- the name actually passed to upload() -- otherwise
# the call reuses whatever `dcol` an earlier cell left behind and the table gets the
# wrong column names.
local=dirpath+'Motivos.zip'
dcol={"CD_MOTIVO_SIT_CADASTRO":"int","MOTIVO_SIT_CADASTRO":"str"}
table='MOTIVOSIT'
upload(local, table, dcol)

MOTIVOSIT:   0%|          | 0/1 [00:00<?, ?it/s]

In [76]:
# Load Qualificacoes.zip into table QUALS (columns: CD_QUALS, NM_QUALS).
table = 'QUALS'
dcol = {"CD_QUALS": "int", "NM_QUALS": "str"}
local = dirpath + 'Qualificacoes.zip'
upload(local, table, dcol)

QUALS:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Simples (takes about 9 min): unzip the "Simples*" archives, load the extracted
# files into table SIMPLES, then delete the extracted files.
lista = extract(dirlist, "Simples")
table = 'SIMPLES'
dcol = {
    "CNPJ_BASICO": "int",
    "OP_SIMPLES": "str",
    "DT_OP_SIMPLES": "int",
    "DT_EXC_SIMPLES": "int",
    "OP_MEI": "str",
    "DT_OP_MEI": "int",
    "DT_EXC_MEI": "int",
}
local = dirpath + '*.SIMPLES.*'
upload(local, table, dcol)
remove(lista)

In [147]:
%%time
# Companies: unzip the "Empre*" archives, load the extracted files into table
# EMPRESA, then delete the extracted files.
lista = extract(dirlist, "Empre")
table = 'EMPRESA'
local = dirpath + '*.EMPRECSV'
dcol = {
    "CNPJ_BASICO": "int",
    "RAZAO_SOCIAL": "str",
    "NAT_JURIDICA": "int",
    "QUAL_RESP": "int",
    "CAPITAL_SOCIAL": "float",
    "PORTE_EMPRESA": "Int64",
    "ENTE_FED_RESP": "str",
}
upload(local, table, dcol)
remove(lista)

EMPRESA:   0%|          | 0/13 [00:00<?, ?it/s]

Wall time: 2min 19s


In [None]:
# Partners: unzip the "Socios*" archives, load the extracted files into table
# SOCIO, then delete the extracted files.
lista = extract(dirlist, "Socios")
table = 'SOCIO'
local = dirpath + '*.SOCIOCSV'
dcol = {
    "CNPJ_BASICO": "int",
    "ID_TIPO_SOCIO": "int",
    "NOME_OU_RAZAO_SOCIAL": "str",
    "CNPJ_CPF": "str",
    "QUALIF_SOCIO": "int",
    "DT_ENTRADA": "int",
    "CD_PAIS": "Int64",
    "REPR_LEGAL": "str",
    "NM_REPR": "str",
    "CD_QUALIF_REPR": "int",
    "FAIXA_ETARIA": "int",
}
upload(local, table, dcol)
remove(lista)

In [None]:
# Establishments: unzip the "Estabelecimentos*" archives, load the extracted files
# into table ESTABELE, then delete the extracted files.
lista = extract(dirlist, "Estabelecimentos")
table = 'ESTABELE'
local = dirpath + '*.ESTABELE'
# Key order matters: it supplies the column names for the header-less files.
dcol = {
    "CNPJ_BASICO": "int",
    "CNPJ_ORDEM": "int",
    "CNPJ_DV": "int",
    "CD_MATRIZ_FILIAL": "int",
    "NM_FANTASIA": "str",
    "CD_SIT_CADASTRO": "int",
    "DT_SIT_CADASTRO": "int",
    "MOTIVO_SIT_CADASTRO": "int",
    "NM_CIDADE_EXT": "str",
    "CD_PAIS": "Int64",
    "DT_INI": "int",
    "CNAE_PRINCIPAL": "int",
    "CNAE_SECUNDARIO": "str",
    "TIP_LOGRADOURO": "str",
    "LOGRADOURO": "str",
    "NUMERO": "str",
    "COMPLEMENTO": "str",
    "BAIRRO": "str",
    "CEP": "Int64",
    "UF": "str",
    "MUNICIPIO": "int",
    "DDD_1": "Int64",
    "TEL_1": "Int64",
    "DDD_2": "Int64",
    "TEL_2": "Int64",
    "DDD_FAX": "Int64",
    "FAX": "str",
    "EMAIL": "str",
    "SIT_ESP": "Int64",
    "DT_SIT_ESP": "Int64",
}
upload(local, table, dcol)
remove(lista)

In [149]:
# After confirmation, delete every file in `dirlist` whose name contains ".zip"
# (past the first character). Kept as an expression so the cell still shows
# "Cancelado" when the answer is not "S"/"s".
(
    remove([dirpath + name for name in dirlist if name.find(".zip") > 0])
    if input("Tem certeza? S/N").upper() == "S"
    else "Cancelado"
)

Tem certeza? S/N S


## Testes

In [18]:
# First URL in urllist that contains "Estab" past its first character
# (IndexError if there is none).
t = [url for url in urllist if url.find("Estab") > 0][0]
t

'https://dadosabertos.rfb.gov.br/CNPJ/Estabelecimentos0.zip'

In [19]:
# Download the single URL held in `t`; the cell displays downloader's return value.
downloader(t)

Estabelecimentos0.zip:   0%|          | 0.00/855M [00:00<?, ?iB/s]

['200', 'https://dadosabertos.rfb.gov.br/CNPJ/Estabelecimentos0.zip']

In [20]:
# Re-list the download directory so files added since the previous listing are included.
dirlist= os.listdir(dirpath)

In [21]:
# Unzip the "Estabelecimentos*" archives; keep the returned paths for the later remove(lista).
lista= extract(dirlist,"Estabelecimentos")
# Glob pattern passed to dd.read_csv in the cells below.
local=dirpath+'*.ESTABELE'

In [22]:
# Column names, in positional order, for the header-less files read below (header=None).
colunas=["CNPJ_BASICO", "CNPJ_ORDEM", "CNPJ_DV", "CD_MATRIZ_FILIAL", "NM_FANTASIA",
         "CD_SIT_CADASTRO", "DT_SIT_CADASTRO","MOTIVO_SIT_CADASTRO","NM_CIDADE_EXT", "CD_PAIS",
         "DT_INI", "CNAE_PRINCIPAL","CNAE_SECUNDARIO",
         "TIP_LOGRADOURO","LOGRADOURO","NUMERO","COMPLEMENTO","BAIRRO","CEP","UF","MUNICIPIO",
         "DDD_1","TEL_1","DDD_2","TEL_2","DDD_FAX","FAX","EMAIL","SIT_ESP","DT_SIT_ESP"]

In [23]:
# Read every column as str (no dtype conversion) and preview the first rows.
df = dd.read_csv(local, header=None, encoding='latin1', sep=";", decimal=",", names=colunas, blocksize='default', dtype=str)
df.head()

Unnamed: 0,CNPJ_BASICO,CNPJ_ORDEM,CNPJ_DV,CD_MATRIZ_FILIAL,NM_FANTASIA,CD_SIT_CADASTRO,DT_SIT_CADASTRO,MOTIVO_SIT_CADASTRO,NM_CIDADE_EXT,CD_PAIS,...,MUNICIPIO,DDD_1,TEL_1,DDD_2,TEL_2,DDD_FAX,FAX,EMAIL,SIT_ESP,DT_SIT_ESP
0,37391175,1,87,1,JC PROMOCAO,8,20200720,1,,,...,427,91,83578917,,,,,RODIMILDONJUNIOR@GMAIL.COM,,
1,37391186,1,67,1,MANIA CAPOEIRA,2,20200611,0,,,...,7107,11,46918281,,,,,MANIACAPOEIRA@GMAIL.COM,,
2,37391199,1,36,1,,4,20221125,63,,,...,5847,22,92856729,,,,,PRISCILLAPBIO@GMAIL.COM,,
3,37391210,1,68,1,L.F SIQUEIRA,2,20200611,0,,,...,9373,62,32460698,,,,,LUDSIQUEIRA20@GMAIL.COM,,
4,37391218,1,24,1,EGUA DO CALOR DEPOSITO DE BEBIDAS,2,20200612,0,,,...,427,91,32299595,,,,,FABRICIOMTE2928@GMAIL.COM,,


In [44]:
# Name of the column at position 18.
df.columns[18]

'CEP'

In [31]:
# Preview only the columns from position 17 onward.
df[df.columns[17:]].head()

Unnamed: 0,BAIRRO,CEP,UF,MUNICIPIO,DDD_1,TEL_1,DDD_2,TEL_2,DDD_FAX,FAX,EMAIL,SIT_ESP,DT_SIT_ESP
0,PARQUE VERDE,66635210,PA,427,91,83578917,,,,,RODIMILDONJUNIOR@GMAIL.COM,,
1,JARDIM GUAPIRA,2281265,SP,7107,11,46918281,,,,,MANIACAPOEIRA@GMAIL.COM,,
2,PARQUE AEROPORTO,27963522,RJ,5847,22,92856729,,,,,PRISCILLAPBIO@GMAIL.COM,,
3,JARDIM ATLANTICO,74343570,GO,9373,62,32460698,,,,,LUDSIQUEIRA20@GMAIL.COM,,
4,CONDOR,66065158,PA,427,91,32299595,,,,,FABRICIOMTE2928@GMAIL.COM,,


In [53]:
# Disabled alternative: the schema used for the EMPRESA files.
# dcol = {"CNPJ_BASICO":"int", "RAZAO_SOCIAL":"str", "NAT_JURIDICA":"int", "QUAL_RESP":"int", "CAPITAL_SOCIAL":"float",
#         "PORTE_EMPRESA":"Int64", "ENTE_FED_RESP":"str"}

# Typed schema for the ESTABELE columns; key order must match the column order of the files.
dcol={"CNPJ_BASICO":"int", "CNPJ_ORDEM":"int", "CNPJ_DV":"int", "CD_MATRIZ_FILIAL":"int", "NM_FANTASIA":"str",
         "CD_SIT_CADASTRO":"int", "DT_SIT_CADASTRO":"int","MOTIVO_SIT_CADASTRO":"int","NM_CIDADE_EXT":"str", "CD_PAIS":"Int64",
         "DT_INI":"int", "CNAE_PRINCIPAL":"int","CNAE_SECUNDARIO":"str",
         "TIP_LOGRADOURO":"str","LOGRADOURO":"str","NUMERO":"str","COMPLEMENTO":"str","BAIRRO":"str","CEP":"Int64","UF":"str","MUNICIPIO":"int",
         "DDD_1":"Int64","TEL_1":"Int64","DDD_2":"Int64","TEL_2":"Int64","DDD_FAX":"Int64","FAX":"str","EMAIL":"str","SIT_ESP":"Int64","DT_SIT_ESP":"Int64"}

In [56]:
# Re-read the same files with the explicit dtypes in `dcol` and preview the first rows.
df = dd.read_csv(local, header=None, encoding='latin1', sep=";", decimal=",", names=colunas, blocksize='default', dtype=dcol)
df.head()

Unnamed: 0,CNPJ_BASICO,CNPJ_ORDEM,CNPJ_DV,CD_MATRIZ_FILIAL,NM_FANTASIA,CD_SIT_CADASTRO,DT_SIT_CADASTRO,MOTIVO_SIT_CADASTRO,NM_CIDADE_EXT,CD_PAIS,...,MUNICIPIO,DDD_1,TEL_1,DDD_2,TEL_2,DDD_FAX,FAX,EMAIL,SIT_ESP,DT_SIT_ESP
0,37391175,1,87,1,JC PROMOCAO,8,20200720,1,,,...,427,91,83578917,,,,,RODIMILDONJUNIOR@GMAIL.COM,,
1,37391186,1,67,1,MANIA CAPOEIRA,2,20200611,0,,,...,7107,11,46918281,,,,,MANIACAPOEIRA@GMAIL.COM,,
2,37391199,1,36,1,,4,20221125,63,,,...,5847,22,92856729,,,,,PRISCILLAPBIO@GMAIL.COM,,
3,37391210,1,68,1,L.F SIQUEIRA,2,20200611,0,,,...,9373,62,32460698,,,,,LUDSIQUEIRA20@GMAIL.COM,,
4,37391218,1,24,1,EGUA DO CALOR DEPOSITO DE BEBIDAS,2,20200612,0,,,...,427,91,32299595,,,,,FABRICIOMTE2928@GMAIL.COM,,


In [57]:
# Delete the files whose paths extract() returned into `lista`.
remove(lista)