In [20]:
import grequests
from random import randint, sample
from lxml.html import fromstring

### Algorithm for generating random CNPJs

In [2]:
def validate_digit(mult_array, digits_array):
    """Compute a single modulo-11 check digit.

    Every digit is multiplied by the weight found at the same position and
    the products are added up; pairing stops at the shorter sequence.

    Args:
        mult_array: sequence of integer weights.
        digits_array: sequence of integer digits.

    Returns:
        0 when the weighted sum modulo 11 is below 2, otherwise 11 minus
        that remainder.
    """
    weighted_total = sum(weight * digit for weight, digit in zip(mult_array, digits_array))
    remainder = weighted_total % 11
    return 0 if remainder < 2 else 11 - remainder


def generate_cnpj():
    """Generate a random 14-digit CNPJ string with valid check digits.

    The first 8 digits are random (zero-padded on the left), followed by the
    fixed '0001' suffix and two check digits computed with `validate_digit`.

    Returns:
        str: the 12 base digits followed by the two check digits.
    """
    # Integer bounds only: recent Python versions reject float arguments such
    # as 1e8, and an inclusive upper bound of 10**8 could produce a 9-digit
    # number, which would make the result 15 characters long.
    cnpj_12d = str(randint(1000, 10**8 - 1)).zfill(8) + '0001'
    cnpj_int_l = [int(d) for d in cnpj_12d]

    first_dig = validate_digit([5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2], cnpj_int_l)
    # The second check digit also covers the first one.
    sec_dig = validate_digit([6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2], cnpj_int_l + [first_dig])
    return cnpj_12d + str(first_dig) + str(sec_dig)


In [3]:
def write_to_txt(n, list_cnpjs):
    """Write every CNPJ on its own line of 'list_cnpjs.txt'.

    The file is overwritten on each call. It serves as the input list for
    the later data-fetching steps.

    Args:
        n: unused; kept so existing calls keep working.
        list_cnpjs: iterable of CNPJ values to write.
    """
    with open('list_cnpjs.txt', 'w') as out_file:
        out_file.writelines(f'{cnpj}\n' for cnpj in list_cnpjs)


# Generate 10000 candidates; collecting them in a set discards repeated
# values before the list is written to disk.
set_cnpjs = {generate_cnpj() for _ in range(10000)}
list_cnpjs = list(set_cnpjs)
write_to_txt(10000, list_cnpjs)

In [4]:
def read_txt():
    """Read 'list_cnpjs.txt' and return its lines as a list.

    Returns:
        list[str]: one entry per line, with leading and trailing whitespace
        (including the newline) stripped.
    """
    with open('list_cnpjs.txt', 'r') as in_file:
        return [raw_line.strip() for raw_line in in_file]
    
# Rebind list_cnpjs to the values stored in list_cnpjs.txt.
list_cnpjs = read_txt()

### Free Proxies

In [28]:
import requests, json
from time import time, sleep
from pymongo import MongoClient
from bs4 import BeautifulSoup, NavigableString, Tag

In [21]:
def fetch_proxies():
    """Scrape https://free-proxy-list.net/ and collect non-transparent proxies.

    Only the first 300 table rows are inspected. Errors raised by
    `requests.get` (including the timeout) are not caught here and propagate
    to the caller.

    Returns:
        set[str]: proxies formatted as 'ip:port'.
    """
    # Same 10 s limit as the other requests in this notebook; without a
    # timeout the call could block indefinitely on an unresponsive server.
    response = requests.get('https://free-proxy-list.net/', timeout=10)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:300]:  # this site provides up to 300 free proxies
        # We do not want transparent proxies, since they do not work to bypass APIs
        if not row.xpath('.//td[5][not(contains(text(),"transparent"))]'):
            continue
        ip = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        if not ip or not port:
            # A row without IP or port text cannot form a proxy address;
            # skip it instead of failing with IndexError.
            continue
        # IP + Port
        proxies.add(":".join([ip[0], port[0]]))
    return proxies

In [22]:
# Convert the scraped set to a list so it can be sliced by position later on.
list_proxies = list(fetch_proxies())
# Last expression of the cell: displays how many proxies were collected.
len(list_proxies)

281

In [26]:
def exception_handler(request, exception):
    """Print a failure notice for a request.

    Args:
        request: the request that failed; accepted but not used.
        exception: the error to report; its string form is printed.
    """
    message = " ".join(("Request failed", str(exception)))
    print(message)

def generate_requests(cnpjs, proxies):
    """Build one grequests GET per (cnpj, proxy) pair for parallel dispatch.

    CNPJs and proxies are paired by position; pairing stops at the shorter of
    the two sequences. Each request goes through its proxy for both http and
    https traffic and uses a 10 second timeout.

    Args:
        cnpjs: iterable of CNPJ strings to look up.
        proxies: iterable of 'ip:port' proxy addresses.

    Returns:
        list: request objects created by `grequests.get`, in pairing order.
    """
    # An earlier variant queried https://www.receitaws.com.br/v1/cnpj/{cnpj}.
    pending = []
    for cnpj, proxy in zip(cnpjs, proxies):
        proxy_url = f'http://{proxy}'
        pending.append(
            grequests.get(
                f'https://www.cnpja.com.br/search-companies?taxId={cnpj}',
                proxies={"http": proxy_url, "https": proxy_url},
                timeout=10,
            )
        )
    return pending


def generate_requests_noproxy(cnpjs):
    """Build one direct (proxy-less) grequests GET per CNPJ.

    Args:
        cnpjs: iterable of CNPJ strings to look up.

    Returns:
        list: request objects created by `grequests.get`, one per CNPJ, each
        with a 10 second timeout.
    """
    # Iterate over the CNPJs only: the function takes no proxy argument, so
    # zipping against a `proxies` name depended on a leaked global and could
    # silently drop CNPJs or raise NameError.
    return [grequests.get(f'https://www.cnpja.com.br/search-companies?taxId={cnpj}',
                          timeout=10)
            for cnpj in cnpjs]

In [10]:
def select_proxies(st, step, list_proxies):
    """Return up to `step` consecutive proxies starting at `st`, wrapping around.

    Args:
        st: start index; values beyond the list length wrap around.
        step: number of proxies wanted; capped at the list length so no
            proxy is returned twice.
        list_proxies: list of proxies to pick from.

    Returns:
        list: a new list with the selected proxies; empty when the input list
        is empty or `step` is not positive.
    """
    total = len(list_proxies)
    if total == 0 or step <= 0:
        # Nothing to select; also avoids a modulo-by-zero on an empty list.
        return []

    wanted = min(step, total)
    start = st % total
    end = start + wanted

    if end <= total:
        return list_proxies[start:end]
    # The window runs past the end: take the tail, then continue from the head.
    return list_proxies[start:] + list_proxies[:end - total]
    

In [15]:

def write_json_hd(r, cnpj):
    """Extract text lines from the response HTML with lxml and save them.

    The <br> elements are looked up under the div whose id contains
    'comp-k6fvlmw3'; when none are found, the div whose id contains
    'comp-k6fvknae' is tried. The text nodes following the first <br> (at
    most one more than the number of <br> elements found) are written, one
    per line and wrapped in '{' / '}', to './jsons/<cnpj>.json'.

    Args:
        r: response-like object exposing the page HTML as `.text`.
        cnpj: CNPJ used in the output file name and in the log message.

    Returns:
        None. When neither container yields <br> elements a message is
        printed and no file is written.
    """
    parser = fromstring(r.text)
    el = parser.xpath("//div[contains(@id, 'comp-k6fvlmw3')]//br")

    if not el:
        # Fall back to the alternative container.
        el = parser.xpath("//div[contains(@id, 'comp-k6fvknae')]/p/span/span/span/br")
        if not el:
            print(f'\nNo HTML Element found for {cnpj}')
            return

    # Lines regarding the email, which are protected against scripts (without js)
    email_fragments = ['\u200c\u200c "email": "', '[email\xa0protected]', '",']

    # Run the XPath query once instead of once per iteration, and slice so a
    # page with fewer text nodes than expected cannot raise IndexError and
    # leave a truncated file behind.
    following_text = el[0].xpath("./following::text()")[:len(el) + 1]

    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(f'./jsons/{cnpj}.json', 'w', encoding='utf-8') as f:
        f.write("{\n")
        for s in following_text:
            if s in email_fragments:
                continue

            s = s.replace(u'\u200c', u'').replace('\n', '')
            f.write(f"{s}\n")
        f.write("}\n")
        

In [11]:
def save_html(r, cnpj):
    """Save the raw HTML of a response to './raw_data/<cnpj>.html'.

    Args:
        r: response-like object exposing the page HTML as `.text`.
        cnpj: CNPJ used as the file name.
    """
    # Explicit UTF-8: with the platform default encoding, characters such as
    # U+200C in the page can fail to encode on non-UTF-8 locales.
    with open(f'./raw_data/{cnpj}.html', 'w', encoding='utf-8') as f:
        f.write(r.text)
        
        
def check_response(resp, chosen_cnpjs, list_cnpjs):
    """Persist every successful response and drop its CNPJ from the pending list.

    Responses and CNPJs are paired by position. Falsy entries (for example
    None) and responses whose status code is not 200 are skipped, so their
    CNPJs stay in `list_cnpjs`.

    Args:
        resp: iterable of response objects (or falsy placeholders).
        chosen_cnpjs: CNPJs that were requested, in the same order as `resp`.
        list_cnpjs: list of pending CNPJs; mutated in place.
    """
    for response, cnpj in zip(resp, chosen_cnpjs):
        if not response or response.status_code != 200:
            continue

        save_html(response, cnpj)
        # write_json_hd(response, cnpj) is the lxml-based alternative parser.
        parse_bs4_save(response, cnpj)
        list_cnpjs.remove(cnpj)
        

In [24]:
def parse_bs4_save(r, cnpj):
    """Extract text lines from the response HTML with BeautifulSoup and save them.

    The element with id 'comp-k6fvlmw3' is used when present, otherwise the
    element with id 'comp-k6fvknae'. For every <br> inside it, the text node
    right after the <br> is written to './jsons/<cnpj>.json' (wrapped in
    '{' / '}') when it is non-blank and is itself followed by another <br>.

    Args:
        r: response-like object exposing the page HTML as `.text`.
        cnpj: CNPJ used in the output file name and in the log message.

    Returns:
        None. When neither element exists a message is printed and no file
        is written.
    """
    soup = BeautifulSoup(r.text, 'html.parser')

    soup_full_data = soup.find(id='comp-k6fvlmw3')
    soup_not_registered = soup.find(id='comp-k6fvknae')

    if soup_full_data is None and soup_not_registered is None:
        print(f'\nNo tags Found cnpj: {cnpj}')
        return

    container = soup_full_data if soup_full_data is not None else soup_not_registered
    soup_target = container.find_all('br')

    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(f'./jsons/{cnpj}.json', 'w', encoding='utf-8') as f:
        f.write("{\n")
        for br in soup_target:
            next_s = br.next_sibling
            if not (next_s and isinstance(next_s, NavigableString)):
                continue
            next2_s = next_s.next_sibling

            # Keep only text that sits between two consecutive <br> tags.
            if next2_s and isinstance(next2_s, Tag) and next2_s.name == 'br':
                if str(next_s).strip():
                    # Drop zero-width non-joiners; other whitespace is kept as is.
                    line = next_s.replace(u'\u200c', u'')
                    f.write(f"{line}\n")

        f.write("}\n")

In [None]:
# Subset of proxies and list_cnpjs used for this run
proxies = list_proxies[:150]
cnpjs = list_cnpjs[150:200]

step = 20   # number of CNPJs (and proxies) used per batch
st = 0      # start index of the current proxy window
count = 0   # number of batches processed so far

# Keep going while a full batch of pending CNPJs remains; check_response
# removes each successfully fetched CNPJ from `cnpjs`.
while len(cnpjs) >= step:

    # Refresh the proxy list every third batch.
    # NOTE(review): count starts at 0, so this also runs on the first pass and
    # replaces the subset selected above - confirm that is intended.
    if count % 3 == 0:
        proxies = list(fetch_proxies())

    start_time = time()
    chosen_cnpjs = sample(cnpjs, step)

    chosen_proxies = select_proxies(st, step, proxies)
    # Wrap on the list actually being sliced: `proxies` is replaced by the
    # refresh above and may differ in length from `list_proxies`, which would
    # let `st` drift outside the list in use. max(..., 1) avoids a modulo by
    # zero if a refresh returns no proxies.
    st = (st + step) % max(len(proxies), 1)

    reqs = generate_requests(chosen_cnpjs, chosen_proxies)
    resp = grequests.map(reqs)

    check_response(resp, chosen_cnpjs, cnpjs)

    count += 1
    # Remaining CNPJs and seconds spent on this batch
    print(len(cnpjs), time()-start_time)

    sleep(3)