In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
import os
import os.path
import json
import sys
from copy import deepcopy
import threading
import queue

In [None]:
from selenium.common.exceptions import NoSuchElementException        

def checking_css_selector(driver, css_selector):
    """Report whether the driver's current page has an element matching a CSS selector.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        css_selector: CSS selector string to look up.

    Returns:
        True if the lookup succeeds, False if it raises NoSuchElementException.

    Note:
        When the driver has an implicit wait configured (the sessions created
        in this notebook use 30 seconds), a missing element is only reported
        after that wait has elapsed.
    """
    try:
        # The find_element_by_* helpers were removed in Selenium 4; the
        # By-based locator form is accepted by both Selenium 3 and 4.
        driver.find_element(By.CSS_SELECTOR, css_selector)
    except NoSuchElementException:
        return False
    return True

In [None]:
import csv
import os.path
import errno

# Creates file path and file if needed
def creating_files(file_path, csv_file, csv_columns):
    """Make sure a CSV file with a header row exists under ``file_path``.

    The directory is created when it is missing. The file is created with a
    header row only when it does not exist yet; an existing file is left
    untouched and "File already exists" is printed.

    Args:
        file_path: Directory that should contain the file. An empty string
            means the current working directory.
        csv_file: File name of the CSV inside ``file_path``.
        csv_columns: Column names written as the header row.

    Returns:
        None.

    Raises:
        OSError: If the directory cannot be created.
    """
    # An empty directory name refers to the current directory: nothing to create.
    # exist_ok=True covers the race where another thread creates the
    # directory between the existence check and makedirs.
    if file_path and not os.path.exists(file_path):
        os.makedirs(file_path, exist_ok=True)

    target = os.path.join(file_path, csv_file)
    if os.path.isfile(target):
        print("File already exists")
        return

    try:
        # newline='' lets the csv module control line endings itself.
        with open(target, 'w', encoding='utf-8', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=csv_columns).writeheader()
    except IOError as exc:
        # Best effort, as before: report the failure (now with its cause)
        # instead of raising.
        print("I/O error:", exc)

In [None]:
def write_data_row(file, csv_columns, data_list):
    """Append one CSV row per doctor record to ``file``.

    Args:
        file: Path of the CSV file, opened in append mode.
        csv_columns: Column names; the first six are paired, in order, with
            the record keys name, specialties, skills, state, city, phone.
        data_list: Iterable of dicts providing those six keys.

    Raises:
        IOError: Re-raised after being printed when the file cannot be
            opened or written.
    """
    record_keys = ("name", "specialties", "skills", "state", "city", "phone")
    try:
        with open(file, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            # A generator keeps rows being produced and written one at a time.
            writer.writerows(
                {csv_columns[pos]: doctor[key] for pos, key in enumerate(record_keys)}
                for doctor in data_list
            )
    except IOError:
        print("I/O error: ", sys.exc_info())
        raise

In [None]:
def _extract_doctor_record(soup):
    """Build a fresh record dict from one parsed doctor profile page.

    Every call starts from empty-string defaults, so a field that is missing
    on this page can never inherit the value scraped for a previous doctor.
    """
    record = {'name': '', 'specialties': '', 'skills': '', 'state': '', 'city': '', 'phone': ''}

    # Name: text of the first <span> and of the itemprop="name" span, joined
    # with a space unless both texts are identical.
    header = soup.find('div', class_="unified-doctor-header-info__name")
    if header:
        name_parts = []
        for span in (header.find('span'), header.find('span', itemprop="name")):
            if span is not None and span.getText() not in name_parts:
                name_parts.append(span.getText())
        record['name'] = ' '.join(name_parts)

    # Specialties: link titles inside the heading, falling back to the
    # heading's first <span> text when it contains no links.
    h2 = soup.find('h2', class_='h4 text-muted text-base-weight offset-bottom-0')
    if h2:
        specialties = [link['title'] for link in h2.findAll('a', class_="text-muted")]
        if not specialties:
            fallback = h2.find('span')
            if fallback is not None:
                specialties.append(fallback.getText())
        if specialties:
            record['specialties'] = specialties

    # Cities and states, de-duplicated in page order. The state is the last
    # two characters of the span's "content" attribute.
    cities = []
    states = []
    for h5 in soup.findAll('h5', class_="offset-0"):
        for city in h5.findAll('span', class_="city"):
            if city['content'] not in cities:
                cities.append(city['content'])
        for state in h5.findAll('span', class_='province region'):
            if state['content'][-2:] not in states:
                states.append(state['content'][-2:])
    if cities:
        record['city'] = cities
    if states:
        record['state'] = states

    # Skills: stripped text of every matching paragraph.
    skills = [p.getText().strip()
              for p in soup.findAll('p', class_="offset-bottom-0 offset-right-1")]
    if skills:
        record['skills'] = skills

    # Phones, de-duplicated. The first four characters of the href are
    # dropped (presumably a "tel:" prefix -- confirm against the site).
    phones = []
    for link in soup.findAll('a', class_="text-muted padding-left-2"):
        if link['href'][4:] not in phones:
            phones.append(link['href'][4:])
    if phones:
        record['phone'] = phones

    return record


def crawling_data(driver, path, csv_columns):
    """Scrape every doctor on the current listing page and on all following pages.

    For each listing page the doctor profile links are visited, one record is
    extracted per profile, and the page's records are appended to the CSV file
    before moving to the next page. Pages are followed through the
    ``li.next`` link of the listing page until there is none.

    NOTE(review): a later cell of this notebook defines another
    ``crawling_data`` with a different signature; whichever cell ran last is
    the one callers get.

    Args:
        driver: Selenium WebDriver already positioned on a listing page.
        path: CSV file path passed to ``write_data_row``.
        csv_columns: Column names passed to ``write_data_row``.

    Returns:
        True when all pages were processed.

    Raises:
        Exception: Any error is printed and re-raised.
    """
    try:
        # Iterating instead of recursing keeps the call stack flat however
        # many pages there are, and avoids probing the live page for ".next"
        # (which waits out the implicit wait when the element is absent).
        while True:
            listing_soup = BeautifulSoup(driver.page_source)

            doctor_urls = [link.get('href')
                           for link in listing_soup.findAll('a', class_="rank-element-name__link")
                           if link.get('href')]

            data_list = []
            for doctor_url in doctor_urls:
                driver.get(doctor_url)
                data_list.append(_extract_doctor_record(BeautifulSoup(driver.page_source)))

            # Persist page by page so earlier pages survive a later failure.
            write_data_row(path, csv_columns, data_list)

            next_item = listing_soup.find('li', class_="next")
            next_link = next_item.find('a') if next_item else None
            if next_link is None or not next_link.get('href'):
                break
            driver.get(next_link['href'])

    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        raise

    return True


In [None]:
# crawling function
def crawling_specialties(url, path, csv_columns):
    """Crawl one specialty listing in its own Firefox session.

    Args:
        url: Listing URL to open.
        path: CSV file path handed to ``crawling_data``.
        csv_columns: Column names handed to ``crawling_data``.

    Returns:
        True when the crawl finished without raising.
    """
    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(30)
        driver.get(url)
        crawling_data(driver, path, csv_columns)
    finally:
        # quit() ends the whole browser session even when the crawl raised,
        # so a failed run does not leave a Firefox instance behind.
        driver.quit()
    return True

In [None]:
#  threaded function for queue processing.
def crawl(q, csv_columns):
    """Worker loop: take (csv path, url) jobs from ``q`` until it is empty.

    Each job gets its own Firefox session. A job that fails is reported with
    a printed message and still marked as done, so ``q.join()`` cannot hang
    on it.

    Args:
        q: ``queue.Queue`` of ``(csv_file_path, listing_url)`` tuples.
        csv_columns: Column names handed to ``crawling_data``.

    Returns:
        True once the queue has been drained.
    """
    while True:
        # empty() followed by a blocking get() races with the other workers:
        # the last job can be taken in between and this thread would block
        # forever. get_nowait() makes check-and-fetch a single step.
        try:
            tp = q.get_nowait()
        except queue.Empty:
            break

        driver = None
        try:
            driver = webdriver.Firefox()
            driver.implicitly_wait(30)
            driver.get(tp[1])  #  url
            crawling_data(driver, tp[0], csv_columns)  #  tp[0] is the csv file path
        except Exception:
            print('Error with URL check!: '+ str(tp[1]))
        finally:
            try:
                # Always release the browser, including after a failed crawl.
                if driver is not None:
                    driver.quit()
            finally:
                #  signal to the queue that the task has been processed
                q.task_done()
    return True

In [None]:
#  site root and the path of the page that lists the medical specialties
url = "https://www.doctoralia.com.br"
main_page = "/especializacoes-medicas"

#  specialty hrefs, filled in by the scraping cell further down
medical_specialties = []

#  csv output: target directory and the header row of every file
csv_path = "csv_files"
csv_columns = [
    'Nome', 'Especialidade(s)', 'Competência(s)',
    'Estado', 'Cidade', 'Telefone',
]

In [None]:
#  open a short-lived Firefox session just to read the specialties index page
driver = webdriver.Firefox()
try:
    driver.implicitly_wait(30)
    driver.get(url+main_page)
    soup_level1 = BeautifulSoup(driver.page_source)
finally:
    #  release the browser even if loading the page failed
    driver.quit()
    del driver

#  getting all medical specialties: keep hrefs containing exactly one '/'.
#  The list is rebuilt in place (not appended to) so re-running this cell does
#  not duplicate entries; anchors without an href and repeated hrefs are skipped.
specialty_hrefs = (link.get('href') for link in soup_level1.findAll('a', class_="text-muted"))
medical_specialties[:] = list(dict.fromkeys(
    href for href in specialty_hrefs if href and href.count('/') == 1
))

In [None]:
#  make sure a csv file (named after the href without its first character)
#  exists for every specialty
for specialty_href in medical_specialties:
    csv_name = "{}.csv".format(specialty_href[1:])
    creating_files(csv_path, csv_name, csv_columns)

In [None]:
#  worker pool size: at most 5 threads, fewer when there are fewer specialties
num_threads = min(5, len(medical_specialties))
q = queue.Queue(maxsize=0)

#  one job per specialty: (csv file path, full listing url)
for specialty_href in medical_specialties:
    job = (os.path.join(csv_path, specialty_href[1:] + ".csv"), url + specialty_href)
    q.put(job)

#for i in range(num_threads):
#    tp = q.get(i)
#    print(tp[0])
#    print(tp[1])
#    print(type(q.get(i)))

In [None]:
#  start the worker threads; each one pulls (csv path, url) jobs from the queue
for thread_idx in range(num_threads):
    print('Starting thread ', thread_idx)
    worker = threading.Thread(target=crawl, args=(q, csv_columns))
    worker.start()

#  block until every queued job has been marked as done
q.join()

In [None]:
#  threads started by this cell; defined here so the cell does not depend on a
#  name left over from an earlier kernel session
list_thread = []

#  first specialty
x1 = threading.Thread(target=crawling_specialties, args=(url+medical_specialties[0],
                                                         os.path.join(csv_path, medical_specialties[0][1:] + ".csv"),
                                                         csv_columns))
list_thread.append(x1)
x1.start()

#  second specialty
x2 = threading.Thread(target=crawling_specialties, args=(url+medical_specialties[1],
                                                         os.path.join(csv_path, medical_specialties[1][1:] + ".csv"),
                                                         csv_columns))
list_thread.append(x2)
x2.start()

#  wait until threads finish their job
for th in list_thread:
    th.join()

In [None]:
#  threads started by this cell; defined here so the cell does not depend on a
#  name left over from an earlier kernel session
list_thread = []

#  one thread per specialty; every thread runs crawling_specialties, which
#  opens its own Firefox session, so all sessions are started at once
for idx, item in enumerate(medical_specialties):
    x = threading.Thread(target=crawling_specialties,
                         args=(url + item,
                               os.path.join(csv_path, item[1:] + ".csv"),
                               csv_columns),
                         #  thread name for debugging; passed to the constructor
                         #  because Thread.setName() is deprecated
                         name='Thread name: ' + item + '. Thread number: ' + str(idx))
    list_thread.append(x)
    x.start()

#  wait until threads finish their job
for th in list_thread:
    th.join()

# TESTs

In [None]:
#  Scratch cell: prints the fields found on whatever page `driver` currently shows.
#  NOTE(review): the cell that collects the specialties ends with `del driver`,
#  so on a top-to-bottom run `driver` is undefined here; a live session has to
#  be created by hand first.
soup_level3 = BeautifulSoup(driver.page_source)

#  name: first <span> text, followed by the itemprop="name" span text when the two differ
if soup_level3.find(class_="unified-doctor-header-info__name"):
        if (soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText() == 
            soup_level3.find('div', class_="unified-doctor-header-info__name").find('span', itemprop="name").getText()):
            print(soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText())
        else:
            print(soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText() + ' ' + soup_level3.find('div', class_="unified-doctor-header-info__name").find('span', itemprop="name").getText())
            

print('---')

#  specialties: titles of the links inside the heading
#  NOTE(review): `h2` is used without a None check, so the loop raises when the heading is absent
h2 = soup_level3.find('h2', class_='h4 text-muted text-base-weight offset-bottom-0')
print(h2)
for specialties in h2.findAll('a', class_= "text-muted"):
        print(specialties['title'])

print('---')
#  these two lists are never filled in this cell; they are deleted again below
temp_city =[]
temp_state =[]

#div = soup_level3.find('div', class_= 'panel-body')
#div = soup_level3.find(lambda tag: tag.name == 'div' and tag['class'] == ['panel-body'])
#  `div` is assigned but not read anywhere else in this cell
div = soup_level3.find(lambda tag: tag.name == 'div' and tag.get('class') == ['panel-body'])
#  city: "content" attribute; state: last two characters of the "content" attribute
for h5 in soup_level3.findAll('h5', class_="offset-0"):
    for city in h5.findAll('span', class_="city"):
        print(city['content'])
    for state in h5.findAll('span', class_='province region'):
        print(state['content'][-2:])
#for link in soup_level3.findAll('span', class_="province region"):
#    if link['content'][:-3] not in temp_city:
#        temp_city.append(link['content'][:-3])
#    if link['content'][-2:] not in temp_state:
#        temp_state.append(link['content'][-2:])
#print(temp_city, temp_state)
print('---')
del temp_city
del temp_state
#  skills: stripped text of every matching paragraph
temp_skills = []
for link in soup_level3.findAll('p', class_="offset-bottom-0 offset-right-1"):
    temp_skills.append(link.getText().strip())
print(temp_skills)
print('---')
del temp_skills
#  phones: href without its first four characters, de-duplicated
temp_phone = []
for phone in soup_level3.findAll('a', class_= "text-muted padding-left-2"):
    if phone['href'][4:] not in temp_phone:
        temp_phone.append(phone['href'][4:])
print(temp_phone)
del temp_phone

#results = r.findall(soup_level3.find('div', class_= "modal fade").getText().strip())
#for x in results:
#        print(x)

        #print(soup_level3.find('div', class_= "modal fade").getText().strip())
#print(soup_level3.findAll('button', class_="btn btn-sm btn-default"))
#print(soup_level3.findAll('a', class_= "text-muted padding-left-2"))

In [None]:
#  Scratch cell: extracts one record from the page `driver` currently shows and prints it.
#  NOTE(review): the cell that collects the specialties ends with `del driver`,
#  so on a top-to-bottom run `driver` is undefined here; a live session has to
#  be created by hand first.
data_list = []
#  every field defaults to an empty string and is replaced by a list when found
data_dict = {'name': '', 'specialties': '', 'skills': '', 'state': '', 'city': '', 'phone': ''}
    
try:
    soup_level3 = BeautifulSoup(driver.page_source)
    #  crawling doctor name: first <span> text, plus the itemprop="name" span text when they differ
    if soup_level3.find(class_="unified-doctor-header-info__name"):
        if (soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText() == 
            soup_level3.find('div', class_="unified-doctor-header-info__name").find('span', itemprop="name").getText()):
            data_dict['name'] = soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText()
        else:
            data_dict['name'] = soup_level3.find('div', class_="unified-doctor-header-info__name").find('span').getText() + ' ' + soup_level3.find('div', class_="unified-doctor-header-info__name").find('span', itemprop="name").getText()
            
    #  crawling doctor specialties: link titles, or the heading's first <span> text when there are no links
    temp_specialties = []
    if soup_level3.find('h2', class_="h4 text-muted text-base-weight offset-bottom-0"):
        h2 = soup_level3.find('h2', class_='h4 text-muted text-base-weight offset-bottom-0')
        for specialties  in h2.findAll('a', class_= "text-muted"):
            temp_specialties.append(specialties['title'])
        if len(temp_specialties) != 0:
            data_dict['specialties'] = temp_specialties
        else:
            temp_specialties.append(h2.find('span').getText())
            data_dict['specialties'] = temp_specialties
    del temp_specialties

    #  crawling city and state and removing duplicates
    #  (state is the last two characters of the "content" attribute)
    temp_city =[]
    temp_state =[]
    if soup_level3.findAll('h5', class_="offset-0"):
        for h5 in soup_level3.findAll('h5', class_="offset-0"):
            for city in h5.findAll('span', class_="city"):
                if city['content'] not in temp_city:
                    temp_city.append(city['content'])
            for state in h5.findAll('span', class_='province region'):
                if state['content'][-2:] not in temp_state:
                    temp_state.append(state['content'][-2:])
        #  assigned even when the lists stayed empty
        data_dict['city'] = temp_city
        data_dict['state'] = temp_state
    del temp_city
    del temp_state
   
    #  crawling skills: stripped text of every matching paragraph
    temp_skills = []
    if soup_level3.findAll('p', class_="offset-bottom-0 offset-right-1"):
        for link in soup_level3.findAll('p', class_="offset-bottom-0 offset-right-1"):
            temp_skills.append(link.getText().strip())
        data_dict['skills'] = temp_skills
    del temp_skills
            
    #  crawling phones and removing duplicates (href without its first four characters)
    temp_phone = []
    if soup_level3.findAll('a', class_= "text-muted padding-left-2"):
        for phone in soup_level3.findAll('a', class_= "text-muted padding-left-2"):
            if phone['href'][4:] not in temp_phone:
                temp_phone.append(phone['href'][4:])
        data_dict['phone'] = temp_phone
    del temp_phone
            
    #  store a copy so the record no longer shares its lists with data_dict
    data_list.append(deepcopy(data_dict))
            
    #  cleaning data
    del data_dict  
except:
    #  report the exception type, then let it propagate
    print("Unexpected error:", sys.exc_info()[0])
    raise

for data in data_list:
    print(data)

In [None]:
def _extract_doctor_record(soup):
    """Build a fresh record dict from one parsed doctor profile page.

    Every call starts from empty-string defaults, so a field that is missing
    on this page can never inherit the value scraped for a previous doctor.
    Defined next to its caller so this cell is self-contained.
    """
    record = {'name': '', 'specialties': '', 'skills': '', 'state': '', 'city': '', 'phone': ''}

    # Name: text of the first <span> and of the itemprop="name" span, joined
    # with a space unless both texts are identical.
    header = soup.find('div', class_="unified-doctor-header-info__name")
    if header:
        name_parts = []
        for span in (header.find('span'), header.find('span', itemprop="name")):
            if span is not None and span.getText() not in name_parts:
                name_parts.append(span.getText())
        record['name'] = ' '.join(name_parts)

    # Specialties: link titles inside the heading, falling back to the
    # heading's first <span> text when it contains no links.
    h2 = soup.find('h2', class_='h4 text-muted text-base-weight offset-bottom-0')
    if h2:
        specialties = [link['title'] for link in h2.findAll('a', class_="text-muted")]
        if not specialties:
            fallback = h2.find('span')
            if fallback is not None:
                specialties.append(fallback.getText())
        if specialties:
            record['specialties'] = specialties

    # Cities and states, de-duplicated in page order. The state is the last
    # two characters of the span's "content" attribute.
    cities = []
    states = []
    for h5 in soup.findAll('h5', class_="offset-0"):
        for city in h5.findAll('span', class_="city"):
            if city['content'] not in cities:
                cities.append(city['content'])
        for state in h5.findAll('span', class_='province region'):
            if state['content'][-2:] not in states:
                states.append(state['content'][-2:])
    if cities:
        record['city'] = cities
    if states:
        record['state'] = states

    # Skills: stripped text of every matching paragraph.
    skills = [p.getText().strip()
              for p in soup.findAll('p', class_="offset-bottom-0 offset-right-1")]
    if skills:
        record['skills'] = skills

    # Phones, de-duplicated. The first four characters of the href are
    # dropped (presumably a "tel:" prefix -- confirm against the site).
    phones = []
    for link in soup.findAll('a', class_="text-muted padding-left-2"):
        if link['href'][4:] not in phones:
            phones.append(link['href'][4:])
    if phones:
        record['phone'] = phones

    return record


def crawling_data(driver, data_list = None):
    """Collect a record for every doctor on the current listing page and all following pages.

    NOTE(review): an earlier cell of this notebook defines another
    ``crawling_data`` that takes ``(driver, path, csv_columns)``; whichever
    cell ran last is the one callers get.

    Args:
        driver: Selenium WebDriver already positioned on a listing page.
        data_list: Optional list to extend in place; a new list is created
            when omitted.

    Returns:
        The list holding one record dict per visited doctor profile.

    Raises:
        Exception: Any error is printed and re-raised.
    """
    #  creating data_list if it doesn't exist
    if data_list is None:
        data_list = []

    try:
        # Iterating instead of recursing keeps the call stack flat however
        # many pages there are, and avoids probing the live page for ".next"
        # (which waits out the implicit wait when the element is absent).
        while True:
            listing_soup = BeautifulSoup(driver.page_source)

            doctor_urls = [link.get('href')
                           for link in listing_soup.findAll('a', class_="rank-element-name__link")
                           if link.get('href')]

            for doctor_url in doctor_urls:
                driver.get(doctor_url)
                data_list.append(_extract_doctor_record(BeautifulSoup(driver.page_source)))

            #  running total after each listing page
            print("The length of list is: ", len(data_list))

            next_item = listing_soup.find('li', class_="next")
            next_link = next_item.find('a') if next_item else None
            if next_link is None or not next_link.get('href'):
                break
            driver.get(next_link['href'])

    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        raise

    return data_list

In [None]:
# threading function
def crawling_specialties(url, path, csv_columns):
    """Crawl one specialty listing in its own Firefox session and write the result.

    Args:
        url: Listing URL to open.
        path: CSV file path handed to ``write_data_row``.
        csv_columns: Column names handed to ``write_data_row``.

    Returns:
        True when the crawl and the write finished without raising.
    """
    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(30)
        driver.get(url)
        data_list = crawling_data(driver)
    finally:
        # quit() ends the whole browser session even when the crawl raised,
        # so a failed run does not leave a Firefox instance behind.
        driver.quit()
    write_data_row(path, csv_columns, data_list)
    return True