In [1]:
# Dependencies for Selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Module used to connect Python to MongoDB
import pymongo

In [3]:
# Dependency used to load and handle tabular data
import pandas as pd

In [4]:
# Dependency used to pause between web requests
import time

In [5]:
# Load the CSV that pairs each animal's common name with the web ID used in
# its www.iucnredlist.org url, and keep it as a Pandas DataFrame
animal_df = pd.read_csv(
    filepath_or_buffer='../../data/animal_iucn_webid.csv',
    encoding='UTF-8',
)

# Show the first five rows of "animal_df" as a quick sanity check
animal_df.head(5)

Unnamed: 0,Common_Name,IUCN_WebID
0,African Elephant,12392/3339343
1,African Wild Dog,12436/16711116
2,Albacore Tuna,21856/9325450
3,Amazon River Dolphin,10831/50358152
4,Amur Leopard,15954/102421779


In [6]:
# Create a MongoDB client pointed at the local server (default port 27017)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [7]:
# Select the "EndangeredAnimalDB" database and, inside it, the "animal_facts"
# collection that the scraped documents are written to
db = client['EndangeredAnimalDB']
coll = db['animal_facts']

In [8]:
# Common url prefix; an animal's web ID is appended to it to form the
# complete address of that animal's page
basic_url = "https://www.iucnredlist.org/species/"

In [None]:
# Scrape the "Extant ..." country lists from each animal's IUCN Red List page
# and save one document per animal into the MongoDB collection "coll".
#
# Reads:  animal_df ("Common_Name", "IUCN_WebID" columns), basic_url, coll
# Writes: one document per animal of the form
#         {'Common_Name': <name>, '<h4 heading starting with "Extant">': [<country>, ...]}
# Raises: Exception('Timed out. Cannot find it..') when the "geographic-range"
#         section does not show up within 10 seconds; any other error
#         (WebDriver, MongoDB, ...) propagates unchanged so its cause stays visible.

# Use selenium in Chrome. One browser session is shared by all pages instead of
# launching a new Chrome process for every animal.
driver = webdriver.Chrome(executable_path=r"../chromedriver.exe")

try:
    # Loop through animals in "animal_df" (name and web ID are paired row by row)
    for common_name, web_id in zip(animal_df['Common_Name'], animal_df['IUCN_WebID']):

        # Time interval between two adjacent queries
        time.sleep(1.5)

        # Generating complete url by concatenating basic url with animal web ID
        url = basic_url + web_id

        # Get url of the iterated webpage
        driver.get(url)

        try:
            # Explicit wait: poll for up to 10 sec until id="geographic-range"
            # is present in the page
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "geographic-range"))
            )
        except TimeoutException as err:
            # Same message as before, but only for a real timeout, and chained
            # so the original traceback is preserved
            raise Exception('Timed out. Cannot find it..') from err

        # Pass iterated animal name to "dict_im"
        dict_im = {'Common_Name': common_name}

        # Walk the 'h4' / 'p' pairs under the "geographic-range" section as located by xpath
        for j in range(1, 3):
            for k in range(2, 7):

                cell_xpath = f"//*[@id='geographic-range']/div[1]/div[{j}]/div/div[{k}]"

                # find_elements returns an empty list (no exception) when nothing matches
                h4_query = driver.find_elements(By.XPATH, cell_xpath + "/h4")
                if not h4_query:
                    continue

                # Look for 'h4' element text starting with "Extant"
                heading = h4_query[0].text
                if heading.split(" ")[0] != "Extant":
                    continue

                # Only fetch the 'p' element once the heading is known to be relevant
                p_query = driver.find_elements(By.XPATH, cell_xpath + "/p")
                if not p_query:
                    continue

                # Convert the query result to a country-containing list; strip()
                # removes the separator space without ever cutting into a name
                cntries = [cntry.strip() for cntry in p_query[0].text.split(";")]

                # Add result in "dict_im"
                dict_im[heading] = cntries

        # Save document into MongoDB
        coll.insert_one(dict_im)

finally:
    # Always end the browser session, even when scraping fails part-way;
    # quit() shuts down the driver process, close() would only close the window
    driver.quit()