In [1]:
# Dependencies for Selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Module used to connect Python to MongoDB
import pymongo

In [3]:
# pandas is used to load the list of species to scrape into a DataFrame
import pandas as pd

In [4]:
# time is used to pause between two consecutive page requests
import time

In [5]:
# Load the species list; the "IUCN_WebID" column holds the path suffix that is
# appended to the www.iucnredlist.org species url for each animal
animal_df = pd.read_csv(
    './data/rawdata/animal_species_for_scraping.csv',
    encoding='UTF-8',
)

# Show the first rows of "animal_df" as a quick sanity check
animal_df.head()

Unnamed: 0,Common_Name,Other_Name,IUCN_WebID
0,African Elephant,,12392/3339343
1,African Wild Dog,,12436/16711116
2,Albacore Tuna,,21856/9325450
3,Amazon River Dolphin,,10831/50358152
4,Amur Leopard,,15954/102421779


In [6]:
# Open a client for the MongoDB server on localhost, default port 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [7]:
# Handles for the "EndangeredAnimalDB" database and its "habitat_cntry" collection,
# which receives one document per scraped animal
db = client['EndangeredAnimalDB']
col = db['habitat_cntry']

In [8]:
# Lookup table for the native "extant (resident)" country list on www.iucnredlist.org.
# Each key "pos_<n>" names the index <n> of the last "div" in the xpath that holds the
# country paragraph; the scraping loop below builds the xpath from that number.
# Each value lists the "animal_df" index labels of the animals whose page uses that position.
# Note: rows 18, 45 and 47 also appear in "extant" below, so those animals are visited
# by both scraping loops and end up with both country fields.
extant_resident = {'pos_1': [0, 1, 2, 3, 4, 5, 8, 9, 10, 12, 14, 15, 16, 17, 19, 22, 23, 24, 25, 26, 28, 33, 34, 36, 40, 41, 43, 44, 49, 51, 52, 53, 55, 56, 58, 60, 63, 64, 65, 67, 68],
                   'pos_2': [18, 20, 50, 61],
                   'pos_4': [45, 47]
                  }

# Same kind of lookup table for the native "extant" country list:
# key "pos_<n>" -> last "div" index in the xpath, value -> "animal_df" index labels
extant = {'pos_1': [6, 7, 11, 13, 21, 29, 30, 31, 32, 35, 37, 39, 42, 45, 46, 48, 54, 57, 62],
          'pos_2': [27, 38, 47],
          'pos_3': [59, 66],
          'pos_5': [18]
         }

In [9]:
# Scrape the native "extant (resident)" countries of every animal listed in
# "extant_resident" and store them in the "habitat_cntry" collection.
# The dict is iterated directly so the positions can never drift out of sync with its keys.
for pos_key, animal_indices in extant_resident.items():

    # Numeric part of the key, e.g. 'pos_4' -> '4'; it is the index of the last "div" in the xpath
    div_pos = pos_key.split('_', 1)[1]
    countries_xpath = "//*[@id='geographic-range']/div[1]/div[1]/div/div[" + div_pos + "]/p"

    for i in animal_indices:

        # Time interval between two adjacent queries
        time.sleep(1.5)

        common_name = animal_df.loc[i, 'Common_Name']
        other_name = animal_df.loc[i, 'Other_Name']

        # Complete url = basic url + animal web ID
        url = 'https://www.iucnredlist.org/species/' + str(animal_df.loc[i, 'IUCN_WebID'])

        # Use selenium in Chrome, one fresh browser per animal
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            # Explicit wait: give id="geographic-range" up to 10 sec to appear.
            # Only a real timeout is reported as one; any other error (e.g. a MongoDB
            # failure further down) now surfaces with its own type and traceback.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "geographic-range"))
                )
            except TimeoutException as exc:
                raise Exception('Timed out. Cannot find it..') from exc

            # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
            # Selenium releases no longer provide; the old call would fail and be
            # silently turned into an empty result by a bare "except"
            matches = driver.find_elements(By.XPATH, countries_xpath)

            if matches:
                # Countries are separated by ';'; strip() drops the surrounding blanks
                # without ever cutting into a country name
                cntries_r = [name.strip() for name in matches[0].text.split(';')]
            else:
                # If no extant countries are found, store '' in a list
                cntries_r = ['']

            fields = {
                'Common_Name': common_name,
                'Native_Extant_Resident_Cntry': cntries_r
            }
            # Create "Other_Name" tag only for animals that have another name
            if pd.notna(other_name):
                fields['Other_Name'] = other_name

            # Upsert instead of insert so that re-running this cell updates the
            # animal's document rather than adding a duplicate
            col.update_one(
                {'Common_Name': common_name},
                {'$set': fields},
                upsert=True)

        finally:
            # Always end the browser session, even when scraping failed;
            # quit() also shuts down the chromedriver process
            driver.quit()

In [10]:
# Scrape the native "extant" countries of every animal listed in "extant" and
# store them in the "habitat_cntry" collection.
# The dict is iterated directly so the positions can never drift out of sync with its keys.
for pos_key, animal_indices in extant.items():

    # Numeric part of the key, e.g. 'pos_5' -> '5'; it is the index of the last "div" in the xpath
    div_pos = pos_key.split('_', 1)[1]
    countries_xpath = "//*[@id='geographic-range']/div[1]/div[1]/div/div[" + div_pos + "]/p"

    for i in animal_indices:

        # Time interval between two adjacent queries
        time.sleep(1.5)

        common_name = animal_df.loc[i, 'Common_Name']
        other_name = animal_df.loc[i, 'Other_Name']

        # Complete url = basic url + animal web ID
        url = 'https://www.iucnredlist.org/species/' + str(animal_df.loc[i, 'IUCN_WebID'])

        # Use selenium in Chrome, one fresh browser per animal
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            # Explicit wait: give id="geographic-range" up to 10 sec to appear.
            # Only a real timeout is reported as one; any other error (e.g. a MongoDB
            # failure further down) now surfaces with its own type and traceback.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "geographic-range"))
                )
            except TimeoutException as exc:
                raise Exception('Timed out. Cannot find it..') from exc

            # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
            # Selenium releases no longer provide; the old call would fail and be
            # silently turned into an empty result by a bare "except"
            matches = driver.find_elements(By.XPATH, countries_xpath)

            if matches:
                # Countries are separated by ';'; strip() drops the surrounding blanks
                # without ever cutting into a country name
                cntries = [name.strip() for name in matches[0].text.split(';')]
            else:
                # If no extant countries are found, store '' in a list
                cntries = ['']

            fields = {
                'Common_Name': common_name,
                'Native_Extant_Cntry': cntries
            }
            # Create "Other_Name" tag only for animals that have another name
            if pd.notna(other_name):
                fields['Other_Name'] = other_name

            # Save document into MongoDB, update if document already exists;
            # all fields are written in a single round-trip
            col.update_one(
                {'Common_Name': common_name},
                {'$set': fields},
                upsert=True)

        finally:
            # Always end the browser session, even when scraping failed;
            # quit() also shuts down the chromedriver process
            driver.quit()