In [1]:
# Dependencies for Selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Dependencies -cont'd
import pandas as pd
import numpy as np

In [3]:
# Dependency -cont'd
import time

In [4]:
# Load the scraping input into a Pandas DataFrame. Each row describes one animal;
# its "IUCN_WebID" column is the url suffix of that animal's page on www.iucnredlist.org
animal_df = pd.read_csv(
    filepath_or_buffer='./data/rawdata/animal_species_for_scraping.csv',
    encoding='UTF-8',
)

# Show the first rows of "animal_df" as a sanity check
animal_df.head()

Unnamed: 0,Common_Name,Other_Name,IUCN_WebID
0,African Elephant,,12392/3339343
1,African Wild Dog,,12436/16711116
2,Albacore Tuna,,21856/9325450
3,Amazon River Dolphin,,10831/50358152
4,Amur Leopard,,15954/102421779


In [5]:
# Scrape the extant (habitat) countries of every animal from its IUCN Red List page.
# Result: "hab_cntry" holds one list of country strings per row of "animal_df",
# in row order; [''] is stored when a page lists no extant countries.
# Raises: Exception('Timed out. Cannot find it..') when the geographic-range section
# of a page does not appear within 10s; any other error propagates unchanged.

# list to store animal habitat country (cntry) after scraping
hab_cntry = []

# Basic url of a species page; the animal web ID is appended to it
SPECIES_URL = 'https://www.iucnredlist.org/species/'
# Location of the paragraph holding the ';'-separated extant countries
EXTANT_CNTRY_XPATH = "//*[@id='geographic-range']/div[1]/div[1]/div/div[1]/p"

# Loop through animal web ID
for web_id in animal_df['IUCN_WebID']:

    # Time interval between two adjacent queries
    time.sleep(1.5)

    # Generate the complete url by concatenating basic url with animal web ID
    url = SPECIES_URL + str(web_id)

    # Use selenium in Chrome (a fresh browser session for each animal)
    driver = webdriver.Chrome()

    try:
        # Load the webpage of the iterated animal
        driver.get(url)

        try:
            # Explicit wait: poll for up to 10s until id="geographic-range" is
            # present in the DOM
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "geographic-range"))
            )
        except TimeoutException as exc:
            # Only a real timeout is reported as one; the original cause stays chained
            raise Exception('Timed out. Cannot find it..') from exc

        # Search for extant countries of iterated animal with "xpath".
        # find_elements(By.XPATH, ...) is used because the find_elements_by_* helpers
        # were removed from recent Selenium releases.
        query = driver.find_elements(By.XPATH, EXTANT_CNTRY_XPATH)

        if query:
            # Convert the query result to country-containing list
            # NOTE(review): entries are not stripped, so they may carry leading
            # spaces if the page separates countries with '; ' - confirm
            cntries = query[0].text.split(';')
        else:
            # If no extant countries is found, assign "cntries" as '' in list
            cntries = ['']

        # Append extant countries of iterated animal to "hab_cntry" as an entire list
        hab_cntry.append(cntries)

    finally:
        # Always end the browser session, even when scraping failed, so that no
        # Chrome / chromedriver process is left behind
        driver.quit()

# Check the length of "hab_cntry" after all scrapings are done
len(hab_cntry)

69

In [6]:
# Add animal habitat country to "animal_df" as column "Habitat_Cntry" and
# delete the column "IUCN_WebID", which was only needed for building the urls.
# A new frame is built instead of mutating in place, and errors='ignore' skips the
# drop when "IUCN_WebID" is already gone, so re-running this cell no longer raises
# KeyError. "hab_cntry" must have one entry per row, otherwise assign raises ValueError.
animal_df = (
    animal_df
    .assign(Habitat_Cntry=hab_cntry)
    .drop(columns=['IUCN_WebID'], errors='ignore')
)

# Preview "animal_df"
animal_df.head()

Unnamed: 0,Common_Name,Other_Name,Habitat_Cntry
0,African Elephant,,"[Angola, Benin, Botswana, Burkina Faso, Ca..."
1,African Wild Dog,,"[Angola, Benin, Botswana, Burkina Faso, Ce..."
2,Albacore Tuna,,"[Algeria, American Samoa, Angola, Anguilla,..."
3,Amazon River Dolphin,,"[Bolivia, Plurinational States of, Brazil, C..."
4,Amur Leopard,,[South Sudan]


In [7]:
# Export "animal_df" to the clean-data folder: column names are written as the
# header row and the DataFrame index is left out.
# Note: cells holding Python lists are written as their string representation.
animal_df.to_csv(
    path_or_buf="./data/cleandata/animal_species_with_habitat_country.csv",
    header=True,
    index=False,
)