# Web Scraping with BS4 and Selenium

In [1]:
import requests
import bs4
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

## Soup Fucntions for HTML Parsing

The majority of the work in this project went into inspecting the website of the desired information and creating these functions. The following functions are the ones responsible for grabbing the website elements and extracting the details I want for the document. 

In [2]:
# id #
# class .

def common_name():
    common_names_soup = soup.select('.tax')
    common_names = common_names_soup[1].getText()
    return common_names

def scientific_name():
    scientific_name_soup = soup.select('.tax_sn')
    scientific_name = scientific_name_soup[0].getText()
    return scientific_name

def family_name():
    family_name_soup = soup.select('.tax')
    family_name = family_name_soup[2].getText()
    return family_name

def plant_type():
    plant_type_soup = soup.select('div')
    start_index = plant_type_soup[19].text.find('Habit:')
    if start_index == -1:
        plant_type = 'NA'
    else:
        plant_type = plant_type_soup[19].text[start_index+7:]
        plant_type = plant_type.split()
        plant_type = plant_type[0]
    return plant_type

def leaf_retention():
    leaf_retention_soup = soup.select('div')
    start_index = leaf_retention_soup[19].text.find('Retention:')
    if start_index == -1:
        leaf_retention = 'NA'
    else:
        leaf_retention = leaf_retention_soup[19].text[start_index+10:]
        leaf_retention = leaf_retention.split()
        leaf_retention = leaf_retention[0]
    return leaf_retention

def light():
    try:
        light_soup = soup.find_all('div')
        start_index = light_soup[22].text.find('Requirement:')
        end_index = light_soup[22].text.find('Soil Moisture:')
        light = light_soup[22].text[start_index+14:end_index-1]
    except IndexError:
        light = 'NA'
    return light

def water_use():
    try:
        water_use_soup = soup.find_all('div')
        start_index = water_use_soup[22].text.find('Water Use:')
        end_index = water_use_soup[22].text.find('Requirement:')
        if start_index == -1:
            water_use = 'NA'
        else:
            water_use = water_use_soup[22].text[start_index+10:end_index]
            water_use = water_use.strip()
    except IndexError:
        water_use = 'NA'
    return water_use

def size():
    size_class_soup = soup.find_all('div')
    size_class = size_class_soup[19].text[-10:]
    start_index = size_class_soup[19].text.find('Size Class:')
    size_class = size_class_soup[19].text[start_index+13:]
    size_class = size_class.strip()
    return size_class

def color():
    bloom_color_soup = soup.find_all('div')
    start_index = bloom_color_soup[20].text.find('Color')
    end_index = bloom_color_soup[20].text.find('Bloom Time:')
    bloom_color = bloom_color_soup[20].text[start_index+8:end_index-1]
    return bloom_color

def wildlife():
    try:
        wildlife_soup = soup.find_all('div')
        start_index = wildlife_soup[23].text.find('Wildlife:')
        if start_index == -1:
            wildlife = 'NA'
        else:
            wildlife = wildlife_soup[23].text[start_index+11:]
            wildlife = re.split(r"\.", wildlife)[0]
    except IndexError:
        wildlife = 'NA'
    return wildlife

def deer_resist():
    try:
        deer_resist_soup = soup.find_all('div')
        start_index = deer_resist_soup[23].text.find('Deer Resistant:')
        if start_index == -1:
            deer_resist = 'NA'
        else:
            deer_resist = deer_resist_soup[23].text[start_index+16:]
            deer_resist = deer_resist.strip()
    except IndexError:
        deer_resist = 'NA'
    return deer_resist

def maintenance():
    try:
        maintenance_soup = soup.find_all('div')
        start_index = maintenance_soup[24].text.find('Maintenance:')
        if start_index == -1:
            maintenance = 'NA'
        else:
            maintenance = maintenance_soup[24].text[start_index+13:]
            maintenance = maintenance.strip()
    except IndexError:
        maintenance = 'NA'
    return maintenance

def soil_moisture():
    try:
        soil_moisture_soup = soup.find_all('div')
        start_index = soil_moisture_soup[22].text.find('Soil Moisture:')
        if start_index == -1:
            soil_moisture = 'NA'
        else:
            soil_moisture = soil_moisture_soup[22].text[start_index+16:]
            soil_moisture = re.split(r"\:", soil_moisture)
            soil_moisture = soil_moisture[0]
            soil_moisture = soil_moisture.split()[0]
    except IndexError:
        soil_moisture = 'NA'
    return soil_moisture

def drought_tolerant():
    try:
        drought_soup = soup.find_all('div')
        start_index = drought_soup[22].text.find('Drought Tolerance:')
        end_index = drought_soup[22].text.find('Cold Tolerant:')
        if start_index == -1:
            drought = 'NA'
        else:
            drought = drought_soup[22].text[start_index+18:end_index]
            drought = drought.strip()
    except IndexError:
        drought = 'NA'
    return drought

def cold_tolerant():
    try:
        cold_soup = soup.find_all('div')
        start_index = cold_soup[22].text.find('Cold Tolerant:')
        end_index = cold_soup[22].text.find('Heat Tolerant:')
        if start_index == -1:
            cold = 'NA'
        else:
            cold = cold_soup[22].text[start_index+16:]
            cold = re.split(r"\:", cold)[0]
            if cold.startswith('y'):
                cold = 'yes'
            else:
                cold = 'no'
    except IndexError:
        cold = 'NA'
    return cold
    
def hot_tolerant():
    try: 
        hot_soup = soup.find_all('div')
        start_index = hot_soup[22].text.find('Heat Tolerant:')
        end_index = hot_soup[22].text.find('Soil Description:')
        if start_index == -1:
            hot = 'NA'
        else:
            hot = hot_soup[22].text[start_index+15:end_index]
            hot = hot.strip()
    except IndexError:
        hot = 'NA'
    return hot

In [3]:
headers = ['Plant name','Scientific name','Family name','Plant Type','Retention','Light','Water Usage',
           'Size','Color','Wildlife','Deer Resist','Maintenance','Soil Moisture','Drought Resist','Cold Resist','Heat Resist']

In [4]:
def makePlantDict():
    plantDict = {headers[0]:common_name(),headers[1]:scientific_name(),headers[2]:family_name(),
                 headers[3]:plant_type(),headers[4]:leaf_retention(),headers[5]:light(),headers[6]:water_use(),
                 headers[7]:size(),headers[8]:color(),headers[9]:wildlife(),headers[10]:deer_resist(),
                 headers[11]:maintenance(),headers[12]:soil_moisture(),headers[13]:drought_tolerant(),
                 headers[14]:cold_tolerant(),headers[15]:hot_tolerant()}
    return plantDict

# Selenium Web Driver

### Open Browser and URL with Selenium Web Driver

In [5]:
browser = webdriver.Firefox(executable_path=r'/Users/julio0703/Downloads/geckodriver')

In [6]:
browser.get(url='https://www.wildflower.org/collections/collection.php?start=0&collection=TX_central&pagecount=10&pagecount=100')

In [7]:
plantList = []

### Web Scraper. 

The crawler is a while loop with the scraping functions inside. Since most of the regions only have between 100 and 200 plants, the crawler only iterates twice. This is how this particular crawler works...

1. while loop with two iterations of the page with plant profiles.
2. grab the css selector linked to a detail page. Add 1 to the 'tr:nth-child(i+1) to grab the next iteration CSS seletor
3. Find the element of that selector .
4. Click on that element with the webdriver.
5. Grab the url of the detail page and pass it into request.get()
6. make the HTML soup for that particular detail page.
7. Grab the information with makePlantDict() and append the dictionary to the plantList
8. click back out of the detail page and onto the plant list page
9. Go back to step 2.

After 100 iterations, Add 1 to count and loop 1 more time (100 iterations of the scrap series). If error NoSuchElementExcetion, break the while loop, print finished.

In [8]:
count = 0
while count < 2:
    for i in range(100):
        try:
            css_selector = '.search_results > table:nth-child(5) > tbody:nth-child(1) > tr:nth-child(' + str(2+i) + ') > td:nth-child(1) > a:nth-child(1) > i:nth-child(1)'
            element = browser.find_element_by_css_selector(css_selector)
            element.click()
            urlString = browser.current_url
            requestURL = requests.get(urlString)
            soup = bs4.BeautifulSoup(requestURL.text,'lxml')
            plantList.append(makePlantDict())
            browser.back()
        except NoSuchElementException:
            print('finished')
            break
    count += 1
    next_page_element = browser.find_element_by_xpath('/html/body/main/div/div[1]/div[2]/div[1]/div[1]/a[2]')
    next_page_element.click()

finished


In [9]:
browser.quit()

### How does the data look?

In [10]:
plantList[0]

{'Cold Resist': 'NA',
 'Color': 'White ,  Pink',
 'Deer Resist': 'Moderate',
 'Drought Resist': 'NA',
 'Family name': 'Asteraceae (Aster Family)',
 'Heat Resist': 'NA',
 'Light': 'Sun',
 'Maintenance': 'Blooms  best  and  appears  bushier  if  severely  cut  back  each  winter.    Drought  tolerant.    Adapt  to  most  well-drained  soils.    May  be  transplanted  year-round  if  cut  back  by  one  third.',
 'Plant Type': 'Shrub',
 'Plant name': 'Shrubby boneset, White mistflower, White shrub mistflower, Havana snakeroot',
 'Retention': 'Evergreen',
 'Scientific name': 'Ageratina havanensis',
 'Size': '3-6  ft.',
 'Soil Moisture': 'Dry',
 'Water Usage': 'NA',
 'Wildlife': 'Attracts  hummingbirds,  moths  &  butterflies'}

In [11]:
len(plantList)

155

### Create a Pandas DataFrame from the Plant List.

In [12]:
df = pd.DataFrame.from_dict(plantList)

In [13]:
df = df[headers]

In [14]:
df.head()

Unnamed: 0,Plant name,Scientific name,Family name,Plant Type,Retention,Light,Water Usage,Size,Color,Wildlife,Deer Resist,Maintenance,Soil Moisture,Drought Resist,Cold Resist,Heat Resist
0,"Shrubby boneset, White mistflower, White shrub...",Ageratina havanensis,Asteraceae (Aster Family),Shrub,Evergreen,Sun,,3-6 ft.,"White , Pink","Attracts hummingbirds, moths & butterflies",Moderate,Blooms best and appears bushier if sever...,Dry,,,
1,"Indigo bush, False indigo bush, False indigo, ...",Amorpha fruticosa,Fabaceae (Pea Family),Shrub,Deciduous,"Sun , Part Shade",Low Light,6-12 ft.,"Orange , Blue , Purple , Violet","Nectar-bees, Nectar-butterflies, Nectar-inse...",High,,Moist,,yes,
2,"Huisache Daisy, Butterfly Daisy, Honey Daisy",Amblyolepis setigera,Asteraceae (Aster Family),Herb,,Part Shade,Low Light,0-1 ft.,Yellow,"Nectar-Butterflies, Nectar-Bees, Nectar-inse...",,,Dry,,,
3,"Big bluestem, Tall bluestem, Turkeyfoot",Andropogon gerardii,Poaceae (Grass Family),Grass/Grass-like,,"Sun , Part Shade","Low , Medium Light","3-6 ft. , 6-12 ft.","Red , Blue , Brown",Provides cover for at least 24 species ...,High,,Moist,High Soil Description: Acid or calcareous ...,,
4,"Bushy bluestem, Brushy bluestem",Andropogon glomeratus,Poaceae (Grass Family),Grass/Grass-like,Deciduous,Sun,High Light,3-6 ft.,"White , Brown",Seeds eaten by granivorous birds and sma...,High,,Moist,,,yes


In [15]:
len(df)

155

# Data Cleaning 

After looking at the data, is it clear that the columns 'Size' and 'Water Usage' had come funky entries. Below I will apply a couple of functions that will clean up these columns.

In [16]:
df.head()

Unnamed: 0,Plant name,Scientific name,Family name,Plant Type,Retention,Light,Water Usage,Size,Color,Wildlife,Deer Resist,Maintenance,Soil Moisture,Drought Resist,Cold Resist,Heat Resist
0,"Shrubby boneset, White mistflower, White shrub...",Ageratina havanensis,Asteraceae (Aster Family),Shrub,Evergreen,Sun,,3-6 ft.,"White , Pink","Attracts hummingbirds, moths & butterflies",Moderate,Blooms best and appears bushier if sever...,Dry,,,
1,"Indigo bush, False indigo bush, False indigo, ...",Amorpha fruticosa,Fabaceae (Pea Family),Shrub,Deciduous,"Sun , Part Shade",Low Light,6-12 ft.,"Orange , Blue , Purple , Violet","Nectar-bees, Nectar-butterflies, Nectar-inse...",High,,Moist,,yes,
2,"Huisache Daisy, Butterfly Daisy, Honey Daisy",Amblyolepis setigera,Asteraceae (Aster Family),Herb,,Part Shade,Low Light,0-1 ft.,Yellow,"Nectar-Butterflies, Nectar-Bees, Nectar-inse...",,,Dry,,,
3,"Big bluestem, Tall bluestem, Turkeyfoot",Andropogon gerardii,Poaceae (Grass Family),Grass/Grass-like,,"Sun , Part Shade","Low , Medium Light","3-6 ft. , 6-12 ft.","Red , Blue , Brown",Provides cover for at least 24 species ...,High,,Moist,High Soil Description: Acid or calcareous ...,,
4,"Bushy bluestem, Brushy bluestem",Andropogon glomeratus,Poaceae (Grass Family),Grass/Grass-like,Deciduous,Sun,High Light,3-6 ft.,"White , Brown",Seeds eaten by granivorous birds and sma...,High,,Moist,,,yes


In [17]:
df1 = df

In [18]:
def CleanSizeColumn(x):
    if x[-3:] != 'ft.':
        return 'NA'
    return x

def CleanWaterUsageColumn(x):
    if x[-5:] == 'Light':
        return x[:-6]
    return x

In [19]:
df1['Size'] = df1['Size'].apply(CleanSizeColumn)
df1['Water Usage'] = df1['Water Usage'].apply(CleanWaterUsageColumn)

In [20]:
df1.head()

Unnamed: 0,Plant name,Scientific name,Family name,Plant Type,Retention,Light,Water Usage,Size,Color,Wildlife,Deer Resist,Maintenance,Soil Moisture,Drought Resist,Cold Resist,Heat Resist
0,"Shrubby boneset, White mistflower, White shrub...",Ageratina havanensis,Asteraceae (Aster Family),Shrub,Evergreen,Sun,,3-6 ft.,"White , Pink","Attracts hummingbirds, moths & butterflies",Moderate,Blooms best and appears bushier if sever...,Dry,,,
1,"Indigo bush, False indigo bush, False indigo, ...",Amorpha fruticosa,Fabaceae (Pea Family),Shrub,Deciduous,"Sun , Part Shade",Low,6-12 ft.,"Orange , Blue , Purple , Violet","Nectar-bees, Nectar-butterflies, Nectar-inse...",High,,Moist,,yes,
2,"Huisache Daisy, Butterfly Daisy, Honey Daisy",Amblyolepis setigera,Asteraceae (Aster Family),Herb,,Part Shade,Low,0-1 ft.,Yellow,"Nectar-Butterflies, Nectar-Bees, Nectar-inse...",,,Dry,,,
3,"Big bluestem, Tall bluestem, Turkeyfoot",Andropogon gerardii,Poaceae (Grass Family),Grass/Grass-like,,"Sun , Part Shade","Low , Medium","3-6 ft. , 6-12 ft.","Red , Blue , Brown",Provides cover for at least 24 species ...,High,,Moist,High Soil Description: Acid or calcareous ...,,
4,"Bushy bluestem, Brushy bluestem",Andropogon glomeratus,Poaceae (Grass Family),Grass/Grass-like,Deciduous,Sun,High,3-6 ft.,"White , Brown",Seeds eaten by granivorous birds and sma...,High,,Moist,,,yes


### Create a CSV from the Dataframe

In [22]:
df.to_csv('CentralTexasPlants.csv',encoding='utf-8', index=False)