In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re


In [14]:
df = pd.read_csv('all_plants.csv')

In [15]:
df

Unnamed: 0,Family,Genus,Species,CommonName,GrowthRate,HardinessZones,Height,Width,Type,Leaf,...,Soils,pH,Preferences,Tolerances,Habitat,HabitatRange,Edibility,Medicinal,OtherUses,PFAF
0,,,,,,,,,,,...,,,,,,,,,,
1,Aceraceae,Acer,['saccharinum'],,,"[5, 6, 7, 8, 9]",6,,deciduous,,...,"['light', 'medium', 'heavy', 'heavy']","mildly acid, neutral and basic (mildly alkaline)",[],[],Open ravines on shady aspects at altitudes bet...,E. Asia - W. Himalayas.,(1 of 5),(0 of 5),(2 of 5),https://pfaf.org/user/Plant.aspx?LatinName=Ace...
2,Aceraceae,Acer,['saccharum'],,,"[5, 6, 7, 8, 9]",6,,deciduous,,...,"['light', 'medium', 'heavy', 'heavy']","mildly acid, neutral and basic (mildly alkaline)",[],[],Open ravines on shady aspects at altitudes bet...,E. Asia - W. Himalayas.,(1 of 5),(0 of 5),(2 of 5),https://pfaf.org/user/Plant.aspx?LatinName=Ace...
3,Aceraceae,Acer,"['saccharum', 'var.', 'nigrum']",,,"[5, 6, 7, 8, 9]",Tree,,is,,...,"['light', 'medium', 'heavy', 'heavy']","mildly acid, neutral and basic (mildly alkaline)",[],[],Open ravines on shady aspects at altitudes bet...,E. Asia - W. Himalayas.,(1 of 5),(0 of 5),(2 of 5),https://pfaf.org/user/Plant.aspx?LatinName=Ace...
4,Araliaceae,Aralia,['\tnudicaulis'],"Chinese Angelica Tree, Pumila Spirea, Chinese ...",medium,"[4, 5, 6, 7, 8]",3.5,,deciduous,,...,"['light', 'medium', 'heavy']","mildly acid, neutral and basic (mildly alkaline)",[],[],Forests on rich well moistened soil[74].,E. Asia - China,(2 of 5),(2 of 5),(0 of 5),https://pfaf.org/user/Plant.aspx?LatinName=Ara...
5,Rhamnaceae,Ceanothus,['\tprostratus'],"New Jersey Tea, Wild Snowball",fast,"[4, 5, 6, 7, 8, 9]",1.2,1.0,deciduous,,...,"['light', 'medium']","mildly acid, neutral and basic (mildly alkaline)",[' well-drained soil'],[],"Dry woods and on gravelly banks[21, 43], often...","Eastern N. America - Maine to Florida, west to...",(3 of 5),(3 of 5),(3 of 5),https://pfaf.org/user/Plant.aspx?LatinName=Cea...


In [16]:
def get_plant_info(genus, species):
    # genus_species = input("Enter the genus and species: ")
    # genus = genus_species.split()[0]
    # species = genus_species.split()[1]
    pfaf_url = f"https://pfaf.org/user/Plant.aspx?LatinName={genus}+{species}"
    headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
    page = requests.get(pfaf_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the description from which we will extract the characteristics
    # Description in string and list form
    description = soup.find("meta", id="description")['content']
    description_list = description.split()

    ##### Get the characteristics #####

    # deciduous or coniferous / shrub / groundcover etc
    type = description_list[4]

    if type in ["deciduous", "coniferous"]:
        height = description_list[8]
    else:
        height = description_list[7]

    try:
        growth_rate_idx = description_list.index("rate.") - 1 # growth rate is before "rate."
        growth_rate = description_list[growth_rate_idx]
    except ValueError:
        growth_rate = "" # sometimes no growth rate

    if "by" in description_list[:15]:
        width_idx = description_list.index("by") + 1 # width is after "by"
        width = description_list[width_idx]
    else:
        width = ""

    # When the plant leafs out
    leaf = "" # Better to do this before or after if statement?
    if "leaf" in description_list:
        leaf_idx = description_list.index("leaf")
        if description_list[leaf_idx + 1] == "from":
            leaf = " ".join(description_list[leaf_idx + 2: leaf_idx + 5]).strip(",")


    # When the plant is in flower
    flower = ""
    if "flower" in description_list:
        flower_idx = description_list.index("flower")
        if description_list[flower_idx + 1] == "in":
            flower = description_list[flower_idx + 2].strip(",")
        elif description_list[flower_idx + 1] == "from":
            flower = " ".join(description_list[flower_idx + 2: flower_idx + 5]).strip(",")
        else:
            print("Error: Flower not found")

    # When the plant's seed or fruit ripens
    ripen_date = ""
    if "ripen" in description_list:
        tmp_idx = description_list.index("ripen")
        if description_list[tmp_idx + 1] == "in":
            ripen_idx = tmp_idx + 2
            ripen_date = description_list[ripen_idx].strip(".")
        elif description_list[tmp_idx + 1] == "from":
            ripen_idx = tmp_idx + 2
            ripen_date = " ".join(description_list[ripen_idx: ripen_idx + 3]).strip(".")

    # Suitable soil structure (light, medium, heavy)
    soils = ""
    soil_idx = description.find("Suitable for:")
    soil_text = description[soil_idx:description.find(".", soil_idx)]
    soils = re.findall(r"(light|medium|heavy)", soil_text)

    # Suitable pH
    ph = ""
    if "pH:" in description_list:
        ph_idx = description_list.index("pH:") + 1
        ph_list = description_list[ph_idx:]
        # check for soil or soils in the list as end of ph
        if "soils." in ph_list:
            end_idx = ph_list.index("soils.")
        elif "soils" in ph_list:
            end_idx = ph_list.index("soils")
        elif "soil." in ph_list:
            end_idx = ph_list.index("soil.")
        elif "soil" in ph_list:
            end_idx = ph_list.index("soil")
        else:
            ph = "Error: end index not found"
            end_idx=6
    ph = " ".join(ph_list[:end_idx])

    # Monoecious or dioecious ("hermaprodite")
    reproduction = re.findall(r"The species is\s(\w+)", description)[0]

    # What the plant likes
    preferences = re.findall("prefers (.*?)\ and", description)

    # What the plant can tolerate
    tolerances = re.findall("can tolerate (.*?)\.", description)

    # Get information from the table
    table = soup.find("table",{"class":"table table-hover table-striped"})

    common_name = table.find("span", id="ContentPlaceHolder1_lblCommanName").text

    family = table.find("span", id="ContentPlaceHolder1_lblFamily").text

    # String range hardiness (with -) ex: 4-8
    hardiness_range = table.find("span", id="ContentPlaceHolder1_lblUSDAhardiness").text

    # List of hardiness zones
    hardiness_zones = []
    for zone in range(int(hardiness_range[0]), int(hardiness_range[-1]) + 1):
        hardiness_zones.append(zone)

    # Ecosystems
    habitats = table.find("span", id="ContentPlaceHolder1_txtHabitats").text

    # Native range
    habitat_range = table.find("span", id="ContentPlaceHolder1_lblRange").text

    edibility = table.find("span", id="ContentPlaceHolder1_txtEdrating").text.strip()

    other_uses = table.find("span", id="ContentPlaceHolder1_txtOtherUseRating").text.strip()

    medicinal_rating = table.find("span", id="ContentPlaceHolder1_txtMedRating").text.strip()

    print("")
    print(f'Common name: {common_name} \nFamily: {family} \nHardiness range: {hardiness_range} \
        \nMedicinal rating: {medicinal_rating} \nGrowth rate: {growth_rate} \nHeight: {height} meters \nType: {type} \
        \nLeaf: {leaf} \nFlower: {flower} \nRipen date: {ripen_date} \nSoils: {soils} \nSoil text: {soil_text} \npH: {ph}\
        \nReproduction: {reproduction} \nPreferences: {preferences} \nTolerances: {tolerances}\
        \nHabitats: {habitats} \nHabitat range: {habitat_range} \nEdibility: {edibility} \nOther uses: {other_uses} \
        \n \nDescription: {description}')


In [19]:
get_plant_info('Lonicera', 'villosa')


Common name: Mountain fly honeysuckle, Fuller's honeysuckle 
Family: Caprifoliaceae 
Hardiness range: 3-9         
Medicinal rating: (0 of 5) 
Growth rate: slow 
Height: 1.5 meters 
Type: deciduous         
Leaf:  
Flower: April to May. 
Ripen date:  
Soils: ['light', 'medium', 'heavy'] 
Soil text: Suitable for: light (sandy), medium (loamy) and heavy (clay) soils 
pH: mildly acid, neutral and basic (mildly alkaline)        
Reproduction: hermaphrodite 
Preferences: [] 
Tolerances: []        
Habitats: Peaty or rocky barrens, bogs and bushy places[43], especially on limestone. 
Habitat range: N. America - Newfoundland to Alaska, south to Pennsylvania, Wisconsin and California. 
Edibility: (3 of 5) 
Other uses: (0 of 5)         
 
Description: Lonicera villosa is a deciduous Shrub growing to 1.5 m (5ft) at a slow rate. It is in flower from April to May. The species is hermaphrodite (has both male and female organs) and is pollinated by Insects.  Suitable for: light (sandy), medium (loa