In [28]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests

COLUMNS = ["Family", "Genus", "Species", "CommonName", "GrowthRate", "HardinessZones",
           "Height", "Type", "Leaf", "Flower", "Ripen", "Reproduction", "Soils", "pH",
           "Preferences", "Tolerances", "Habitat", "HabitatRange", "Edibility",
           "Medicinal", "OtherUses"]

with open("all_plants.txt", "r") as f:
    data = f.readlines()
    for entry in data:
        data[data.index(entry)] = entry.rstrip("\n")
    # print(data)


def get_plant_info(genus, species):
    # genus_species = input("Enter the genus and species: ")
    # genus = genus_species.split()[0]
    # species = genus_species.split()[1]
    page = requests.get(f"https://pfaf.org/user/Plant.aspx?LatinName={genus}+{species}")
    soup = BeautifulSoup(page.content, 'html.parser')

    description = soup.find("meta", id="description")['content']
    description_list = description.split()

    ##### Get the characteristics #####

    # deciduous or coniferous / shrub / groundcover etc
    type = description_list[4]

    if type in ["deciduous", "coniferous"]:
        height = description_list[8]
    else:
        height = description_list[7]

    try:
        growth_rate_idx = description_list.index("rate.") - 1
        growth_rate = description_list[growth_rate_idx]
    except ValueError:
        growth_rate = ""

    leaf = ""
    if "leaf" in description_list:
        leaf_idx = description_list.index("leaf")
        if description_list[leaf_idx + 1] == "from":
            leaf = " ".join(description_list[leaf_idx + 2: leaf_idx + 5]).strip(",")


    flower = ""
    if "flower" in description_list:
        flower_idx = description_list.index("flower")
        if description_list[flower_idx + 1] == "in":
            flower = description_list[flower_idx + 2].strip(",")
        elif description_list[flower_idx + 1] == "from":
            flower = " ".join(description_list[flower_idx + 2: flower_idx + 5]).strip(",")
        else:
            print("Error: Flower not found")

    ripen_date = ""
    if "ripen" in description_list:
        tmp_idx = description_list.index("ripen")
        if description_list[tmp_idx + 1] == "in":
            ripen_idx = tmp_idx + 2
            ripen_date = description_list[ripen_idx].strip(".")
        elif description_list[tmp_idx + 1] == "from":
            ripen_idx = tmp_idx + 2
            ripen_date = " ".join(description_list[ripen_idx: ripen_idx + 3]).strip(".")

    soils = ""
    soil_idx = description.find("Suitable for:")
    soil_text = description[soil_idx:description.find(".", soil_idx)]
    soils = re.findall(r"(light|medium|heavy)", soil_text)

    ph = ""
    if "pH:" in description_list:
        ph_idx = description_list.index("pH:") + 1
        ph_list = description_list[ph_idx:]
        # check for soil or soils in the list as end of ph
        if "soils." in ph_list:
            end_idx = ph_list.index("soils.")
        elif "soils" in ph_list:
            end_idx = ph_list.index("soils")
        elif "soil." in ph_list:
            end_idx = ph_list.index("soil.")
        elif "soil" in ph_list:
            end_idx = ph_list.index("soil")
        else:
            ph = "Error: end index not found"
            end_idx=6
    ph = " ".join(ph_list[:end_idx])

    reproduction = re.findall(r"The species is\s(\w+)", description)[0]

    preferences = re.findall("prefers (.*?)\ and", description)

    tolerances = re.findall("can tolerate (.*?)\.", description)

    # Get information from the table
    table = soup.find("table",{"class":"table table-hover table-striped"})

    common_name = table.find("span", id="ContentPlaceHolder1_lblCommanName").text

    family = table.find("span", id="ContentPlaceHolder1_lblFamily").text

    # String range hardiness (with -) ex: 4-8
    hardiness_range = table.find("span", id="ContentPlaceHolder1_lblUSDAhardiness").text

    # List of hardiness zones
    hardiness_zones = []
    for zone in range(int(hardiness_range[0]), int(hardiness_range[-1]) + 1):
        hardiness_zones.append(zone)

    habitats = table.find("span", id="ContentPlaceHolder1_txtHabitats").text

    habitat_range = table.find("span", id="ContentPlaceHolder1_lblRange").text

    edibility = table.find("span", id="ContentPlaceHolder1_txtEdrating").text.strip()

    other_uses = table.find("span", id="ContentPlaceHolder1_txtOtherUseRating").text.strip()

    medicinal_rating = table.find("span", id="ContentPlaceHolder1_txtMedRating").text.strip()

    # print(table.prettify())

    # Add Line Space
#     print("")
    print(f'Common name: {common_name} \nFamily: {family} \nHardiness range: {hardiness_range} \
        \nMedicinal rating: {medicinal_rating} \nGrowth rate: {growth_rate} \nHeight: {height} meters \nType: {type} \
        \nLeaf: {leaf} \nFlower: {flower} \nRipen date: {ripen_date} \nSoils: {soils} \nSoil text: {soil_text} \npH: {ph}\
        \nReproduction: {reproduction} \nPreferences: {preferences} \nTolerances: {tolerances}\
        \nHabitats: {habitats} \nHabitat range: {habitat_range} \nEdibility: {edibility} \nOther uses: {other_uses} \
        \n \nDescription: {description}')

    return family, genus, species, common_name, growth_rate, hardiness_zones, height, \
            type, leaf, flower, ripen_date, reproduction, soils, ph, preferences, \
            tolerances, habitats, habitat_range, edibility, medicinal_rating, other_uses


def create_df():
    pd.set_option('display.max_columns', None)
    df = pd.DataFrame(index=COLUMNS)
    return df



In [25]:
df = create_df()

for plant in data:
    genus, species = plant.split(" ")
    try:
        df = pd.concat([df, pd.Series(get_plant_info(genus, species), index=COLUMNS)], axis=1)
    except Exception as e:
        print(f"Error for {genus}, {species}: {e}")



Error for Corylus, avellana: 'NoneType' object is not subscriptable
Error for Cornus, mas: 'NoneType' object is not subscriptable
Error for Sheperdia, argentea: 'NoneType' object is not subscriptable
Error for Gaylussucia, brachycera: 'NoneType' object is not subscriptable


In [26]:
df.T

Unnamed: 0,Family,Genus,Species,CommonName,GrowthRate,HardinessZones,Height,Type,Leaf,Flower,Ripen,Reproduction,Soils,pH,Preferences,Tolerances,Habitat,HabitatRange,Edibility,Medicinal,OtherUses
0,Annonaceae,Asimina,triloba,Papaw,slow,"[5, 6, 7, 8]",4.5,deciduous,,May to June,October,hermaphrodite,[medium],"mildly acid, neutral and basic (mildly alkaline)","[ well-drained soil. Suitable pH: mildly acid,...",[],"An understorey tree of woodlands, growing in d...",South-eastern N. America - New Jersey to Flori...,(4 of 5),(2 of 5),(3 of 5)
0,Fagaceae,Castanea,dentata,American Sweet Chestnut,,"[4, 5, 6, 7, 8]",30,deciduous,,July,October,monoecious,"[light, medium, heavy]",mildly acid and neutral soils and can grow in ...,"[ well-drained soil, dry or moist soil]",[drought],"Dry, gravelly or rocky, mostly acid soils[43]....",Eastern N. America - Maine and Ontario to Mich...,(3 of 5),(1 of 5),(3 of 5)
0,Fagaceae,Castanea,sativa,"Sweet Chestnut, European chestnut",medium,"[5, 6, 7]",30,deciduous,,July,October,monoecious,"[light, medium, heavy]",mildly acid and neutral soils and can grow in ...,"[ well-drained soil, dry or moist soil]","[drought, maritime exposure]",Woods in mountains[100].,S. Europe. Long naturalized in Britain[17].,(5 of 5),(2 of 5),(5 of 5)
0,Rosaceae,Pyrus,pyrifolia,"Sand Pear, Chinese pear",,"[5, 6, 7, 8, 9]",10,deciduous,,April,September,hermaphrodite,"[light, medium, heavy, heavy]","mildly acid, neutral and basic (mildly alkaline)","[ well-drained soil, moist soil]","[drought, atmospheric pollution]",Warm rainy regions at elevations of 100 - 1400...,E. Asia - China.,(4 of 5),(1 of 5),(0 of 5)
0,Grossulariaceae,Ribes,aureum,Golden Currant,,"[3, 4, 5, 6, 7, 8]",2.4,deciduous,,April,July to August,hermaphrodite,"[light, medium, heavy]","mildly acid, neutral and basic (mildly alkaline)","[ well-drained soil. Suitable pH: mildly acid,...",[],"By streams, in ravines and on mountain slopes[...",Western N. America - Saskatchewan to Washingto...,(4 of 5),(1 of 5),(0 of 5)
0,Ericaceae,Vaccinium,corymbosum,"High-Bush Blueberry, American Blueberry, Swamp...",medium,"[3, 4, 5, 6, 7, 8]",2,deciduous,,May to June,August,hermaphrodite,"[light, medium]",mildly acid soils and can grow in very acid,[ well-drained soil. Suitable pH: mildly acid ...,[],"Swamps, low wet woods, pine barrens and dry up...",Eastern N. America - Nova Scotia to Quebec and...,(4 of 5),(1 of 5),(0 of 5)
0,Grossulariaceae,Ribes,hirtellum,"Currant-Gooseberry, Hairystem gooseberry",,"[4, 5, 6, 7, 8]",1,deciduous,,April.,,hermaphrodite,"[light, medium, heavy]","mildly acid, neutral and basic (mildly alkaline)","[ well-drained soil. Suitable pH: mildly acid,...",[],Rocky or swampy woods and clearings[43].,Northern N. America - Newfoundland to Manitoba...,(3 of 5),(0 of 5),(0 of 5)
0,Caprifoliaceae,Lonicera,caerulea,"Sweetberry honeysuckle, Bluefly honeysuckle, H...",fast,"[3, 4, 5, 6, 7, 8, 9]",2,deciduous,,April to May.,,hermaphrodite,"[light, medium, heavy]","mildly acid, neutral and basic (mildly alkaline)",[],[],Low ground[235].,N.E. Europe. Northern N. America - Newfoundlan...,(4 of 5),(0 of 5),(0 of 5)
0,Pinaceae,Pinus,sylvestris,"Scot's Pine, Scotch Pine",fast,"[3, 4, 5, 6, 7]",to,evergreen,,May,March to June,monoecious,"[light, medium]","mildly acid, neutral and basic (mildly alkalin...","[ well-drained soil, dry moist or wet soil]","[drought, maritime exposure, atmospheric pollu...","Forming woods in the mountains of Scotland[7, 9].","Europe, including Britain, from Scandanavia so...",(2 of 5),(3 of 5),(5 of 5)
0,Fabaceae or Leguminosae,Caragana,arborescens,"Siberian Pea Tree, Siberian peashrub",fast,"[2, 3, 4, 5, 6, 7]",6,deciduous,,May,September,hermaphrodite,"[light, medium]","mildly acid, neutral and basic (mildly alkalin...","[ well-drained soil, dry or moist soil]",[drought],"River banks, pebbles, sands, open forests and ...",E. Asia - Siberia to Mongolia. Occasionally na...,(5 of 5),(1 of 5),(4 of 5)


In [36]:
df = pd.read_csv('plants.csv')

In [39]:
tmp = pd.Series(get_plant_info('Corylus', 'avellana'), index=COLUMNS).transpose()

Common name: Common Hazel, Common filbert, European Filbert, Harry Lauder's Walking Stick, Corkscrew Hazel, Hazel 
Family: Betulaceae 
Hardiness range: 4-8         
Medicinal rating: (2 of 5) 
Growth rate: medium 
Height: 6 meters 
Type: deciduous         
Leaf:  
Flower: January to April 
Ripen date: September to October 
Soils: ['light', 'medium', 'heavy'] 
Soil text: Suitable for: light (sandy), medium (loamy) and heavy (clay) soils 
pH: mildly acid, neutral and basic (mildly alkaline) soils and can grow in very acid and very alkaline        
Reproduction: monoecious 
Preferences: [] 
Tolerances: []        
Habitats: Woods and hedgerows, especially on the slopes of hills, often on calcareous soils[7, 17]. 
Habitat range: Europe, including Britain, from Norway to Spain and east to W. Asia. 
Edibility: (5 of 5) 
Other uses: (5 of 5)         
 
Description: Corylus avellana is a deciduous Tree growing to 6 m (19ft) by 3 m (9ft) at a medium rate.  See above for USDA hardiness. It is har

In [44]:
tmp.shape

(21,)