In [65]:
# importing the required libraries

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [66]:
# using the requests library to open the html page

ua = 'NewUserAgent'
page = requests.get("https://world-crops.com/showcase/scientific-names", headers={"User-Agent": ua})

In [68]:
# the "Scientific Names" page contains the links of all crops in the database under the html tag of "class entry-content clear"
# we can create an instance of beautiful soup and parse the page; create a python list of links to all crops
# the list is saved under a variable - crop_list

soup = BeautifulSoup(page.content, 'html.parser')
mytag = soup.find(class_="entry-content clear")
crop_list = []
for link in mytag.find_all('a'):
    crop_list.append('https://world-crops.com/showcase/scientific-names/'+link['href'])

In [134]:
# make a list of all the crop pages that we want to parse

crop_page = []
for i in range(0,650):
    my_crop = requests.get(crop_list[i], headers={"User-Agent": ua})
    crop_page.append(BeautifulSoup(my_crop.content, 'html.parser'))

In [135]:
# extracting the generic name of the crop. It is always in the first h1 tag of the webpage.
generic_name = ['']*len(crop_page)

for i in range(0,len(crop_page)):
    generic_name[i] = crop_page[i].find("h1", {"class": "entry-title"}).get_text()

In [136]:
# extracting the scientific name; and names in other languages. 


stoppoint = ['Scientific','Synonym','English','Dutch','Spanish','French','German','Italian']
names_cols = {name_col: [] for name_col in stoppoint}

for i in range(0, len(crop_page)):
    try:
        names = list(crop_page[i].find("h2", text = 'Names').next_sibling.stripped_strings)
        
    except:
        names = ' '
            
    key = names[0]
    names_dict = {}
    values = []
    for each in names[1:]:
        if each in stoppoint:
            names_dict[key] = values
            key = each
            values = []
        else:
            values.append(each)
    else:
        names_dict[key] = values
        
    for each in stoppoint:
        names_cols[each].append(names_dict.get(each, []))


In [137]:
names_df = pd.DataFrame(names_cols)

names_df['Count English'] = 0
names_df['Count Dutch'] = 0
names_df['Count Spanish'] = 0
names_df['Count French'] = 0
names_df['Count German'] = 0
names_df['Count Italian'] = 0

for i in range(names_df.shape[0]):
    names_df.at[i,"Count English"] = len(names_df.at[i,'English'])
    names_df.at[i,'English']=','.join(map(str,names_df.at[i,'English']))

    names_df.at[i,"Count Dutch"] = len(names_df.at[i,'Dutch'])
    names_df.at[i,'Dutch']=','.join(map(str,names_df.at[i,'Dutch']))

    names_df.at[i,"Count Spanish"] = len(names_df.at[i,'Spanish'])
    names_df.at[i,'Spanish']=','.join(map(str,names_df.at[i,'Spanish']))

    names_df.at[i,"Count French"] = len(names_df.at[i,'French'])
    names_df.at[i,'French']=','.join(map(str,names_df.at[i,'French']))

    names_df.at[i,"Count German"] = len(names_df.at[i,'German'])
    names_df.at[i,'German']=','.join(map(str,names_df.at[i,'German']))

    names_df.at[i,"Count Italian"] = len(names_df.at[i,'Italian'])
    names_df.at[i,'Italian']=','.join(map(str,names_df.at[i,'Italian']))

In [138]:
# extracting the taxonomy. It is always the second h2 tag in the webpage. 
# The actual taxonomy details are in the tag right after it

genus, family, order = ['']*len(crop_page), ['']*len(crop_page), ['']*len(crop_page)


for i in range(0,len(crop_page)):
    try:
        taxonomy = crop_page[i].find("div", {"class": "entry-content clear"}).find_all("h2")[1].next_sibling
        genus[i] = taxonomy.findChildren()[2].get_text()
        family[i] = taxonomy.findChildren()[6].get_text()
        order[i] = taxonomy.findChildren()[10].get_text()
    except:
        #do nothing
        genus[i] = ''

In [139]:
# extracting the crop category. Crop category is not always in the same position. 
# we can use find with text to locate the crop category tag

cat_count = [0]*len(crop_page) 
cat_list = ['']*len(crop_page)

for i in range(0,len(crop_page)):
    cat_list[i] = []
    try:
        crop_cat = crop_page[i].find("h2", text = 'Crop categories').next_sibling()
        # and since we can have more than one crop category, we save it in a list
        for ele in crop_cat:
            cat_list[i].append(ele.get_text())
        cat_list[i] = list(filter(None, cat_list[i]))
    except:
        cat_list[i].append('')

i=0
for ele in cat_list:
    cat_list[i] = ','.join(map(str,ele))
    cat_count[i] = len(ele)
    i +=1


In [140]:
# defining a function to extract the basic information

def extract_basic_info(item_name, crop_page):
    try:
        if not crop_page.find("h4", text = item_name).next_sibling.name == 'p':
            item = ''
        else:
            item = crop_page.find("h4", text = item_name).next_sibling.get_text()
            
    
    except:
        item = ''
    return(item)

In [141]:
# using the extract_basic_info function to scrape the Basic Information Section of the Webpage 

basic_info_columns = [  'Origin','Distribution','Annual, biennial, or perennial',
                        'Flowers','Leaves','Fruits','Climate and weather',
                        'Pollination','Height','Type of soil','Moisture','Light',
                        'Growth rate','Spacing (close range)','Spacing (wide range)',
                        'Propagation','Insect pests','Diseases','Nematodes',
                        'Harvesting','Uses', 'Evergreen or Deciduous','Plant','Fruit development' ,'Blooming period' ]

basic_info_dict  = {}

for i in range(0,len(crop_page)):
    j=0
    for k in basic_info_columns:
        if k not in ['Spacing (close range)','Spacing (wide range)']:
            basic_info_dict.setdefault(k,[]).append(extract_basic_info(basic_info_columns[j]+':',crop_page[i]))
        else:
            basic_info_dict.setdefault(k,[]).append(extract_basic_info(basic_info_columns[j],crop_page[i]))
        j+=1
        
basic_info_df = pd.DataFrame(basic_info_dict)

In [142]:
# extracting proverbs data. used similar logic as generic names section.

proverbs = []

for i in range(0,len(crop_page)):
    try:    
        proverbs.append(crop_page[i].find("h2", text = 'Proverbs and Quotes').next_sibling.get_text())
    except:
        proverbs.append('')

In [143]:
# extracting recipies data. some cases have multiple recipies. 
# hence extracting all the text available until we hit the next h2 tag in the the webpage.

recipies = []

for i in range(0,len(crop_page)):
    this_recipie = ''
    try:    
        next_tag = crop_page[i].find("h2", text = 'Recipes').next_sibling
        while not next_tag.name =='h2':
            this_recipie = this_recipie + next_tag.get_text()
            next_tag = next_tag.next_sibling
        recipies.append(this_recipie)
    except:
        recipies.append('')

In [144]:
# all the required data has been extracted in column format.
# we can define a dataframe and add the columns to it. 

df = names_df.join(basic_info_df)

df['genus'] = genus
df['family'] = family
df['order'] = order

df['crop_categories'] = cat_list
df['Count of Crop Categories'] = cat_count

df['proverbs'] = proverbs
df['recipies'] =recipies 



In [145]:
pd.options.display.max_columns = 50
pd.options.display.max_rows = 150

df

Unnamed: 0,Scientific,Synonym,English,Dutch,Spanish,French,German,Italian,Count English,Count Dutch,Count Spanish,Count French,Count German,Count Italian,Origin,Distribution,"Annual, biennial, or perennial",Flowers,Leaves,Fruits,Climate and weather,Pollination,Height,Type of soil,Moisture,Light,Growth rate,Spacing (close range),Spacing (wide range),Propagation,Insect pests,Diseases,Nematodes,Harvesting,Uses,Evergreen or Deciduous,Plant,Fruit development,Blooming period,genus,family,order,crop_categories,Count of Crop Categories,proverbs,recipies
0,[Siraitia grosvenorii],[Momordica grosvenorii],"Buddha fruit,Luo han guo,Monk fruit,Monk’s fruit",,,Luo han,"Luo han guo,Other,Lohoguo,Luo han guo,Luo han kuo",,4,0,0,1,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,Siraitia,Cucurbitaceae,Cucurbitales,"Fruits,Medicinal plants,Food crops,Tropical crops",4,,
1,[Smallanthus sonchifolius],[Polymnia edulis],"Peruvian ground apple,Yacón","Grond-appel,Yacon",Yacón,"Poire de terre,Yacón",Yacón,,2,2,1,2,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,Smallanthus,Asteraceae,Asterales,"Vegetables,Tuber vegetables,Food crops,Tropica...",4,,
2,[Smyrnium olusatrum],[],"Alexanders,Alisanders,Horse parsley,Smyrnium",Zwartmoeskervel,,Maceron,"Alisander,Gespenst-Gelbdolde,Pferdeeppich","Corinoli comune,Macerone",4,1,0,1,3,2,,,,,,,,,,,,,,,,,,,,,,,,,,Smyrnium,Apiaceae,Apiales,"Herbs,Temperate crops",2,,
3,[Solanum aethiopicum],[],"Ethiopian eggplant,Ethiopian nightshade,Garden...",,"Falso tomate,Nakati etíope","Aubergine africaine,Aubergine amère,Petite bri...",Äthiopische Eierfrucht,Melanzana rossa,5,0,2,4,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,Solanum,Solanaceae,Solanales,"Fruits,Vegetables,Leaf vegetables,Fruit vegeta...",6,,
4,[Solanum betaceum],[Cyphomandra betacea],"Tamarillo,Tree tomato","Boomtomaat,Tamarillo",Tomate de árbol,"Tamarillo,Tomate en arbre","Baumtomate,Tamarillo","Albero dei pomodori,Pomodoro arboreo,Tamarillo",2,2,1,2,2,3,,,,,,,,,,,,,,,,,,,,,,,,,,Solanum,Solanaceae,Solanales,"Fruits,Food crops,Tropical crops,Subtropical c...",4,,
5,[Solanum centrale],[],"Australian desert raisin,Kutjera",,,,,,2,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,Solanum,Solanaceae,Solanales,"Fruits,Food crops,Subtropical crops",3,,
6,[Solanum lycopersicum],[Lycopersicon esculentum],Tomato,Tomaat,Tomate,Tomate,Tomate,"Pomodoro,Other,Timatim",1,1,1,1,1,3,Central and South America (area between Peru a...,Now grown in all continents and in a wide rang...,"Perennial, but often grown as an annual.",Yellow flowers of 1 to 2 cm diameter grow toge...,Stem and leaves are densely hairy. The compoun...,The tomato fruit is a berry. Inside the fruit ...,Tomatoes are grown in a wide range of climates...,"Self fertilization is common, but also cross p...","The plants can reach a height of 1 to 3 meter,...",,,,,,,Usually propagated from seed. Seeds should be ...,A wide variety of insects can attack tomato in...,"Different fungal, bacterial, and virus disease...",,Picking when the fruits are ripe or almost rip...,Eaten fresh (in salads) or cooked in a variety...,,,,,Solanum,Solanaceae,Solanales,"Vegetables,Fruit vegetables,Food crops,Tropica...",6,It is the fool whose own tomatoes are sold to ...,Tomato – Mozzarella saladIngredients 500 gram...
7,[Solanum macrocarpon],[],"African eggplant,Gboma","Afrikaanse aubergine,Antroewa",,Aubergine d’Afrique,Afrikanische Eierpflanze,Melanzana petonciano,2,2,0,1,1,1,Africa,"Central and West Africa, the Caribbean, South ...",It is a perennial plant.,The purple or pale purple flowers are 2 to 3.5...,The oval and lobed leaves have a wavy margin. ...,The round fruits are partly covered by the cal...,Tropical,,The African eggplant grows about 1 to 1.5 mete...,,,,,,,,,,,,Fruits are used as a vegetable. They have a bi...,,,,,Solanum,Solanaceae,Solanales,"Vegetables,Fruit vegetables,Food crops,Tropica...",5,,
8,[Solanum melongena],[],"Brinjal,Egg-plant,Eggplant","Aubergine,Eierplant,Eiervrucht,Melanzaanappel",Berenjena,"Aubergine,Aubergine brinjal,Aubergine douce",,Melanzana,3,4,1,3,0,1,India and Sri Lanka,"All tropical and subtropical regions, but main...",Perennial but often grown as an annual.,Flowers have white or purple color.,Eggplant leaves are lobed. Leaves are 10 to 20...,In wild eggplants the fruit is rather small (s...,Eggplant requires a tropical or subtropical cl...,?,Eggplants are usually from 40 to 150 cm tall.,,,,,,,Propagation:\tEggplants are usually grown from...,A major pest of eggplant is the fruit and shoo...,Verticillium and some other fungal diseases.,,Ripe fruits are usually cut from the plant wit...,"Fruits are cooked, fried, baked, roasted, or u...",,,,,Solanum,Solanaceae,Solanales,"Vegetables,Fruit vegetables,Food crops,Tropica...",5,,
9,[Solanum muricatum],[],"Melon pear,Pepino,Pepino melon,Sweet pepino,Tr...","Appelmeloen,Meloenpeer,Pepino","Melón de árbol,Pepino de fruta,Pepino dulce,Pe...","Morelle de Wallis,Pepino,Poire-melon","Melonenbirne,Pepino","Caciuma,Melone-pepino,Pera-melone",5,3,5,3,2,3,,,,,,,,,,,,,,,,,,,,,,,,,,Solanum,Solanaceae,Solanales,"Fruits,Food crops,Subtropical crops",3,,


In [146]:
df.to_csv("Crop_Database_Final_Output.csv")