# Scrape

In [233]:
from bs4 import BeautifulSoup as bs
import requests
from pprint import pprint
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from splinter import Browser
import pymongo

In [52]:
#find links to scrape
url='https://oceana.org/marine-life'
base_url='https://oceana.org'

response=requests.get(url)
soup=bs(response.content, 'lxml')

menu=soup.find('div', class_='outter-contain-limit')
links=menu.find_all('a', href=True)

urls=[]
for link in links:
    urls.append(base_url+link['href'])

#remove marine science and ecosystems link    
urls.remove('https://oceana.org/marine-life/marine-science-and-ecosystems')
urls

['https://oceana.org/marine-life/cephalopods-crustaceans-other-shellfish',
 'https://oceana.org/marine-life/corals-and-other-invertebrates',
 'https://oceana.org/marine-life/marine-mammals',
 'https://oceana.org/marine-life/ocean-fishes',
 'https://oceana.org/marine-life/sea-turtles-reptiles',
 'https://oceana.org/marine-life/seabirds',
 'https://oceana.org/marine-life/sharks-rays']

In [53]:
#scrape all urls for species links
urls_to_scrape=[]
for url in urls:
    response=requests.get(url)
    soup=bs(response.content, 'lxml')

    menu=soup.find('main', class_='animal-contain')
    links=menu.find_all('a', href=True)

    for link in links:
        urls_to_scrape.append(base_url+link['href'])

In [54]:
pre_df=[]
for url in tqdm(urls_to_scrape):
    try:
        response=requests.get(url)
        soup=bs(response.content, 'lxml')
        header=soup.find('div', class_='subpage-header-inner')
        category=header.h2.text
        common_name=header.h1.text
        scientific_name=header.find('p', class_='subname').text
        details=soup.find('div', class_='animal-details-side')
        facts=details.find_all('p')
        distribution=facts[0].text.strip()
        taxonomy=facts[4].text.strip()
        image=soup.find('div', class_='animal-image-contain').img['src']

    except:
        pass
    
    details={'common_name': common_name,
             'scientific_name': scientific_name,
             'taxonomy': taxonomy,
             'category': category,
             'distribution': distribution,
             'images': [image],
             'url': url}
    
    pre_df.append(details)

HBox(children=(IntProgress(value=0, max=228), HTML(value='')))




In [55]:
oceana_fact_df=pd.DataFrame(pre_df)
oceana_fact_df.head()
oceana_fact_df.to_csv('oceana.csv', index=False)

In [56]:
#setting up path to chrome driver for splinter
executable_path={'executable_path':'chromedriver.exe'}

In [57]:
#gather urls to scrape
browser = Browser('chrome', **executable_path, headless=False)
url_1='https://reefguide.org/index1.html'
base_url='https://reefguide.org/'
browser.visit(url_1)

#set up parser
html = browser.html
soup = bs(html, 'lxml')

## Expanding menu to get all links
menu=soup.find_all('div', class_='ui-accordion-content')
urls_to_scrape=[]
for item in menu:
    try:
        links=item.find_all('a', href=True)
        for i in links:
            url=base_url+i['href']
            urls_to_scrape.append(url)
    except:
        pass
    
#add original url
urls_to_scrape.append(url_1)

#close browser
browser.quit()

urls_to_scrape

['https://reefguide.org/index50.html',
 'https://reefguide.org/index3.html',
 'https://reefguide.org/index51.html',
 'https://reefguide.org/index2.html',
 'https://reefguide.org/index4.html',
 'https://reefguide.org/index5.html',
 'https://reefguide.org/index6.html',
 'https://reefguide.org/index7.html',
 'https://reefguide.org/index8.html',
 'https://reefguide.org/index9.html',
 'https://reefguide.org/index58.html',
 'https://reefguide.org/index59.html',
 'https://reefguide.org/index10.html',
 'https://reefguide.org/index11.html',
 'https://reefguide.org/index12.html',
 'https://reefguide.org/index13.html',
 'https://reefguide.org/index52.html',
 'https://reefguide.org/index55.html',
 'https://reefguide.org/index14.html',
 'https://reefguide.org/index15.html',
 'https://reefguide.org/index57.html',
 'https://reefguide.org/index16.html',
 'https://reefguide.org/index17.html',
 'https://reefguide.org/index54.html',
 'https://reefguide.org/index18.html',
 'https://reefguide.org/index19.h

In [58]:
#Perform scrape of to Common name, Scientific Name, URL link
df_list=[]
for i, url in enumerate(urls_to_scrape):
    response=requests.get(url)
    soup = bs(response.content, 'lxml')
    
    table=soup.find('table', id='TopTable')
    hrefs=table.find_all('a', href=True)
    images=table.find_all('img')
    
    base_url='https://reefguide.org/'
    
    urls=[]
    common_names=[]
    scientific_names=[]
    names=[]
    
    for item in hrefs:
        urls.append(base_url+item['href'])
        names.append(item.text)
    for item in images:
        common_names.append(item['alt'].split(' - ')[0])
        scientific_names.append(item['alt'].split(' - ')[1])
    
    df_1=pd.DataFrame({'name':names,
                      'url': urls})
    df_2=pd.DataFrame({'common_name': common_names,
                       'scientific_name': scientific_names})
    
    #need to remove certain urls that do not line up with link to individual species
    exec("df{}=pd.merge(df_2, df_1, left_on='common_name', right_on='name', how='inner')".format(i))
    exec('df_list.append(df{})'.format(i))

In [59]:
#concatenating all fish dfs gathered
reef=pd.concat(df_list)
reef.shape

(1737, 4)

In [60]:
# finding duplicates (species that share common_name) the merge above would have caused multiple copies (some incorrect)
test=reef.groupby('common_name')
x=test.filter(lambda x: x['common_name'].count() > 1.)
x

Unnamed: 0,common_name,scientific_name,name,url
37,Sunset Wrasse,Thalassoma lutescens,Sunset Wrasse,https://reefguide.org/sunsetwrasse.html
38,Sunset Wrasse,Thalassoma lutescens,Sunset Wrasse,https://reefguide.org/thalassomagrammaticum.html
39,Sunset Wrasse,Thalassoma grammaticum,Sunset Wrasse,https://reefguide.org/sunsetwrasse.html
40,Sunset Wrasse,Thalassoma grammaticum,Sunset Wrasse,https://reefguide.org/thalassomagrammaticum.html


In [61]:
#dropping duplicates (to account for 2 species sharing the same common name 'Sunset Wrasse')
reef.drop_duplicates(subset=['common_name', 'scientific_name'], keep='last', inplace=True)

In [62]:
#fixing the url 'https://reefguide.org/sunsetwrasse.html' for Sunset Wrasse
reef['url'][(reef["common_name"] =='Sunset Wrasse') & (reef["scientific_name"] == 'Thalassoma lutescens')]='https://reefguide.org/sunsetwrasse.html'
reef.loc[reef['common_name']=='Sunset Wrasse']

Unnamed: 0,common_name,scientific_name,name,url
38,Sunset Wrasse,Thalassoma lutescens,Sunset Wrasse,https://reefguide.org/sunsetwrasse.html
40,Sunset Wrasse,Thalassoma grammaticum,Sunset Wrasse,https://reefguide.org/thalassomagrammaticum.html


In [64]:
#confirming the count
reef.shape

(1735, 4)

In [65]:
#removing redundant columns
reef = reef.drop("name", axis=1)
reef.head()

Unnamed: 0,common_name,scientific_name,url
0,Moorish Idol,Zanclus cornutus,https://reefguide.org/moorishidol.html
1,Longsnout Butterflyfish,Prognathodes aculeatus,https://reefguide.org/longsnoutbutter.html
2,Orange-Banded Coralfish,Coradion chrysozonus,https://reefguide.org/coradionchrysozonus.html
3,Two-Eyed Coralfish,Coradion melanopus,https://reefguide.org/coradionmelanopus.html
4,Humphead Bannerfish,Heniochus varius,https://reefguide.org/humpheadbannerfish.html


In [66]:
#gathering individual urls to scrape
reef_urls=reef['url']

In [67]:
#scraping urls for pictures and facts
pre_df=[]
base_url='https://reefguide.org/'

for url in tqdm(reef_urls):
    response=requests.get(url)
    soup = bs(response.content, 'lxml')
    
    title=soup.find('div', class_='titledetails').text
    
    menu=soup.find_all('div', class_='infodetails')
    
    try:
        scientific_name=menu[0].text.split(': ')[1]
        family=menu[2].text.split(': ')[1]
        category=menu[4].text.split(': ')[1]
        size=(menu[6].text.split(': ')[1]).replace(u'\xa0', u'')
        depth=menu[7].text.split(': ')[1]
        distribution=menu[8].text.split(': ')[1]
    except:
        pass

    details={'common_name': title,
             'scientific_name': scientific_name,
             'family': family,
             'category': category,
             'size': size,
             'depth': depth,
             'distribution': distribution}
 
    images=soup.find('div', id='gallery')
    image_details=images.find_all('img')
    image_list=[]


    for image in image_details:
        image_list.append(base_url+image['src'])
    
    details['images']=image_list
             
    pre_df.append(details)

HBox(children=(IntProgress(value=0, max=1735), HTML(value='')))




In [68]:
reef_fact_df=pd.DataFrame(pre_df)
reef_fact_df.head()

Unnamed: 0,category,common_name,depth,distribution,family,images,scientific_name,size
0,Zanclidae,Moorish Idol,10-300 ft. (3-90 m),"Indo-Pacific, Hawaii, Gulf of California, Paci...",Zanclidae,[https://reefguide.org/pix/thumb2/moorishidol4...,Zanclus cornutus,5 to 8 in. (13 to 20 cm)
1,Butterflyfishes,Longsnout Butterflyfish,30-200 ft. (10-60 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Chaetodontidae,[https://reefguide.org/pix/thumb2/longsnoutbut...,Prognathodes aculeatus,2 to 3 in. (5 to 8 cm)
2,Butterflyfishes,Orange-Banded Coralfish,10-180 ft. (3-55 m),"Indo-West Pacific, Great Barrier Reef",Chaetodontidae,[https://reefguide.org/pix/thumb2/coradionchry...,Coradion chrysozonus,Up to 6 in. (15 cm)
3,Butterflyfishes,Two-Eyed Coralfish,30-100 ft. (10-30 m),Asian Pacific,Chaetodontidae,[https://reefguide.org/pix/coradionmelanopus1....,Coradion melanopus,Up to 6 in. (15 cm)
4,Butterflyfishes,Humphead Bannerfish,6-100 ft. (2-30 m),"West Pacific, Indonesia, French Polynesia",Chaetodontidae,[https://reefguide.org/pix/thumb2/humpheadbann...,Heniochus varius,5 to 8 in. (13 to 20 cm)


In [69]:
#merge reef_fact_df with reef df
reef_df=pd.merge(reef, reef_fact_df, left_on=['scientific_name', 'common_name'], right_on=['scientific_name', 'common_name'], how='inner')
#export to csv
reef_df.to_csv('reef.csv', index=False)

reef_df.head()

Unnamed: 0,common_name,scientific_name,url,category,depth,distribution,family,images,size
0,Moorish Idol,Zanclus cornutus,https://reefguide.org/moorishidol.html,Zanclidae,10-300 ft. (3-90 m),"Indo-Pacific, Hawaii, Gulf of California, Paci...",Zanclidae,[https://reefguide.org/pix/thumb2/moorishidol4...,5 to 8 in. (13 to 20 cm)
1,Longsnout Butterflyfish,Prognathodes aculeatus,https://reefguide.org/longsnoutbutter.html,Butterflyfishes,30-200 ft. (10-60 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Chaetodontidae,[https://reefguide.org/pix/thumb2/longsnoutbut...,2 to 3 in. (5 to 8 cm)
2,Orange-Banded Coralfish,Coradion chrysozonus,https://reefguide.org/coradionchrysozonus.html,Butterflyfishes,10-180 ft. (3-55 m),"Indo-West Pacific, Great Barrier Reef",Chaetodontidae,[https://reefguide.org/pix/thumb2/coradionchry...,Up to 6 in. (15 cm)
3,Two-Eyed Coralfish,Coradion melanopus,https://reefguide.org/coradionmelanopus.html,Butterflyfishes,30-100 ft. (10-30 m),Asian Pacific,Chaetodontidae,[https://reefguide.org/pix/coradionmelanopus1....,Up to 6 in. (15 cm)
4,Humphead Bannerfish,Heniochus varius,https://reefguide.org/humpheadbannerfish.html,Butterflyfishes,6-100 ft. (2-30 m),"West Pacific, Indonesia, French Polynesia",Chaetodontidae,[https://reefguide.org/pix/thumb2/humpheadbann...,5 to 8 in. (13 to 20 cm)


# Clean-Up

In [255]:
#consolidating both dfs
oceana=oceana_fact_df
reef=reef_df

In [256]:
#figuring out what species are common between 2 scrapes
test=pd.merge(reef, oceana, left_on=['scientific_name'], right_on=['scientific_name'], how='inner')
test.head()

Unnamed: 0,common_name_x,scientific_name,url_x,category_x,depth,distribution_x,family,images_x,size,category_y,common_name_y,distribution_y,images_y,taxonomy,url_y
0,Banded Butterflyfish,Chaetodon striatus,https://reefguide.org/bandedbutterfly.html,Butterflyfishes,10-60 ft. (3-18 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Chaetodontidae,[https://reefguide.org/pix/thumb2/bandedbutter...,3 to 5 in. (8 to 13 cm),Ocean Fishes,Banded Butterflyfish,Coral reefs,[https://oceana.org/sites/default/files/styles...,"Order Cottiformes (sculpins and relatives), Fa...",https://oceana.org/marine-life/ocean-fishes/ba...
1,Threespot Damselfish,Stegastes planifrons,https://reefguide.org/threespotdamsel.html,Damselfishes,1-130 ft. (0-40 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Pomacentridae,[https://reefguide.org/pix/thumb2/threespotdam...,3 to 4 in. (8 to 10 cm),Ocean Fishes,Threespot Damselfish,"The Caribbean Sea and adjacent waters, includi...",[https://oceana.org/sites/default/files/styles...,"Order Labriformes (wrasses and relatives), Fam...",https://oceana.org/marine-life/ocean-fishes/th...
2,False Clown Anemonefish,Amphiprion ocellaris,https://reefguide.org/falseclownanemonefish.html,Anemonefishes,3-45 ft. (1-15 m),Indo-West Pacific,Pomacentridae,[https://reefguide.org/pix/thumb2/falseclownan...,Up to 3.5 in. (9 cm),Ocean Fishes,Common Clownfish,Tropical to warm temperate latitudes of the ea...,[https://oceana.org/sites/default/files/styles...,"Order Labriformes (wrasses and relatives), Fam...",https://oceana.org/marine-life/ocean-fishes/co...
3,Goliath Grouper,Epinephelus itajara,https://reefguide.org/goliathgrouper.html,Groupers,10-100 ft. (3-30 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Serranidae,[https://reefguide.org/pix/thumb2/goliathgroup...,3 to 8 ft. (1 to 2.5 m),Ocean Fishes,Atlantic Goliath Grouper,Tropical to temperate latitudes in the Atlanti...,[https://oceana.org/sites/default/files/styles...,"Order Perciformes (perch-like fishes), Family ...",https://oceana.org/marine-life/ocean-fishes/at...
4,Nassau Grouper,Epinephelus striatus,https://reefguide.org/nassau.html,Groupers,20-100 ft. (6-30 m),"Caribbean, Bahamas, Florida, Bermuda, Brazil",Serranidae,"[https://reefguide.org/pix/thumb2/nassau1.jpg,...",1 to 2 ft. (30 to 60 cm),Ocean Fishes,Nassau Grouper,Coral reefs,[https://oceana.org/sites/default/files/styles...,"Order Carangiformes (jacks and relatives), Fam...",https://oceana.org/marine-life/ocean-fishes/na...


In [257]:
#consolidating images between 51 like species
test=test[['common_name_x', 'scientific_name', 'url_x', 'category_x', 'depth', 'distribution_x', 'family', 'images_x', 'images_y']]
test['images']=test['images_x']+test['images_y']

test['images'].values[0]

['https://reefguide.org/pix/thumb2/bandedbutterfly1.jpg',
 'https://reefguide.org/pix/thumb2/bandedbutterfly3.jpg',
 'https://reefguide.org/pix/thumb2/bandedbutterfly2.jpg',
 'https://reefguide.org/pix/thumb2/bandedbutterfly5.jpg',
 'https://reefguide.org/pix/thumb2/bandedbutterfly4.jpg',
 'https://oceana.org/sites/default/files/styles/lightbox_full/public/shutterstock_pixelnest_atlantic_wolffish.jpg?itok=7LzPOTg5']

In [258]:
test=test[['common_name_x',
           'scientific_name',
           'url_x', 'category_x',
           'depth', 'distribution_x',
           'family',
           'images']].rename(columns={'url_x':'url',
                                      'distribution_x': 'distribution',
                                      'category_x': 'category',
                                      'common_name_x':'common_name'})

test.head()

Unnamed: 0,common_name,scientific_name,url,category,depth,distribution,family,images
0,Banded Butterflyfish,Chaetodon striatus,https://reefguide.org/bandedbutterfly.html,Butterflyfishes,10-60 ft. (3-18 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Chaetodontidae,[https://reefguide.org/pix/thumb2/bandedbutter...
1,Threespot Damselfish,Stegastes planifrons,https://reefguide.org/threespotdamsel.html,Damselfishes,1-130 ft. (0-40 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Pomacentridae,[https://reefguide.org/pix/thumb2/threespotdam...
2,False Clown Anemonefish,Amphiprion ocellaris,https://reefguide.org/falseclownanemonefish.html,Anemonefishes,3-45 ft. (1-15 m),Indo-West Pacific,Pomacentridae,[https://reefguide.org/pix/thumb2/falseclownan...
3,Goliath Grouper,Epinephelus itajara,https://reefguide.org/goliathgrouper.html,Groupers,10-100 ft. (3-30 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Serranidae,[https://reefguide.org/pix/thumb2/goliathgroup...
4,Nassau Grouper,Epinephelus striatus,https://reefguide.org/nassau.html,Groupers,20-100 ft. (6-30 m),"Caribbean, Bahamas, Florida, Bermuda, Brazil",Serranidae,"[https://reefguide.org/pix/thumb2/nassau1.jpg,..."


In [259]:
species_to_remove=test['scientific_name']

In [260]:
#remove 51 overlapping species from reef and oceana
reef=reef[~reef['scientific_name'].isin(species_to_remove)]
oceana=oceana[~oceana['scientific_name'].isin(species_to_remove)]

print (reef.shape)
print (oceana.shape)
print (test.shape)

(1675, 9)
(177, 7)
(51, 8)


In [261]:
reef.columns

Index(['common_name', 'scientific_name', 'url', 'category', 'depth',
       'distribution', 'family', 'images', 'size'],
      dtype='object')

In [262]:
oceana.columns

Index(['category', 'common_name', 'distribution', 'images', 'scientific_name',
       'taxonomy', 'url'],
      dtype='object')

In [263]:
test.columns

Index(['common_name', 'scientific_name', 'url', 'category', 'depth',
       'distribution', 'family', 'images'],
      dtype='object')

In [264]:
oceana=oceana.rename(columns={'taxonomy':'family'})
oceana['depth']=' '

In [265]:
full=pd.concat([reef, test, oceana], sort=False)
full.drop_duplicates(subset=['scientific_name'], keep='first', inplace=True)
full.drop_duplicates(subset=['common_name'], keep='first', inplace=True)
print(full.shape)

full.head()

(1900, 9)


Unnamed: 0,common_name,scientific_name,url,category,depth,distribution,family,images,size
0,Moorish Idol,Zanclus cornutus,https://reefguide.org/moorishidol.html,Zanclidae,10-300 ft. (3-90 m),"Indo-Pacific, Hawaii, Gulf of California, Paci...",Zanclidae,[https://reefguide.org/pix/thumb2/moorishidol4...,5 to 8 in. (13 to 20 cm)
1,Longsnout Butterflyfish,Prognathodes aculeatus,https://reefguide.org/longsnoutbutter.html,Butterflyfishes,30-200 ft. (10-60 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",Chaetodontidae,[https://reefguide.org/pix/thumb2/longsnoutbut...,2 to 3 in. (5 to 8 cm)
2,Orange-Banded Coralfish,Coradion chrysozonus,https://reefguide.org/coradionchrysozonus.html,Butterflyfishes,10-180 ft. (3-55 m),"Indo-West Pacific, Great Barrier Reef",Chaetodontidae,[https://reefguide.org/pix/thumb2/coradionchry...,Up to 6 in. (15 cm)
3,Two-Eyed Coralfish,Coradion melanopus,https://reefguide.org/coradionmelanopus.html,Butterflyfishes,30-100 ft. (10-30 m),Asian Pacific,Chaetodontidae,[https://reefguide.org/pix/coradionmelanopus1....,Up to 6 in. (15 cm)
4,Humphead Bannerfish,Heniochus varius,https://reefguide.org/humpheadbannerfish.html,Butterflyfishes,6-100 ft. (2-30 m),"West Pacific, Indonesia, French Polynesia",Chaetodontidae,[https://reefguide.org/pix/thumb2/humpheadbann...,5 to 8 in. (13 to 20 cm)


In [266]:
# finding duplicates (species that share common_name and scientific_name) the merge above would have caused multiple copies (some incorrect)
check=full.groupby('common_name')
x=check.filter(lambda x: x['common_name'].count() > 1.)
x

Unnamed: 0,common_name,scientific_name,url,category,depth,distribution,family,images,size


In [267]:
##creating common categories
full['category'].unique()

array(['Zanclidae', 'Butterflyfishes', 'Damselfishes', 'Chromis',
       'Anemonefishes', 'Hamlets', 'Jacks', 'Mackerels and Tunas',
       'Porgies', 'Mojarras', 'Chubs', 'Spadefishes', 'Tarpons',
       'Barracudas', 'Bonnetmouths', 'Bonefishes', 'Mackerels',
       'Needlefishes', 'Halfbeaks', 'Groupers', 'Seabasses', 'Anthias',
       'Soapfishes', 'Basslets', 'Grunts', 'Sweetlips', 'Snappers',
       'Coral Breams', 'Emperors', 'Fusilier', 'Parrotfishes', 'Wrasses',
       'Squirrelfishes', 'Bigeyes', 'Cardinalfishes', 'Gobies',
       'Labrisomid Blennies', 'Tube Blennies', 'Combtooth Blennies',
       'Triplefin Blennies', 'Dragonets', 'Dartfishes', 'Dottybacks',
       'Longfins', 'Jawfishes', 'Lizardfishes', 'Sand Divers',
       'Sandperches', 'Tilefishes', 'Hawkfishes', 'Flatheads',
       'Velvetfishes', 'seamoths', 'Stargazers', 'Lefteye Flounders',
       'Righteye Flounders', 'Sand Flounders', 'Soles', 'Clingfishes',
       'Toadfishes', 'Batfishes', 'Frogfishes', 'Flyin

In [268]:
#Combining categories
Large_Oval_Fish=['Zanclidae', 'Butterflyfishes', 'Angelfishes', 'Surgeonfishes', 'Rabbitfishes']
Small_Oval_Fish=['Damselfishes', 'Chromis','Anemonefishes', 'Hamlets']
Silvery_Fish=['Jacks', 'Mackerels and Tunas','Porgies', 'Mojarras', 'Chubs', 'Spadefishes', 'Tarpons','Barracudas', 'Bonnetmouths', 'Bonefishes', 'Mackerels', 'Needlefishes', 'Halfbeaks']
Groupers_and_Seabasses= ['Groupers', 'Seabasses', 'Anthias', 'Soapfishes', 'Basslets']
Grunts_and_Snappers=['Grunts', 'Sweetlips', 'Snappers', 'Coral Breams', 'Emperors', 'Fusilier']
Parrotfishes_and_Wrasses=['Parrotfishes', 'Wrasses']
Red_Fish=['Squirrelfishes', 'Bigeyes', 'Cardinalfishes']
Gobies_and_Blennies=['Gobies','Labrisomid Blennies', 'Tube Blennies', 'Combtooth Blennies','Triplefin Blennies']
Crevice_Sand_and_Burrow_Dwellers=['Dragonets', 'Dartfishes', 'Dottybacks','Longfins', 'Jawfishes', 'Lizardfishes', 'Sand Divers','Sandperches', 'Tilefishes']
Bottom_Dweller_Fish=['Hawkfishes', 'Flatheads', 'Velvetfishes', 'seamoths', 'Stargazers', 'Lefteye Flounders','Righteye Flounders', 'Sand Flounders', 'Soles', 'Clingfishes', 'Toadfishes', 'Batfishes', 'Frogfishes', 'Flying Gurnards','Lionfishes', 'Scorpionfishes', 'Stonefishes', 'Waspfishes']
Odd_Shaped_Swimmers=['Triggerfishes', 'Filefishes', 'Boxfishes', 'Porcupinefishes','Pufferfishes', 'Drums', 'Goatfishes', 'Shrimpfishes','Trumpetfishes', 'Cornetfishes', 'Sweepers', 'Remoras','Catfishes']
Seahorses_and_pipefish=['Seahorses', 'Ghost Pipefishes', 'Pipefishes']
Eels=['Moray Eels', 'Snake Eels', 'Conger Eels', 'Garden Eels']
Sharks_and_Rays=['Manta Rays', 'Stingrays', 'Round Stingrays', 'Electric Rays','Sharks', 'Eagle Rays', 'Sharks & Rays']
Crustaceans=['Boxer Shrimps', 'Shrimps', 'Broken-back Shrimp',
       'Prawns', 'Ghost Shrimps', 'Spiny Lobsters', 'Reef Lobsters',
       'Slipper Lobsters', 'Squat Lobsters', 'Mantis Shrimps',
       'Skeleton Shrimps', 'Porcelain Crabs', 'Spider Crabs',
       'True Crabs', 'Swimming Crabs', 'Mud Crab', 'Box Crab',
       'Hermit Crabs', 'Right-Handed Hermits', 'Isopods', 'Horseshoe Crabs']
Worms=['Flatworms','Acoel Flatworms', 'Fireworms', 'Elongated Worms', 'Tube Worms',
       'Feather Duster Worms', 'Spaghetti Worms', 'Ribbon Worms',
       'Scale Worms', 'Horseshoe Worms', 'Honeycomb Worms']
Sea_Slugs=['Dorid Nudibranchs', 'Aeolid Nudibranchs', 'Arminid Nudibranchs',
       'Dendronotid Nudibranchs', 'Sap-Sucking Slugs', 'Headshield Slugs',
       'Sea Hares', 'Sidegill Slugs']
Gastropods=['Cowries', 'Allied Cowries','True Conchs', 'Helmet Shells', 'Triton Shells', 'Murex',
       'Volutes', 'Velutinids', 'Cone Shells', 'Augers Shells',
       'Nassa Mud Snails', 'Mitre Shells', 'Tuns Shells', 'Moon Snails',
       'Sundials', 'Wentletraps', 'Tulip Shells', 'Turban Shells',
       'Parasitic Snails', 'Worm Snails']
Cephalopods=['Squids', 'Cuttlefishes','Octopuses']
Bivalves_and_Chitons=['Giant Clams', 'Feather Oysters', 'Thorny Oysters',
       'Honeycomb Oysters', 'True Oysters', 'Scallops', 'File Clams',
       'Pen Shells', 'Chitons']
Cnidarians=['Sea Anemones', 'Tube-Dwelling Anemones',
       'Corallimorphs', 'Hydroid', 'Jellyfishes', 'Zoanthids',
       'Comb Jellies', 'Benthic Comb Jellies']
Echinoderms=['Sea Stars','Feather Stars', 'Basket Stars', 'Brittle Stars', 'Sea Urchins',
       'Sea Cucumbers']
Tunicates_and_Bryozoans=['Tunicates', 'Pelagic Tunicates', 'Bryozoans']
Sponges=['Sponges', 'Calcareous Sponges']
Coral=['Fire Corals', 'Lace Corals',
       'Gorgonians', 'Soft Corals', 'Sea Pens', 'Black Corals',
       'Stony Corals']
Algae=['Brown Algae', 'Green Algae', 'Red Algae']
Marine_Reptiles_and_Mammals=['Sea Snakes', 'Dolphins', 'Turtles', 'Manatees',  'Marine Mammals', 'Sea Turtles & Reptiles']
Sea_Birds=['Seabirds']


In [269]:
full['category']=full['category'].replace(Large_Oval_Fish, 'Large Oval Fish')
full['category']=full['category'].replace(Small_Oval_Fish, 'Small Oval Fish')
full['category']=full['category'].replace(Silvery_Fish, 'Silvery Fish')
full['category']=full['category'].replace(Groupers_and_Seabasses, 'Groupers and Seabasses')
full['category']=full['category'].replace(Grunts_and_Snappers, 'Grunts and Snappers')
full['category']=full['category'].replace(Parrotfishes_and_Wrasses, 'Parrotfishes and Wrasses')
full['category']=full['category'].replace(Red_Fish, 'Red Fish')
full['category']=full['category'].replace(Gobies_and_Blennies, 'Gobies and Blennies')
full['category']=full['category'].replace(Crevice_Sand_and_Burrow_Dwellers, 'Crevice, Sand, and Burrow Dwellers')
full['category']=full['category'].replace(Bottom_Dweller_Fish, 'Bottom Dwellers')
full['category']=full['category'].replace(Odd_Shaped_Swimmers, 'Odd Shaped Swimmers')
full['category']=full['category'].replace(Seahorses_and_pipefish, 'Seahorses and Pipefish')
full['category']=full['category'].replace(Eels, 'Eels')
full['category']=full['category'].replace(Sharks_and_Rays, 'Sharks and Rays')
full['category']=full['category'].replace(Crustaceans, 'Crustaceans')
full['category']=full['category'].replace(Worms, 'Worms')
full['category']=full['category'].replace(Sea_Slugs, 'Sea Slugs')
full['category']=full['category'].replace(Gastropods, 'Gastropods')
full['category']=full['category'].replace(Cephalopods, 'Cephalopods')
full['category']=full['category'].replace(Bivalves_and_Chitons, 'Bivalves and Chitons')
full['category']=full['category'].replace(Cnidarians, 'Cnidarians')
full['category']=full['category'].replace(Echinoderms, 'Echinoderms')
full['category']=full['category'].replace(Tunicates_and_Bryozoans, 'Tunicates and Bryozoans')
full['category']=full['category'].replace(Sponges, 'Sponges')
full['category']=full['category'].replace(Coral, 'Coral')
full['category']=full['category'].replace(Algae, 'Algae')
full['category']=full['category'].replace(Marine_Reptiles_and_Mammals, 'Marine Reptiles and Mammals')
full['category']=full['category'].replace(Sea_Birds, 'Sea Birds')


full['category'].unique()

array(['Large Oval Fish', 'Small Oval Fish', 'Silvery Fish',
       'Groupers and Seabasses', 'Grunts and Snappers',
       'Parrotfishes and Wrasses', 'Red Fish', 'Gobies and Blennies',
       'Crevice, Sand, and Burrow Dwellers', 'Bottom Dwellers',
       'Odd Shaped Swimmers', 'Seahorses and Pipefish', 'Eels',
       'Sharks and Rays', 'Crustaceans', 'Worms', 'Sea Slugs',
       'Gastropods', 'Cephalopods', 'Bivalves and Chitons', 'Cnidarians',
       'Echinoderms', 'Tunicates and Bryozoans', 'Sponges', 'Coral',
       'Algae', 'Marine Reptiles and Mammals',
       'Cephalopods, Crustaceans, & Other Shellfish',
       'Corals and Other Invertebrates', 'Ocean Fishes', 'Sea Birds'],
      dtype=object)

In [271]:
#categories 'Cephalopods, Crustaceans, & Other Shellfish','Corals and Other Invertebrates', 'Ocean Fishes' need to be corrected
correction=pd.read_csv('manual_category_fix.csv')
correction=pd.merge(correction, full, on=['scientific_name', 'common_name'], how='left')
correction=correction[['common_name', 
                       'scientific_name', 
                       'category_x', 
                       'url', 
                       'depth', 
                       'distribution', 
                       'family', 
                       'images', 
                       'size']].rename(columns={'category_x': 'category'})

correction.head()

Unnamed: 0,common_name,scientific_name,category,url,depth,distribution,family,images,size
0,Acorn Barnacle,Semibalanus balanoides,Crustaceans,https://oceana.org/marine-life/cephalopods-cru...,,Cold Temperate to Sub-Polar Latitudes of the N...,"Subphylum Crustacea (Crabs, Shrimps, and Relat...",[https://oceana.org/sites/default/files/styles...,
1,American Lobster,Homarus americanus,Crustaceans,https://oceana.org/marine-life/cephalopods-cru...,,Restricted to Temperate Latitudes of the North...,"Subphylum Crustacea (Crabs, Shrimps, and Relat...",[https://oceana.org/sites/default/files/styles...,
2,Antarctic Krill,Euphausia superba,Crustaceans,https://oceana.org/marine-life/cephalopods-cru...,,Circumpolar in the Southern Hemisphere,"Subphylum Crustacea (Crabs, Shrimps, and Relat...",[https://oceana.org/sites/default/files/styles...,
3,Argentine Shortfin Squid,Illex argentinus,Cephalopods,https://oceana.org/marine-life/cephalopods-cru...,,Subtropical to subpolar latitudes off Brazil a...,"Subphylum Crustacea (Crabs, Shrimps, and Relat...",[https://oceana.org/sites/default/files/styles...,
4,Blue King Crab,Paralithodes platypus,Crustaceans,https://oceana.org/marine-life/cephalopods-cru...,,Cold temperate and sub-polar latitudes of the ...,"Subphylum Crustacea (crabs, shrimps, and relat...",[https://oceana.org/sites/default/files/styles...,


In [272]:
#names to remove from full
species_to_remove=correction['scientific_name']
full=full[~full['scientific_name'].isin(species_to_remove)]
print(full.shape)
print(correction.shape)

(1816, 9)
(84, 9)


In [273]:
full=pd.concat([full, correction], sort=False)
print (full.shape)
full.columns

(1900, 9)


Index(['common_name', 'scientific_name', 'url', 'category', 'depth',
       'distribution', 'family', 'images', 'size'],
      dtype='object')

In [274]:
full=full[['common_name', 'scientific_name', 'url', 'category', 'depth', 'distribution', 'images', 'size']]
full.to_csv('full.csv', index=False)

count=0
categories=full['category'].unique()
for item in categories:
    count += len(full[full['category']==item])
    print(item, len(full[full['category']==item]), count)

Large Oval Fish 116 116
Small Oval Fish 75 191
Silvery Fish 46 237
Groupers and Seabasses 56 293
Grunts and Snappers 52 345
Parrotfishes and Wrasses 103 448
Red Fish 48 496
Gobies and Blennies 106 602
Crevice, Sand, and Burrow Dwellers 31 633
Bottom Dwellers 67 700
Odd Shaped Swimmers 105 805
Seahorses and Pipefish 27 832
Eels 45 877
Sharks and Rays 49 926
Crustaceans 195 1121
Worms 54 1175
Sea Slugs 129 1304
Gastropods 47 1351
Cephalopods 29 1380
Bivalves and Chitons 28 1408
Cnidarians 52 1460
Echinoderms 103 1563
Tunicates and Bryozoans 34 1597
Sponges 66 1663
Coral 116 1779
Algae 21 1800
Marine Reptiles and Mammals 48 1848
Sea Birds 26 1874
Ocean Fishes 26 1900


# LOAD IMAGES DICT INTO MONGO

In [275]:
#Create Connection to MongoDB
conn="mongodb://localhost:27017"
client=pymongo.MongoClient(conn)

In [276]:
#create MarineBuddy database on MongoDB
MarineBuddy = client["MarineBuddy"]

In [277]:
#creating an image dictionary
images=full[['common_name', 'scientific_name', 'category', 'images']]
images_dict=images.set_index('common_name').transpose().to_dict(orient='dict')

In [278]:
#create an images collection and load image_dict into collection
Images=MarineBuddy['images']
Images.update({}, images_dict, upsert=True)

  This is separate from the ipykernel package so we can avoid doing imports until


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [279]:
full.shape

(1900, 8)

# MODELING FOR SQL

In [281]:
#Species Table
full.head()

species_table=full[['common_name', 'scientific_name', '']]

Unnamed: 0,common_name,scientific_name,url,category,depth,distribution,images,size
0,Moorish Idol,Zanclus cornutus,https://reefguide.org/moorishidol.html,Large Oval Fish,10-300 ft. (3-90 m),"Indo-Pacific, Hawaii, Gulf of California, Paci...",[https://reefguide.org/pix/thumb2/moorishidol4...,5 to 8 in. (13 to 20 cm)
1,Longsnout Butterflyfish,Prognathodes aculeatus,https://reefguide.org/longsnoutbutter.html,Large Oval Fish,30-200 ft. (10-60 m),"Caribbean, Bahamas, Florida, Bermuda, Gulf of ...",[https://reefguide.org/pix/thumb2/longsnoutbut...,2 to 3 in. (5 to 8 cm)
2,Orange-Banded Coralfish,Coradion chrysozonus,https://reefguide.org/coradionchrysozonus.html,Large Oval Fish,10-180 ft. (3-55 m),"Indo-West Pacific, Great Barrier Reef",[https://reefguide.org/pix/thumb2/coradionchry...,Up to 6 in. (15 cm)
3,Two-Eyed Coralfish,Coradion melanopus,https://reefguide.org/coradionmelanopus.html,Large Oval Fish,30-100 ft. (10-30 m),Asian Pacific,[https://reefguide.org/pix/coradionmelanopus1....,Up to 6 in. (15 cm)
4,Humphead Bannerfish,Heniochus varius,https://reefguide.org/humpheadbannerfish.html,Large Oval Fish,6-100 ft. (2-30 m),"West Pacific, Indonesia, French Polynesia",[https://reefguide.org/pix/thumb2/humpheadbann...,5 to 8 in. (13 to 20 cm)
