In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
# Species list that drives the scrape; the loop below reads its 'ID' and
# 'Species' columns.
SPECIES_CSV = 'FishBase.csv'
df = pd.read_csv(filepath_or_buffer=SPECIES_CSV)

In [4]:
# Scrape FishBase for every species listed in `df` and collect one dict of
# parsed attributes per species in `fishRows`.
#
# Keys are inserted into each dict in a fixed order on purpose: that order
# becomes the column order of a DataFrame built from `fishRows`.

SUMMARY_URL = "https://www.fishbase.de/summary/{}"
REPRO_URL = "https://www.fishbase.de/Reproduction/FishReproSummary.php?ID={}"
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks the run indefinitely


def _fetch_soup(url):
    """Download `url` and return the parsed page, or None if the request failed."""
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as err:
        print("request failed: {} ({})".format(url, err))
        return None
    return BeautifulSoup(html, 'html.parser')


def _section_text(sections, position):
    """Return the stripped text of sections[position], or '' if there is no such section."""
    if position < len(sections):
        return sections[position].text.strip()
    return ''


def _text_after(text, marker, terminators, skip=0, keep_terminator=False, open_ended=False):
    """Return the stripped text that follows `marker`, cut at the nearest terminator.

    text            -- string to search
    marker          -- label that precedes the wanted value
    terminators     -- candidate strings that end the value; the earliest one wins
    skip            -- extra characters to drop right after the marker
    keep_terminator -- include the terminator itself in the result
    open_ended      -- if no terminator follows, return the rest of `text`
                       instead of None

    Returns None when `marker` is absent, or when no terminator follows it and
    `open_ended` is false.
    """
    found = text.find(marker)
    if found == -1:
        return None
    start = found + len(marker) + skip
    hits = [(text.find(tok, start), tok) for tok in terminators]
    # Ignore terminators that do not occur; a raw -1 must never be used as a
    # slice bound or fed to min().
    hits = [(pos, tok) for pos, tok in hits if pos != -1]
    if not hits:
        return text[start:].strip() if open_ended else None
    pos, tok = min(hits)
    stop = pos + len(tok) if keep_terminator else pos
    return text[start:stop].strip()


def _put(record, key, value):
    """Store `value` under `key` unless the value was not found (None)."""
    if value is not None:
        record[key] = value


def _scrape_fish(fId, species):
    """Return the attribute dict for one species, or None if its summary page is unusable.

    fId     -- FishBase ID used to build the page URLs
    species -- species name taken from the input table
    """
    fish = {}
    fish['ID'] = fId
    fish['Species Name'] = species

    soup = _fetch_soup(SUMMARY_URL.format(fId))
    # Pages without a single <p> element are skipped entirely.
    if soup is None or len(soup.find_all('p')) == 0:
        return None

    #----CommonName----
    header = soup.find(class_="sheader2")
    if header is not None:
        fish['Common Name'] = header.text.strip()

    #----AuthorName & Year----
    # The last matching element is taken as the year, everything before it as authors.
    author = [x.text for x in soup.find_all(class_='sheader6 noLinkDesign')]
    if author:
        fish['Authors'] = ', '.join(author[:-1])
        fish['Year'] = author[-1]

    #----Genus----
    if isinstance(species, str) and species.split():
        fish['Genus'] = species.split()[0]

    #----Higher Taxon----
    # Each tag's title attribute becomes the key; tags without a title are ignored.
    for tag in soup.find_all(class_='sciname'):
        if tag.has_attr('title'):
            fish[tag['title']] = tag.text

    # Sections are addressed by position; a missing section is treated as empty text.
    summary = soup.find_all(class_='smallSpace')

    #----Environment,Habitat,Migration----
    env_text = _section_text(summary, 1)
    env_list = env_text.split(';')
    fish['Environment'] = env_list[0].strip()
    if len(env_list) > 1:
        fish['Habitat'] = env_list[1].split('.')[0].strip()
    if len(env_list) > 2:
        fish['Migration'] = env_list[2].split('(Ref.')[0].strip()

    #----Depth Range----
    # skip=1 drops the single character that separates the label from the value.
    _put(fish, 'Depth Range', _text_after(env_text, 'depth range', ('m',), skip=1))

    #----pH Range----
    _put(fish, 'pH', _text_after(env_text, 'pH range:', (';',), open_ended=True))

    #----dH Range----
    # NOTE(review): '.' as a terminator also cuts a value at a decimal point — confirm
    # whether decimal dH values occur on FishBase.
    _put(fish, 'dH', _text_after(env_text, 'dH range:', (';', '.'), open_ended=True))

    #----IUCN Red List Status & Threat----
    boxlist = [x.parent.text.strip().split('(')[0] for x in soup.find_all(class_='box')]
    if boxlist:
        fish['IUCN Status'] = boxlist[0]
    if len(boxlist) > 1:
        fish['Threat Level'] = boxlist[1]

    #----Climate----
    # First match wins, in this order.
    for climate in ("Tropical", "Subtropical", "Temperate", "Polar"):
        if climate in env_text:
            fish['Climate'] = climate
            break

    #----Distribution----
    dist = _section_text(summary, 2).split(':')
    fish['Area'] = dist[0].split('(Ref.')[0].split('.')[0]

    #----Max Length / Max Published Weight / Max Reported Age----
    physChars = _section_text(summary, 3)
    _put(fish, 'Max Length', _text_after(physChars, "Max length :", ("cm",)))
    # The weight keeps its trailing "g" so the unit stays part of the value.
    _put(fish, 'Max Weight',
         _text_after(physChars, "max. published weight: ", ("g",), keep_terminator=True))
    _put(fish, 'Max Age', _text_after(physChars, "max. reported age:", ("year",)))

    #----Mode of Reproduction & Fertilization Type----
    # Best effort: these fields are simply left out when the reproduction page
    # cannot be fetched or does not have the expected rows.
    reproSoup = _fetch_soup(REPRO_URL.format(fId))
    modeTag = reproSoup.find('tr', {"class": ""}) if reproSoup is not None else None
    try:
        fish['Mode of Reproduction'] = list(modeTag.stripped_strings)[1]
    except (AttributeError, IndexError):
        pass
    try:
        fish['Fertilization'] = list(modeTag.next_sibling.next_sibling.stripped_strings)[1]
    except (AttributeError, IndexError):
        pass

    #----Human Uses----
    # Entries look like "name: value" separated by ';'; the lowercased name becomes the key.
    uses = _section_text(summary, 12)
    if uses != '' and uses[:3] != 'FAO':
        for use in uses.split(';'):
            attr = use.split(':')
            if len(attr) > 1:  # ignore fragments that carry no "name: value" pair
                fish[attr[0].strip().lower()] = attr[1].strip()

    #----Image----
    link = soup.find('a', {'style': "text-decoration:none;"})
    picture = link.img if link is not None else None
    if picture is not None and picture.has_attr('src'):
        fish['Image'] = 'https://www.fishbase.de/' + picture['src']

    #----Image Credits----
    caption = soup.find(class_='slabel8')
    if caption is not None:
        picDes = caption.text
        credit_marker = "icture by\r\n\t\t\t\t\t\r\n\t\t\t\t\t"
        picSt = picDes.find(credit_marker)
        if picSt != -1:
            credit_start = picSt + len(credit_marker)
            # The end-of-line search starts one character into the credit text.
            picEnd = picDes.find('\n', credit_start + 1)
            if picEnd == -1:
                picEnd = len(picDes)
            fish['Img Credits'] = picDes[credit_start:picEnd].strip()

    #----Main Reference----
    mainRef = soup.find('span', string='References')
    try:
        fish['Reference'] = mainRef.parent.parent.parent.div.text.strip()
    except AttributeError:
        pass  # heading missing or not nested the way this lookup expects

    #----Environment Section----
    fish['Env Section'] = env_text

    return fish


fishRows = []
# zip() walks the two columns positionally, so this does not depend on the
# DataFrame having a default 0..n-1 index.
for idx, (fId, species) in enumerate(zip(df['ID'], df['Species'])):
    print(idx, fId)
    fish = _scrape_fish(fId, species)
    if fish is None:
        continue
    fishRows.append(fish)
    print("-----")

0 11898
-----
1 26221
-----


In [5]:
# One row per scraped species dict collected in `fishRows`.
masterFish = pd.DataFrame(fishRows)

In [None]:
# Write the scraped table as CSV, leaving out the row index.
masterFish.to_csv(path_or_buf="Master.csv", index=False)

In [None]:
# Also keep a pickled copy of the scraped table.
masterFish.to_pickle(path="Master.pkl")