# Crawling data

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from csv import writer
import json
import os
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Index page whose list items link to the individual animal pages.
url = "https://a-z-animals.com/animals/"

# Browser-like User-Agent header, reused for every request made in this notebook.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

# Without a timeout, requests can wait forever on a stalled connection.
res = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'html.parser')

In [3]:
# Collect the link of every animal listed on the index page.
all_branches = soup.find_all("li", {"class": "list-item col-md-4 col-sm-6"})

# A comprehension keeps its loop variables local, so the notebook-level `url`
# (the index page address) is no longer overwritten by the last animal link.
# List items without an <a href=...> are skipped instead of raising.
animals_url = [
    anchor['href']
    for anchor in (branch.find("a") for branch in all_branches)
    if anchor is not None and anchor.has_attr('href')
]

In [4]:
# Persist the collected links, one per line. The explicit encoding makes the
# output independent of the platform's default text encoding.
with open('animals_URL.txt', 'w', encoding='utf-8') as f:
    # A generator expression avoids a module-level loop variable that would
    # overwrite the notebook-level `url`.
    f.writelines(link + '\n' for link in animals_url)
# © https://medium.com/swlh/web-scraping-all-the-links-with-python-fbefa0472753

In [5]:
# Sanity check: number of animal links collected from the index page.
len(animals_url)

2461

In [6]:
# Status labels that extract_data compares (exact match) against the text of
# each "list-unstyled" <ul> on an animal page.
All_status = [
    'Extinct',
    'Critically Endangered',
    'Endangered',
    'Vulnerable',
    'Near Threatened',
    'Conservation Dependent',
    'Least Concern',
    'Data Deficient',
    'Not Evaluated',
    'Not Listed',
]

# Continent / region names that extract_data searches for as substrings of
# the same <ul> texts.
All_Continents = [
    'Africa',
    'Antarctica',
    'Asia',
    'Central-America',
    'Eurasia',
    'Europe',
    'North-America',
    'Ocean',
    'Oceania',
    'South-America',
]

In [7]:
def get_animal_page(animal_url, timeout=30):
    """Download one animal page and return it parsed.

    Args:
        animal_url: URL of the animal page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A BeautifulSoup tree built from the response body with ``html.parser``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status (the latter via ``raise_for_status``).
    """
    response_animal = requests.get(animal_url, headers=headers, timeout=timeout)
    # Surface 4xx/5xx responses as errors rather than parsing an error page
    # into a mostly empty record.
    response_animal.raise_for_status()
    return BeautifulSoup(response_animal.content, "html.parser")

In [8]:
def _paired_facts(soup, tag_class, detail_class):
    """Return (label, value) text pairs from the <dt>/<dd> elements with the given classes."""
    labels = [node.get_text() for node in soup.find_all('dt', attrs={"class": tag_class})]
    values = [node.get_text() for node in soup.find_all('dd', attrs={"class": detail_class})]
    # zip stops at the shorter list, so a page with more <dt> than <dd>
    # elements no longer raises IndexError and loses the whole record.
    return list(zip(labels, values))


def extract_data(soup):
    """Build a flat dict of facts from a parsed animal page.

    Args:
        soup: Parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup tree).

    Returns:
        dict with:
          * ``'Name'``: text of the page's <h1> heading, or None if absent;
          * one entry per <dt>/<dd> pair found in the two fact lists;
          * ``'Endangered'``: the last "list-unstyled" <ul> text that exactly
            equals an entry of ``All_status``, or None if there is none;
          * ``'Continents'``: the first such <ul> text containing any entry of
            ``All_Continents``, or None if there is none.
    """
    animal_details = {}

    #----------------------Animal Name---------------------

    heading = soup.find('h1', attrs={"class": "has-text-align-center has-custom-size text-white"})
    animal_details['Name'] = heading.text if heading is not None else None

    #------------------Animal Details #1 and #2------------------

    # The two fact lists use different CSS classes; later pairs overwrite
    # earlier ones that share a label, as in the original sequential loops.
    animal_details.update(_paired_facts(soup, "col-sm-3 text-md-right", "col-sm-9"))
    animal_details.update(_paired_facts(soup, "col-sm-6 text-md-right", "col-sm-6"))

    #--------------Status (Endangered or NOT)--------------

    list_texts = [node.get_text() for node in soup.find_all('ul', attrs={"class": "list-unstyled"})]
    for text in list_texts:
        if text in All_status:
            animal_details['Endangered'] = text  # last match wins
    # Guarantee the key exists; setdefault leaves any value already stored.
    animal_details.setdefault('Endangered', None)

    #----------------Continents-----------------------------

    # Keep the first list text that mentions any known continent name.
    matches = [text for text in list_texts if any(name in text for name in All_Continents)]
    if matches:
        animal_details['Continents'] = matches[0]
    animal_details.setdefault('Continents', None)

    return animal_details

In [9]:
# One dict per successfully scraped animal page.
all_animals_data = []
# Links whose download or parsing failed, kept so they can be inspected or retried.
animals_ERROR = []

for animal_link in animals_url:
    try:
        animal_soup = get_animal_page(animal_link)
        all_animals_data.append(extract_data(animal_soup))
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt (kernel interrupt) can still stop the crawl.
        animals_ERROR.append(animal_link)
print("Function ended")

Function ended


In [10]:
# Links for which fetching or extracting raised inside the crawl loop.
animals_ERROR

['https://a-z-animals.com/animals/bolivian-anaconda/',
 'https://a-z-animals.com/animals/green-anaconda/',
 'https://a-z-animals.com/animals/yellow-anaconda/']

In [11]:
#Writing the data to txt file
# The list of per-animal dicts is serialised to one JSON string; the file
# therefore contains JSON despite its .txt extension.
json_str = json.dumps(all_animals_data)

# Mode 'w' overwrites any existing animals_data.txt in the working directory.
with open('animals_data.txt', 'w') as f:
    f.write(json_str)    
# © https://medium.com/swlh/web-scraping-all-the-links-with-python-fbefa0472753