## Protectora Barcelona

Find the list of adoptable dogs of the Protectora de Barcelona (https://www.protectorabcn.es/).

## 0. Import libraries

In [1]:
import requests 
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import re
import time

import os
from urllib.parse import urlparse

# Options for DataFrame visualization:
pd.set_option('display.width', 0)
pd.set_option('display.max_columns', 40)

## 1. Download and display the content of robots.txt

In [2]:
def robot_txt(timeout=10):
    """Download the site's robots.txt and collect its Allow/Disallow paths.

    A two-line banner is printed, then the rules are returned. Rules are
    collected from the whole file, regardless of which ``User-agent`` group
    they belong to.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: ``{'DISALLOWED': [...], 'ALLOWED': [...]}`` holding the first
        token of every ``Disallow`` / ``Allow`` directive, in file order.
        Directives with an empty value (a bare ``Disallow:``) are skipped.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get('https://www.protectorabcn.es/robots.txt', timeout=timeout)
    response.raise_for_status()
    print('robots.txt for https://www.protectorabcn.es/')
    print('============================================')
    result_data_set = {'DISALLOWED': [], 'ALLOWED': []}
    # Directive names are matched case-insensitively.
    buckets = {'allow': 'ALLOWED', 'disallow': 'DISALLOWED'}

    # splitlines() copes with both LF and CRLF line endings.
    for line in response.text.splitlines():
        # partition() never raises, even when the line has no ':' at all.
        directive, _, value = line.partition(':')
        bucket = buckets.get(directive.strip().lower())
        if bucket is None:
            continue
        # Drop any trailing '# comment' and keep only the path token.
        tokens = value.split('#', 1)[0].split()
        if tokens:
            result_data_set[bucket].append(tokens[0])

    return result_data_set
robot_txt()

robots.txt for https://www.protectorabcn.es/


{'DISALLOWED': ['/*?orderby=',
  '/*?orderway=',
  '/*?tag=',
  '/*?id_currency=',
  '/*?search_query=',
  '/*?back=',
  '/*?n=',
  '/*&orderby=',
  '/*&orderway=',
  '/*&tag=',
  '/*&id_currency=',
  '/*&search_query=',
  '/*&back=',
  '/*&n=',
  '/*controller=addresses',
  '/*controller=address',
  '/*controller=authentication',
  '/*controller=cart',
  '/*controller=discount',
  '/*controller=footer',
  '/*controller=get-file',
  '/*controller=header',
  '/*controller=history',
  '/*controller=identity',
  '/*controller=images.inc',
  '/*controller=init',
  '/*controller=my-account',
  '/*controller=order',
  '/*controller=order-opc',
  '/*controller=order-slip',
  '/*controller=order-detail',
  '/*controller=order-follow',
  '/*controller=order-return',
  '/*controller=order-confirmation',
  '/*controller=pagination',
  '/*controller=password',
  '/*controller=pdf-invoice',
  '/*controller=pdf-order-return',
  '/*controller=pdf-order-slip',
  '/*controller=product-sort',
  '/*contr

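The page I want to scrape (https://www.protectorabcn.es/5-perros-en-adopcion) is not restricted. Note, however, that URLs carrying an `n=` query parameter (`/*?n=`, `/*&n=`) appear in the disallowed list above.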

## 2. Create a random User Agent generator

In [3]:
def get_random_ua():
    """Return a random User-Agent string read from ``ua_file.txt``.

    The file is looked up in the current working directory and is expected
    to hold one User-Agent per line. Blank lines are ignored and surrounding
    whitespace (including the trailing newline) is stripped.

    Returns:
        str: one of the lines of the file, chosen uniformly at random, or
        ``''`` when the file is missing, unreadable or has no usable line.
    """
    ua_file = 'ua_file.txt'
    try:
        with open(ua_file) as f:
            lines = [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as ex:
        # Best effort: the caller falls back to an empty User-Agent.
        print('Exception in random_ua')
        print(str(ex))
        return ''

    if not lines:
        return ''

    # A fresh RandomState keeps the choice independent of any global seed.
    # randint(n) draws from [0, n), so every line (including the last one,
    # and the only one in a single-line file) can be selected.
    prng = np.random.RandomState()
    return lines[int(prng.randint(len(lines)))]

# 2. Web scraping

## 2.1 Main page dogs' adoptions

Find the number of adoptable dogs available

In [4]:
def total_adoptable_dogs():
    """Return the number of dogs currently listed for adoption.

    Fetches the adoption listing page and reads the first whole number found
    in the text of its ``<span class="heading-counter">`` element.

    Returns:
        int: the number of adoptable dogs announced by the page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the counter element is missing or holds no number.
    """
    url = 'https://www.protectorabcn.es/5-perros-en-adopcion'
    user_agent = get_random_ua().strip()
    # The header name is 'User-Agent', and it must be passed via `headers=`:
    # the second positional argument of requests.get() is `params`, which
    # would append the value to the query string instead.
    headers = {'User-Agent': user_agent} if user_agent else {}

    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = bs(response.text, 'lxml')

    counter = soup.find('span', class_='heading-counter')
    if counter is None:
        raise ValueError(f'No "heading-counter" element found at {url}')

    for token in counter.text.split():
        if token.isdigit():
            return int(token)

    raise ValueError(f'No number found in counter text: {counter.text!r}')

In [5]:
# Report the number of dogs announced by the listing page (performs a request).
print (f'At the moment, there are {total_adoptable_dogs()} dogs available for adoption')

At the moment, there are 37 dogs available for adoption


Find the list of adoptable dogs available, with their descriptions and photos

In [6]:
def _split_dog_href(href):
    """Split a dog page link into ``(code, slug)``.

    For ``.../perros-en-adopcion/78-nerea.html`` this returns
    ``('78', 'nerea')``. Everything after the first hyphen is kept, so
    hyphenated slugs stay whole and digits inside the slug do not leak
    into the code.
    """
    stem = os.path.basename(urlparse(href).path)
    if stem.endswith('.html'):
        stem = stem[:-len('.html')]
    code, _, slug = stem.partition('-')
    return code, slug


def list_adoptable_dogs():
    """Scrape the adoption listing into a DataFrame, one row per dog.

    The listing is requested with ``n`` set to the total number of dogs
    (see ``total_adoptable_dogs``) so that all of them come back on one page.

    Returns:
        pandas.DataFrame: columns ``Nombre`` (title of the dog's link),
        ``Caracteristicas`` (text of the ``product-reference`` span),
        ``Foto`` (``src`` of the first image), ``URL`` (name slug of the
        dog's page) and ``Codigo`` (leading numeric id of the dog's page).
        The columns are present even when no dog is found.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # NOTE(review): the robots.txt output shown earlier in this notebook lists
    # '/*&n=' as disallowed, and this URL matches it - confirm that using the
    # `n` parameter is acceptable, or switch to paging through `?p=`.
    url = f'https://www.protectorabcn.es/5-perros-en-adopcion?id_category=5&n={total_adoptable_dogs()}'

    user_agent = get_random_ua().strip()
    # Send the User-Agent as a real header; passing the dict positionally
    # would make requests treat it as query-string parameters.
    headers = {'User-Agent': user_agent} if user_agent else {}

    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = bs(response.text, 'lxml')

    adoptable_dogs = []
    for dog in soup.find_all('div', class_='product-container'):
        code, slug = _split_dog_href(dog.find('a')['href'])
        adoptable_dogs.append({
            'Nombre': dog.find('h5').find('a')['title'],
            'Caracteristicas': dog.find('span', class_='product-reference').text,
            'Foto': dog.find('img').get('src'),
            'URL': slug,
            'Codigo': code,
        })

    # Explicit columns keep the schema stable when the listing is empty.
    return pd.DataFrame(
        adoptable_dogs,
        columns=['Nombre', 'Caracteristicas', 'Foto', 'URL', 'Codigo'],
    )

In [7]:
# url = 'https://www.protectorabcn.es/perros-en-adopcion/78-nerea.html'

# user_agent = get_random_ua()
# headers = {'user_agent':user_agent}

# html = requests.get(url, headers).text
# soup = bs(html, 'lxml')


# code = soup.find('div', class_='product-image-container').find('a')['href']
# codes = [code.find('a')['href'] for code in soup.find_all('div', class_='product-container')]
# codigo = ''.join([c for c in codes if c.isdigit()])
# codes
# nombre_perro = code.rfind(r'\d+')
# # codigo
# nombre_perro


# import os
# from urllib.parse import urlparse

# # url = "http://photographs.500px.com/kyle/09-09-201315-47-571378756077.jpg"
# a = urlparse(url)
# print(a)                    # Output: /kyle/09-09-201315-47-571378756077.jpg
# print(os.path.basename(a.path).replace('.html','').split("-")[1])

In [8]:
# Scrape the listing page (one row per dog) and display the resulting DataFrame.
df_dogs1 = list_adoptable_dogs()
df_dogs1

Unnamed: 0,Nombre,Caracteristicas,Foto,URL,Codigo
0,Nerea,Espectacular,https://server03.protectorabcn.es/7428-home_de...,nerea,78
1,Gina,Curiosa y elegante,https://server03.protectorabcn.es/7720-home_de...,gina,39
2,Horus,Una familia con dedicación,https://server01.protectorabcn.es/8345-home_de...,base,997
3,Muñeca,Preparada para volver a empezar,https://server01.protectorabcn.es/6424-home_de...,muneca,1291
4,Nut,¡¡ RESERVADO !!,https://server03.protectorabcn.es/8098-home_de...,nut,1362
5,Goliath,Necesita un hogar urgente,https://server03.protectorabcn.es/8344-home_de...,goliath,1481
6,Bruno,Una persona que lo entienda,https://server03.protectorabcn.es/8682-home_de...,bruno,1054
7,Soap,Buscando el amor,https://server03.protectorabcn.es/6761-home_de...,soap,1497
8,Skaner,Movido y alegre,https://server02.protectorabcn.es/7978-home_de...,skaner,1511
9,Riki,Noble y tierno,https://server01.protectorabcn.es/6930-home_de...,riki,1531


In [9]:
# adoptable_dogs = df_dogs1["Nombre"].tolist()
# adoptable_dogs

In [10]:
# URL_dogs = df_dogs1["URL"].tolist()
# URL_dogs

In [11]:
# code_dogs = df_dogs1["Codigo"].tolist()
# code_dogs

In [12]:
# adoptable_dogs = []

# for page in range(1,3):
#     url = f'https://www.protectorabcn.es/5-perros-en-adopcion?p={page}'
#     html = requests.get(url).text
#     soup = bs(html, 'html.parser')
#     dogs = soup.findAll('div', class_='product-container')


#     for dog in dogs:
#         adoptable_dogs.append ({
#             'name': dog.find('h5').find('a')['title'],
#             'description': dog.find('span', class_='product-reference').text,
#             'foto': dog.find('img').get('src')
#         })
    
# adoptable_dogs_df = pd.DataFrame(adoptable_dogs)
# adoptable_dogs_df

## 2.2 Adoptable dogs' pages

Create a random sleep interval generator

In [13]:
def delay():
    """Draw a random sleep interval: an even number from 0 to 18 inclusive."""
    candidates = list(range(0, 20, 2))
    return np.random.choice(candidates)

In [14]:
# Example draw from the random sleep-interval generator.
delay()

6

Extract the names, URL slugs and codes from the main page, to build the URL of each individual dog's page

In [15]:
# Pull the three columns needed to rebuild each dog's page URL into plain lists.
adoptable_dogs, URL_dogs, code_dogs = (
    df_dogs1[column].tolist() for column in ("Nombre", "URL", "Codigo")
)

# One row per list (names, slugs, codes) and one column per dog.
df_codes = pd.DataFrame((adoptable_dogs, URL_dogs, code_dogs))
df_codes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,Nerea,Gina,Horus,Muñeca,Nut,Goliath,Bruno,Soap,Skaner,Riki,Pepa,Tay,Basil,Fusterito,Tona,Joao,Toby,Neo,Simba,Tizón,Thor,Blai,Lluna,Osiris,Yakeline,Maggie,Tana,Livi,Aura,Maya,Chuche,Olivia,Cotó,Marley,Azabache,Wida,Yara
1,nerea,gina,base,muneca,nut,goliath,bruno,soap,skaner,riki,pepa,tay,basil,fusterito,tona,base,toby,neo,simba,tizon,thor,blai,lluna,osiris,yakeline,maggie,tana,livi,aura,maya,chuche,olivia,coto,marley,azabache,wida,yara
2,78,39,997,1291,1362,1481,1054,1497,1511,1531,1592,1353,1632,1173,1120,1677,1467,1744,1771,1772,1784,1825,1831,1653,1834,1840,1841,1842,1843,1844,1845,1846,1847,1849,1850,1851,1852


Loop that builds (and prints) the URL of every dog's page

In [16]:
# Pair every URL slug with its numeric code and print the resulting page URL.
for name, code in zip(URL_dogs, code_dogs):

    # Build the URL for each dog: <base>/<code>-<slug>.html
    base_url = 'https://www.protectorabcn.es/perros-en-adopcion/'
    url = base_url + code + '-' + name +'.html'
    print(url)


    # Pause between iterations.
    # NOTE(review): no request is made inside this loop, so this only slows
    # down the printing; delay() is defined above but not used here - confirm
    # whether the fetch of `url` is meant to happen at this point.
    time.sleep(2)

https://www.protectorabcn.es/perros-en-adopcion/78-nerea.html
https://www.protectorabcn.es/perros-en-adopcion/39-gina.html
https://www.protectorabcn.es/perros-en-adopcion/997-base.html
https://www.protectorabcn.es/perros-en-adopcion/1291-muneca.html
https://www.protectorabcn.es/perros-en-adopcion/1362-nut.html
https://www.protectorabcn.es/perros-en-adopcion/1481-goliath.html
https://www.protectorabcn.es/perros-en-adopcion/1054-bruno.html
https://www.protectorabcn.es/perros-en-adopcion/1497-soap.html
https://www.protectorabcn.es/perros-en-adopcion/1511-skaner.html
https://www.protectorabcn.es/perros-en-adopcion/1531-riki.html
https://www.protectorabcn.es/perros-en-adopcion/1592-pepa.html
https://www.protectorabcn.es/perros-en-adopcion/1353-tay.html
https://www.protectorabcn.es/perros-en-adopcion/1632-basil.html
https://www.protectorabcn.es/perros-en-adopcion/1173-fusterito.html
https://www.protectorabcn.es/perros-en-adopcion/1120-tona.html
https://www.protectorabcn.es/perros-en-adopcion

Test with the page of the first dog in the list

In [17]:
# url = f'https://www.protectorabcn.es/perros-en-adopcion/78-nerea.html'
# user_agent = get_random_ua()
# headers = {'user_agent':user_agent}
# html = requests.get(url, headers).text
# soup = bs(html, 'lxml')

In [18]:
# dog's name
# name = soup.find('div', class_='product-title').find('h1').text
# name

In [19]:
# past = [descr.text for descr in soup.find('div', dir='ltr')]
# past 

In [20]:
# future = [descr.text.replace('Si estás interesado en conocerla puedes enviarnos un email a info@protectorabcn.es\xa0','') for descr in soup.find_all('div', dir='ltr')[1]]
# future

In [21]:
# information = soup.find_all('div', dir='ltr')
# general_info = []
   
# name = soup.find('div', class_='product-title').find('h1').text
# past = information[0].text
# future = information[1].text.replace('Si estás interesado en conocerla puedes enviarnos un email a info@protectorabcn.es\xa0','') 
# general_info.append((name, past,future))

# df1 =pd.DataFrame(general_info, columns =['Nombre', 'Pasado', 'Futuro'])
# df1

In [22]:
# # dog's table
# tables = soup.find_all('table')
# tables
# table = tables[0]
# tab_data = [[cell.text for cell in row.find_all(["th","td"])]
#                         for row in table.find_all("tr")]
# tab_data

# df2 = pd.DataFrame(tab_data).T
# df2.columns = df2.iloc[0,:]
# df2.drop(index=0,inplace=True)
# df2.reset_index(drop = True, inplace = True)
# df2

In [23]:
# df_tot = pd.concat([df1,df2], axis=1, sort=False)
# df_tot = df_tot[['Nombre', 'Animal', 'Sexo', 'Medidas', 'Edad', 'Color', 'Raza', 'Nacimiento', 'Entrada', 'Pasado', 'Futuro', 'Estado']]
# df_tot

In [24]:
def dog_scraper(url):
    """Scrape a single dog's page into a one-row DataFrame.

    The name comes from the ``<h1>`` inside ``<div class="product-title">``,
    the two free-text fields from the first two ``<div dir="ltr">`` elements,
    and the remaining fields from the first ``<table>`` of the page, read as
    label/value pairs (first two cells of each row).

    Args:
        url: Address of the dog's page.

    Returns:
        pandas.DataFrame: a single row with the columns ``Nombre``,
        ``Animal``, ``Sexo``, ``Medidas``, ``Edad``, ``Color``, ``Raza``,
        ``Nacimiento``, ``Entrada``, ``Pasado``, ``Futuro`` and ``Estado``.
        A field absent from the page's table is left as NaN.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page lacks the two text blocks or the data table.
    """
    user_agent = get_random_ua().strip()
    # Send the User-Agent as a real header; passing the dict positionally
    # would make requests treat it as query-string parameters.
    headers = {'User-Agent': user_agent} if user_agent else {}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = bs(response.text, 'lxml')

    # Free-text information: name, past and future of the dog.
    information = soup.find_all('div', dir='ltr')
    if len(information) < 2:
        raise ValueError(f'Expected two <div dir="ltr"> text blocks at {url}')

    name = soup.find('div', class_='product-title').find('h1').text
    past = information[0].text
    # Remove the contact boilerplate. The pattern still matches the exact
    # original sentence; it additionally accepts "conocerlo" (presumably the
    # wording used for male dogs - verify on the site) and a missing
    # trailing non-breaking space.
    future = re.sub(
        r'Si estás interesado en conocerl[ao] puedes enviarnos un email a info@protectorabcn\.es\xa0?',
        '',
        information[1].text,
    )

    # Label/value pairs from the first table of the page.
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f'No data table found at {url}')
    tab_data = [[cell.text for cell in row.find_all(["th", "td"])]
                for row in tables[0].find_all("tr")]
    record = {cells[0]: cells[1] for cells in tab_data if len(cells) >= 2}
    record.update({'Nombre': name, 'Pasado': past, 'Futuro': future})

    # reindex() fixes the column order and tolerates missing table fields.
    columns = ['Nombre', 'Animal', 'Sexo', 'Medidas', 'Edad', 'Color', 'Raza',
               'Nacimiento', 'Entrada', 'Pasado', 'Futuro', 'Estado']
    df_dogs2 = pd.DataFrame([record]).reindex(columns=columns)
    return df_dogs2

In [25]:
# Try the scraper on a single dog's page.
url_nerea = 'https://www.protectorabcn.es/perros-en-adopcion/78-nerea.html'
dog_scraper(url_nerea)

Unnamed: 0,Nombre,Animal,Sexo,Medidas,Edad,Color,Raza,Nacimiento,Entrada,Pasado,Futuro,Estado
0,Nerea,Perros,Hembra,Mediano,Adulto,Marrón claro,Mestizo,2010,02-2014,Nerea fue rescatada de un síndrome de Noé hace...,"Debido a su pasado, Nerea necesita estar acomp...",En Adopción
