In [14]:
#Importing modules
import os
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import pymongo

In [15]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [16]:
# Define database and collection
db = client.pets_db
collection = db.items

### Function definition

In [31]:
def get_pages_count(soup):
    '''
    Return the number of pages to visit for APA that contain dog information
    
    Params: soup --> BeautifulSoup object
    
    Return: pet_entry --> number of pages to iterate
    '''
    #Get the pagination division
    pagination = soup.find('div', class_='pagination')
    pages = list()
    for page in pagination.find_all('a',class_='page'):
        pages.append(page.text)
    return(int(pages[-1]))

In [32]:
def get_pet_info_by_url(soup):
    '''
    Parse the dog information into a dictionary to append to Mongo DB
    
    Params: soup --> BeautifulSoup object
    
    Return: pet_entry --> dictionary will all pat information
    '''
    #Main dictionary that will hold the data base entry
    pet_entry = list()
    # Get all dog information
    all_dogs_per_page = soup.find_all('div', class_='large-tile')
    #Parse teh data
    for dog in all_dogs_per_page:
        #Dictonary used to store the data
        pet_data = dict()
        #Get the text based information
        #[-] Name
        pet_name = dog.find('h3').find('a').text
        #[-] Id
        pet_id = dog.find('h6').text
        #[-] Listed information : Age, Sex, Breed
        pet_info = dog.find('ul').find_all("li")
        #[--] Age , index 0
        age_data = pet_info[0].text.split('Months')[0].split('Years')
        #Conver the age in fraction of years
        pet_age = round(int(age_data[0].strip()) + (int(age_data[1].strip()) /12.0),2)
        #[--] Sex , index 1
        pet_sex = pet_info[1].text
        #[--] Breed , index 2
        pet_breed = pet_info[2].text
        #Build the data into the dictionary
        #Get the pet dtats , we need to translate from stars to numbers
        pet_location = pet_info[3].text
        pet_data.update({'pet_name':pet_name})
        pet_data.update({'pet_age':pet_age})
        pet_data.update({'pet_sex':pet_sex})
        pet_data.update({'pet_breed':pet_breed})
        pet_data.update({'pet_location':pet_location})
        #Count the starts to get the data
        possible_stat_list = dog.find_all('div',class_='stats-panel')
        for possible_stat in possible_stat_list:
            stat = possible_stat.find_all('div',class_='flex')
            for category in stat:
                category_name = category.find('span',class_='stats-text').text
                category_score = 0
                stars = category.find_all('img')
                for star in stars:
                    if('star full' == star['alt']):
                         category_score += 1
                #Update the dictionary for each stat
                pet_data.update({category_name:category_score})
        pet_data.update({"pet_id":pet_id})
        #Update the main dictionary with the ID as main key and return
        pet_entry.append({pet_id:pet_data})
    return pet_entry

In [39]:
def get_all_apa_data():
    '''
    Parse all dog information for APA dogs
    
    Params: None
    
    Return: pet_entry --> dictionary with the information
    '''
    #main page for DOG adoption APA
    main_url = url = 'https://www.austinpetsalive.org/adopt/dogs'
    #Base url to add the page to scrape
    bas_url_4_page = 'https://www.austinpetsalive.org/adopt/dogs/p'
    #Dictionary used to store all the data scraped
    pet_data = list()
    #create the browser object
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    #Create the defult list of URLs to vist and add the first page
    url_list = list(main_url)
    #Visit the main page to collect number of pages and the first data scrape
    browser.visit(main_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(browser.html, 'html.parser')
    #Get the number fo pages to scrape besides the main one
    total_pages = get_pages_count(soup)
    #Update the data of the main entry page
    pet_data = pet_data + get_pet_info_by_url(soup)
    #Iterate over the next pages to get the complete data, we start in 2 bacause
    #index 1 is the pain page
    for page_idx in range(2,total_pages+1):
        #Build the new page
        page_url = bas_url_4_page + str(page_idx)
        #Visit the new page
        browser.visit(page_url)
        #parse teh data
        soup = BeautifulSoup(browser.html, 'html.parser')
        #Update the information
        pet_data = pet_data +get_pet_info_by_url(soup)
    #Clos ethe browser
    browser.quit()
    return pet_data
    
    
    
    

###  Scrape the data

In [40]:
#Get the information
apa_data = get_all_apa_data()

In [41]:
#Display the information
apa_data

[{'APA-A-36615': {'pet_name': 'Sparky',
   'pet_age': 6.5,
   'pet_sex': 'Male',
   'pet_breed': 'Terrier, Pit Bull / Mix',
   'pet_location': 'Foster',
   'Dog': 1,
   'Cat': 1,
   'Child': 2,
   'Home  Alone ': 4,
   'pet_id': 'APA-A-36615'}},
 {'APA-A-45785': {'pet_name': 'Rio',
   'pet_age': 3.75,
   'pet_sex': 'Male',
   'pet_breed': 'Mixed Breed (Large) / Great Dane',
   'pet_location': 'TLAC',
   'Dog': 1,
   'Cat': 0,
   'Child': 1,
   'Home  Alone ': 5,
   'pet_id': 'APA-A-45785'}},
 {'APA-A-47726': {'pet_name': 'Twister',
   'pet_age': 5.67,
   'pet_sex': 'Male',
   'pet_breed': 'Shepherd, Anatolian / Mix',
   'pet_location': 'TLAC',
   'Dog': 1,
   'Cat': 2,
   'Child': 1,
   'Home  Alone ': 5,
   'pet_id': 'APA-A-47726'}},
 {'APA-A-49698': {'pet_name': 'Beryl',
   'pet_age': 6.58,
   'pet_sex': 'Male',
   'pet_breed': 'Retriever, Yellow Labrador / Mix',
   'pet_location': 'TLAC',
   'Dog': 2,
   'Cat': 2,
   'Child': 1,
   'Home  Alone ': 4,
   'pet_id': 'APA-A-49698'}},
 {

In [43]:
for pet in apa_data:
    print(pet)
    collection.insert_one(pet)

{'APA-A-36615': {'pet_name': 'Sparky', 'pet_age': 6.5, 'pet_sex': 'Male', 'pet_breed': 'Terrier, Pit Bull / Mix', 'pet_location': 'Foster', 'Dog': 1, 'Cat': 1, 'Child': 2, 'Home  Alone ': 4, 'pet_id': 'APA-A-36615'}}
{'APA-A-45785': {'pet_name': 'Rio', 'pet_age': 3.75, 'pet_sex': 'Male', 'pet_breed': 'Mixed Breed (Large) / Great Dane', 'pet_location': 'TLAC', 'Dog': 1, 'Cat': 0, 'Child': 1, 'Home  Alone ': 5, 'pet_id': 'APA-A-45785'}}
{'APA-A-47726': {'pet_name': 'Twister', 'pet_age': 5.67, 'pet_sex': 'Male', 'pet_breed': 'Shepherd, Anatolian / Mix', 'pet_location': 'TLAC', 'Dog': 1, 'Cat': 2, 'Child': 1, 'Home  Alone ': 5, 'pet_id': 'APA-A-47726'}}
{'APA-A-49698': {'pet_name': 'Beryl', 'pet_age': 6.58, 'pet_sex': 'Male', 'pet_breed': 'Retriever, Yellow Labrador / Mix', 'pet_location': 'TLAC', 'Dog': 2, 'Cat': 2, 'Child': 1, 'Home  Alone ': 4, 'pet_id': 'APA-A-49698'}}
{'APA-A-43905': {'pet_name': 'Brindle', 'pet_age': 5.5, 'pet_sex': 'Male', 'pet_breed': 'Hound / Hound', 'pet_location

{'APA-A-71768': {'pet_name': 'Scott', 'pet_age': 2.42, 'pet_sex': 'Male', 'pet_breed': 'Retriever, Black Labrador / Mixed Breed (Medium)', 'pet_location': 'TLAC', 'Dog': 2, 'Cat': 2, 'Child': 2, 'Home  Alone ': 3, 'pet_id': 'APA-A-71768'}}
{'APA-A-73983': {'pet_name': 'Zephyr', 'pet_age': 6.25, 'pet_sex': 'Male', 'pet_breed': 'Terrier, Bull / Chinese Shar-Pei', 'pet_location': 'TLAC', 'Dog': 2, 'Cat': 0, 'Child': 0, 'Home  Alone ': 4, 'pet_id': 'APA-A-73983'}}
{'APA-A-73981': {'pet_name': 'Sage', 'pet_age': 4.25, 'pet_sex': 'Female', 'pet_breed': 'Bulldog, American', 'pet_location': 'TLAC', 'Dog': 2, 'Cat': 0, 'Child': 0, 'Home  Alone ': 0, 'pet_id': 'APA-A-73981'}}
{'APA-A-73987': {'pet_name': 'Flute', 'pet_age': 1.25, 'pet_sex': 'Female', 'pet_breed': 'Pyrenees, Great / Retriever, Labrador', 'pet_location': 'TLAC', 'Dog': 3, 'Cat': 3, 'Child': 3, 'Home  Alone ': 4, 'pet_id': 'APA-A-73987'}}
{'APA-A-74046': {'pet_name': 'Tempo', 'pet_age': 0.17, 'pet_sex': 'Male', 'pet_breed': 'Retrie

{'APA-A-47378': {'pet_name': 'Abby', 'pet_age': 2.58, 'pet_sex': 'Female', 'pet_breed': 'Shepherd, German / Rottweiler', 'pet_location': 'TLAC', 'Dog': 3, 'Cat': 0, 'Child': 0, 'Home  Alone ': 3, 'pet_id': 'APA-A-47378'}}
{'APA-A-75881': {'pet_name': 'Thor', 'pet_age': 3.0, 'pet_sex': 'Male', 'pet_breed': 'Retriever, Chocolate Labrador', 'pet_location': 'Foster', 'Dog': 3, 'Cat': 0, 'Child': 2, 'Home  Alone ': 4, 'pet_id': 'APA-A-75881'}}
{'APA-A-75888': {'pet_name': 'Kodiak', 'pet_age': 0.25, 'pet_sex': 'Male', 'pet_breed': 'Foxhound, American / Shepherd', 'pet_location': 'Foster', 'Dog': 4, 'Cat': 4, 'Child': 0, 'Home  Alone ': 2, 'pet_id': 'APA-A-75888'}}
{'APA-A-75889': {'pet_name': 'Glacier', 'pet_age': 0.25, 'pet_sex': 'Female', 'pet_breed': 'Foxhound, American / Shepherd', 'pet_location': 'Foster', 'Dog': 4, 'Cat': 4, 'Child': 0, 'Home  Alone ': 2, 'pet_id': 'APA-A-75889'}}
{'APA-A-75890': {'pet_name': 'Gobi', 'pet_age': 0.25, 'pet_sex': 'Female', 'pet_breed': 'Foxhound, America

In [44]:
#Get the number of entries
len(apa_data)

379