In [2]:
 # Dependencies
from bs4 import BeautifulSoup as bs
import requests
import pymongo
import pandas as pd
from splinter import Browser
import time

In [4]:
# Keeping pandas from truncating long strings
#source: https://coding-stream-of-consciousness.com/2020/04/17/python-pandas-stop-truncating-strings/
pd.set_option('display.max_colwidth', -1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# URL of page to be scraped
url = 'https://dogtime.com/dog-breeds'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

In [6]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='list-item')


In [7]:
# loop over results to get article data

dog_breed_list = []
dog_links = []
for result in results:
    # scrape the dog types 
    dog_type = result.find('a', class_='list-item-title').text
    dog_breed_list.append(dog_type)
    
    #scrape the link
    link = result.a['href']
    dog_links.append(link)

In [8]:
#zip dog list and dog link lists together
combined_lists = list(zip(dog_breed_list, dog_links))


In [9]:
#make combined_lists into a dataframe
dog_info_df = pd.DataFrame(combined_lists)

In [10]:
#rename columns 
dog_info_df = dog_info_df.rename(columns = {0:"Breed", 1:"Link"})
dog_info_df

Unnamed: 0,Breed,Link
0,Afador,https://dogtime.com/dog-breeds/afador
1,Affenhuahua,https://dogtime.com/dog-breeds/affenhuahua
2,Affenpinscher,https://dogtime.com/dog-breeds/affenpinscher
3,Afghan Hound,https://dogtime.com/dog-breeds/afghan-hound
4,Airedale Terrier,https://dogtime.com/dog-breeds/airedale-terrier
...,...,...
372,Whoodle,https://dogtime.com/dog-breeds/whoodle
373,Wirehaired Pointing Griffon,https://dogtime.com/dog-breeds/wirehaired-pointing-griffon
374,Xoloitzcuintli,https://dogtime.com/dog-breeds/xoloitzuintli
375,Yorkipoo,https://dogtime.com/dog-breeds/yorkipoo


In [11]:
#output dataframe as csv file to load into SQL/relational database
dog_info_df.to_csv("data/dog_breed_links.csv", header=True, index = False)

In [12]:
#read csv of petfinder dogs
petfinder_dogs_df = pd.read_csv('data/petfinder_dogs.csv')
petfinder_dogs_df

Unnamed: 0,city,state,pet_id,name,breed
0,Andover,MA,48752074,Jade,Redbone Coonhound
1,New York,NY,48752069,Wish,Labrador Retriever
2,TULSA,OK,48752062,Molly,Shiba Inu
3,White Plains,NY,48751967,Brindi,Retriever
4,Wasco,CA,48751842,(FRECKLES) AVAILABLE AUG 24 2020,Greyhound
...,...,...,...,...,...
490,Dallas,TX,48750352,1105763,Pit Bull Terrier
491,Santa Fe,NM,48750344,ZUMA,Mixed Breed
492,Santa Cruz,CA,48750345,MUPPET*,Maltese
493,Santa Fe,NM,48750342,JEREMIAH,Mixed Breed


In [13]:
#get list of dogs from petfinder csv
#source: https://stackoverflow.com/questions/22341271/get-list-from-pandas-dataframe-column
petfinder_dog_list = petfinder_dogs_df['breed'].tolist()

In [14]:
search_list = []
for dog in petfinder_dog_list:
    link = dog_info_df.loc[dog_info_df['Breed']== dog, 'Link' ]
    link = str(link)
    if link != 'Series([], Name: Link, dtype: object)':
        link = link.split(' ')
        link = link[4]
        link = link.split('\n')
        search_list.append(link[0])
        

In [15]:
#removing duplicates
#source: https://www.geeksforgeeks.org/python-set-method/#:~:text=set()%20method%20is%20used,dintinct%20elements%2C%20commonly%20called%20Set.&text=Parameters%20%3A%20Any%20iterable%20sequence%20like,modified%20as%20passed%20as%20argument.
search_list = list(set(search_list)) 
search_list

['https://dogtime.com/dog-breeds/manchester-terrier',
 'https://dogtime.com/dog-breeds/maltese',
 'https://dogtime.com/dog-breeds/catahoula-leopard-dog',
 'https://dogtime.com/dog-breeds/german-shorthaired-pointer',
 'https://dogtime.com/dog-breeds/boston-terrier',
 'https://dogtime.com/dog-breeds/weimaraner',
 'https://dogtime.com/dog-breeds/akita',
 'https://dogtime.com/dog-breeds/german-shepherd-dog',
 'https://dogtime.com/dog-breeds/rat-terrier',
 'https://dogtime.com/dog-breeds/cocker-spaniel',
 'https://dogtime.com/dog-breeds/great-dane',
 'https://dogtime.com/dog-breeds/shetland-sheepdog',
 'https://dogtime.com/dog-breeds/australian-cattle-dog',
 'https://dogtime.com/dog-breeds/shiba-inu',
 'https://dogtime.com/dog-breeds/rhodesian-ridgeback',
 'https://dogtime.com/dog-breeds/jack-russell-terrier',
 'https://dogtime.com/dog-breeds/pomeranian',
 'https://dogtime.com/dog-breeds/great-pyrenees',
 'https://dogtime.com/dog-breeds/australian-shepherd',
 'https://dogtime.com/dog-breeds

In [16]:
executable_path = {'executable_path': 'chromedriver.exe'}


In [17]:
#intialize dataframe row lists
#source: https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe
rows_list = []
#open browser
browser = Browser('chrome', **executable_path, headless=False)

#visit each link and extract rating data
for dog_link in search_list:
    #initialize list
    stars_list = []
    # Visit each dogtime url
    browser.visit(dog_link)
    
     # Scrape page into Soup
    html2 = browser.html
    soup2 = bs(html2, "lxml")
    
    dog_name_finder = soup2.find_all('div', class_= "breeds-single-content")[0]
    #find breed and append to list
    breed = dog_name_finder.find('h1').text
    stars_list.append(breed)
    
    #find area of page with details about star ratings
    check = soup2.find_all('div', class_= "characteristic-stars parent-characteristic")

    #iterate through each category and get rating
    for dog_type in check:
        #find star rating
        star = str(dog_type.find('div'))
        #add star rating to list
        stars_list.append(star)
    #add breed/star list to dataframe rows list
    rows_list.append(stars_list)

    

In [18]:
#create dataframe with all row lists
dog_trait_df = pd.DataFrame(rows_list)   

In [19]:
dog_trait_df

Unnamed: 0,0,1,2,3,4,5
0,Manchester Terrier,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
1,Maltese,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>"
2,Catahoula Leopard Dog,"<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
3,German Shorthaired Pointer,"<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
4,Boston Terrier,"<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
5,Weimaraner,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
6,Akita,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
7,German Shepherd Dog,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
8,Rat Terrier,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
9,Cocker Spaniel,"<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>"


In [20]:
#clean up ratings columns
#souce: https://datatofish.com/replace-values-pandas-dataframe/
#source: https://thispointer.com/pandas-loop-or-iterate-over-all-or-certain-columns-of-a-dataframe/#:~:text=iteritems()%20i.e.-,DataFrame.iteritems(),and%20column%20contents%20as%20series.&text=As%20there%20were%203%20columns%20so%203%20tuples%20were%20returned%20during%20iteration.
for column in dog_trait_df:
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-1"></div></div>'],1)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-2"></div></div>'],2)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-3"></div></div>'],3)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-4"></div></div>'],4)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-5"></div></div>'],5)

In [21]:
#rename columns
dog_trait_df = dog_trait_df.rename(columns = {0:"Breed",
                                             1:"Adaptability",
                                             2:"All_Around_Friendliness",
                                             3:"Health_and_Grooming_Needs",
                                             4:"Trainability",
                                             5:"Physical_Needs"})

In [22]:
#set breed as index
dog_trait_df.set_index('Breed', inplace = True)

In [23]:
dog_trait_df

Unnamed: 0_level_0,Adaptability,All_Around_Friendliness,Health_and_Grooming_Needs,Trainability,Physical_Needs
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Manchester Terrier,3,3,3,3,4
Maltese,3,4,2,3,3
Catahoula Leopard Dog,2,3,3,3,4
German Shorthaired Pointer,2,4,3,5,5
Boston Terrier,4,5,3,3,5
Weimaraner,3,4,4,4,5
Akita,3,2,4,4,4
German Shepherd Dog,3,4,4,4,5
Rat Terrier,3,4,3,4,5
Cocker Spaniel,4,4,3,4,3


In [24]:
#output dataframe as csv file to load into SQL/relational database
dog_trait_df.to_csv("data/dog_breed_characteristics.csv", header=True)