In [27]:
 # Dependencies
from bs4 import BeautifulSoup as bs
import requests
import pymongo
import pandas as pd
from splinter import Browser
import time
from sqlalchemy import create_engine
from config import user
from config import pw

In [2]:
# Keeping pandas from truncating long strings
#source: https://coding-stream-of-consciousness.com/2020/04/17/python-pandas-stop-truncating-strings/
pd.set_option('display.max_colwidth', -1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# URL of page to be scraped
url = 'https://dogtime.com/dog-breeds'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response.text, 'lxml')

In [4]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='list-item')


In [5]:
# loop over results to get article data

dog_breed_list = []
dog_links = []
for result in results:
    # scrape the dog types 
    dog_type = result.find('a', class_='list-item-title').text
    dog_breed_list.append(dog_type)
    
    #scrape the link
    link = result.a['href']
    dog_links.append(link)

In [6]:
#zip dog list and dog link lists together
combined_lists = list(zip(dog_breed_list, dog_links))


In [34]:
#make combined_lists into a dataframe
dog_info_df = pd.DataFrame(combined_lists)

In [35]:
#rename columns 
dog_info_df = dog_info_df.rename(columns = {0:"breed", 1:"link"})
dog_info_df

Unnamed: 0,breed,link
0,Afador,https://dogtime.com/dog-breeds/afador
1,Affenhuahua,https://dogtime.com/dog-breeds/affenhuahua
2,Affenpinscher,https://dogtime.com/dog-breeds/affenpinscher
3,Afghan Hound,https://dogtime.com/dog-breeds/afghan-hound
4,Airedale Terrier,https://dogtime.com/dog-breeds/airedale-terrier
...,...,...
372,Whoodle,https://dogtime.com/dog-breeds/whoodle
373,Wirehaired Pointing Griffon,https://dogtime.com/dog-breeds/wirehaired-pointing-griffon
374,Xoloitzcuintli,https://dogtime.com/dog-breeds/xoloitzuintli
375,Yorkipoo,https://dogtime.com/dog-breeds/yorkipoo


In [9]:
#output dataframe as csv file to load into SQL/relational database
dog_info_df.to_csv("data/dog_breed_links.csv", header=True, index = False)

In [10]:
#OPTIONAL STEP (see README section Extract)
#read csv of petfinder dogs
petfinder_dogs_df = pd.read_csv('data/petfinder_dogs.csv')
petfinder_dogs_df

Unnamed: 0,city,state,pet_id,name,breed
0,Andover,MA,48752074,Jade,Redbone Coonhound
1,New York,NY,48752069,Wish,Labrador Retriever
2,TULSA,OK,48752062,Molly,Shiba Inu
3,White Plains,NY,48751967,Brindi,Retriever
4,Wasco,CA,48751842,(FRECKLES) AVAILABLE AUG 24 2020,Greyhound
...,...,...,...,...,...
490,Dallas,TX,48750352,1105763,Pit Bull Terrier
491,Santa Fe,NM,48750344,ZUMA,Mixed Breed
492,Santa Cruz,CA,48750345,MUPPET*,Maltese
493,Santa Fe,NM,48750342,JEREMIAH,Mixed Breed


In [11]:
#get list of dogs from petfinder csv
#source: https://stackoverflow.com/questions/22341271/get-list-from-pandas-dataframe-column
petfinder_dog_list = petfinder_dogs_df['breed'].tolist()

In [12]:
search_list = []
for dog in petfinder_dog_list:
    link = dog_info_df.loc[dog_info_df['breed']== dog, 'link' ]
    link = str(link)
    if link != 'Series([], Name: Link, dtype: object)':
        link = link.split(' ')
        link = link[4]
        link = link.split('\n')
        search_list.append(link[0])
        

In [13]:
#removing duplicates
#source: https://www.geeksforgeeks.org/python-set-method/#:~:text=set()%20method%20is%20used,dintinct%20elements%2C%20commonly%20called%20Set.&text=Parameters%20%3A%20Any%20iterable%20sequence%20like,modified%20as%20passed%20as%20argument.
search_list = list(set(search_list)) 
search_list

['https://dogtime.com/dog-breeds/boxer',
 'https://dogtime.com/dog-breeds/dachshund',
 'https://dogtime.com/dog-breeds/yorkshire-terrier',
 'https://dogtime.com/dog-breeds/miniature-pinscher',
 'https://dogtime.com/dog-breeds/manchester-terrier',
 'https://dogtime.com/dog-breeds/maltese',
 'https://dogtime.com/dog-breeds/norwegian-elkhound',
 'https://dogtime.com/dog-breeds/siberian-husky',
 'https://dogtime.com/dog-breeds/dutch-shepherd',
 'https://dogtime.com/dog-breeds/australian-kelpie',
 'https://dogtime.com/dog-breeds/golden-retriever',
 'https://dogtime.com/dog-breeds/pomeranian',
 'https://dogtime.com/dog-breeds/schipperke',
 'https://dogtime.com/dog-breeds/greyhound',
 'https://dogtime.com/dog-breeds/beagle',
 'https://dogtime.com/dog-breeds/papillon',
 'https://dogtime.com/dog-breeds/bernese-mountain-dog',
 'https://dogtime.com/dog-breeds/german-shepherd-dog',
 'https://dogtime.com/dog-breeds/german-shorthaired-pointer',
 'https://dogtime.com/dog-breeds/pekingese',
 'https://

In [17]:
#Comment out unneeded code based on your OS
#Mac
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

#Windows
# executable_path = {'executable_path': 'chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)

In [18]:
#intialize dataframe row lists
#source: https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe
rows_list = []

#visit each link and extract rating data
for dog_link in search_list:
    #initialize list
    stars_list = []
    # Visit each dogtime url
    browser.visit(dog_link)
    
     # Scrape page into Soup
    html2 = browser.html
    soup2 = bs(html2, "lxml")
    
    dog_name_finder = soup2.find_all('div', class_= "breeds-single-content")[0]
    #find breed and append to list
    breed = dog_name_finder.find('h1').text
    stars_list.append(breed)
    
    #find area of page with details about star ratings
    check = soup2.find_all('div', class_= "characteristic-stars parent-characteristic")

    #iterate through each category and get rating
    for dog_type in check:
        #find star rating
        star = str(dog_type.find('div'))
        #add star rating to list
        stars_list.append(star)
    #add breed/star list to dataframe rows list
    rows_list.append(stars_list)

    

In [28]:
#close browser
browser.quit()

In [52]:
#create dataframe with all row lists
dog_trait_df = pd.DataFrame(rows_list)   

In [53]:
dog_trait_df

Unnamed: 0,0,1,2,3,4,5
0,Boxer,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
1,Dachshund,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>"
2,Yorkshire Terrier,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
3,Miniature Pinscher,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
4,Manchester Terrier,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
5,Maltese,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-2""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>"
6,Norwegian Elkhound,"<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
7,Siberian Husky,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>"
8,Dutch Shepherd,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"
9,Australian Kelpie,"<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-3""></div></div>","<div class=""characteristic-star-block""><div class=""star star-5""></div></div>","<div class=""characteristic-star-block""><div class=""star star-4""></div></div>"


In [54]:
#clean up ratings columns
#souce: https://datatofish.com/replace-values-pandas-dataframe/
#source: https://thispointer.com/pandas-loop-or-iterate-over-all-or-certain-columns-of-a-dataframe/#:~:text=iteritems()%20i.e.-,DataFrame.iteritems(),and%20column%20contents%20as%20series.&text=As%20there%20were%203%20columns%20so%203%20tuples%20were%20returned%20during%20iteration.
for column in dog_trait_df:
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-1"></div></div>'],1)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-2"></div></div>'],2)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-3"></div></div>'],3)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-4"></div></div>'],4)
    dog_trait_df[column] = dog_trait_df[column].replace(['<div class="characteristic-star-block"><div class="star star-5"></div></div>'],5)

In [55]:
#rename columns
dog_trait_df = dog_trait_df.rename(columns = {0:"breed",
                                             1:"adaptability",
                                             2:"all_around_friendliness",
                                             3:"health_and_grooming_needs",
                                             4:"trainability",
                                             5:"physical_needs"})

In [56]:
dog_trait_df

Unnamed: 0,breed,adaptability,all_around_friendliness,health_and_grooming_needs,trainability,physical_needs
0,Boxer,3,4,3,3,5
1,Dachshund,3,3,3,4,3
2,Yorkshire Terrier,3,3,2,3,5
3,Miniature Pinscher,3,3,3,4,4
4,Manchester Terrier,3,3,3,3,4
5,Maltese,3,4,2,3,3
6,Norwegian Elkhound,4,4,3,4,4
7,Siberian Husky,3,5,3,4,5
8,Dutch Shepherd,3,4,3,3,4
9,Australian Kelpie,3,3,3,5,4


In [25]:
#output dataframe as csv file to load into SQL/relational database
dog_trait_df.to_csv("data/dog_breed_characteristics.csv", header=True)

In [29]:
# create database connection
database_name = 'petfinder_db'
engine = create_engine(f'postgresql://{user}:{pw}@localhost:5432/{database_name}')

In [30]:
# Confirm tables
engine.table_names()

['petfinder_dogs', 'dog_links', 'dog_traits']

In [36]:
# Load data 
dog_info_df.to_sql(name='dog_links', con=engine, if_exists='append', index=False)

In [57]:
dog_trait_df.to_sql(name='dog_traits', con=engine, if_exists='append', index=False)