In [10]:
import os
import requests
import time
import pandas as pd

from random import randrange
from scrapy.selector import Selector
from tqdm import tqdm

In [11]:
def pull_breed_name(folders):
    """
    Extracts the dog breed name from the folder name
    of the dog imageset.
    """

    breeds = []
    
    for i, f in enumerate(folders):
        name = f[10:].replace('_', ' ').lower()
        breeds.append(name)
    
    return breeds   




def scrape_features(item, features):
    '''
    Searches Wikipedia's "info box" for the item passed
    and attempts to collect the information specified 
    in the feature list.
    
    Parameters
    ----------
    item : str
        Item to search Wikipedia for. 
    features : list (str)
        Information to pull from the Wikipedia info box.
    
    Returns
    -------
    python dictionary : dict[item] = features
        Returns a python dictionary where the key is the item
        passed and the values are the features found on Wikipedia.
        A zero is entered if no value is found.
    '''
    
    data = {}
    
    url = "https://en.wikipedia.org/wiki/{}".format(item)
    req = requests.get(url)
    
    # Pulling the table rows of information from the Wikipedia info box.
    rows = Selector(text=req.text).xpath('//table/tbody/tr').extract()

    # The information we need is in the tags <th> and <td>.
    # We need to iterate over the rows to pull it out.
    for i, r in enumerate(rows):
        j = Selector(text=r).xpath('//th/text()').extract()
        k = Selector(text=r).xpath('//td/text()').extract()

        # making sure we extracted data to process
        if (len(j) > 0 ) & (len(k) > 0):

            key = value_extract(j[0].lower())
            value = value_extract(k)
            
            # if info was requested adding it to data
            if key in features:
                try:
                    data[key] = value
                except:
                    print(f"    failed on : {key}")
    return data
  
    
def value_extract(value):
    '''
    Removes \n(newline) and converts \xa0(space) to 
    a space in the string.
    '''
    
    answer = ""
    
    for v in value:
        answer += v
        
    return answer.replace('\n', '').replace('\xa0', ' ')


def feature_analyzer(feature_dict, features, fail_limit):
    """
    This function analyzes if a feature was successfully 
    extracted. If no information was extracted then a 
    0 is entered in place of the information. The fail 
    limit allows us to determine if we have extracted enough
    information. If the number of successfully extracted 
    features falls below the fail limit, then rety is set 
    to True.
    
    Parameters
    ----------
    feature_dict : dictionary
        Information that needs to be analyzed.
    features : list
        Information categories desired.
    fail_limit : int
        Minimum number of features that need to be 
        extracted before information request is 
        considered successful.
    
    Returns
    -------
    True if the number of features successfully 
    extracted is greater than or equal to fail_limit
    passed. Or False if it isn't. 
    
    A processed feature dictionary with 0 added
    for missing information. 
    """
    
    retry = False
    
    # determining if we extracted enough featuers
    if len(feature_dict) <= fail_limit:
        retry = True
    
    # adding a value of 0 if the feature was not 
    # extracted properly 
    for feature in features:
        try:
            feature_dict[feature]
        except:
            feature_dict[feature] = 0
            
    return retry, feature_dict


def go(breeds, features, fail_limit):
    """
    Control function that loops through the breed list
    passing arguments to scrape_features and feature_analyzer
    returns which breeds failed to meet the fail_limit
    for the passed features. Each request in the list
    is made 5 to 15 seconds apart
    
    Parameters
    ----------
    breeds : list
        list of dog breeds to search for
    features : lsit
        list of dog traits to look for
    faile_limit
        How many features do we need before 
        breed is added to the failure list
        
    Returns
    ------
    failed : list
        list of dog breeds that didn't extract the 
        fail limit number of features
    dog : dictionary
        A dictionary of our webscraped information.
    """
    
    failed = []
    dog = {}
    
    for breed in tqdm(breeds):
        data = scrape_features(breed, features)
        retry, cln_data = feature_analyzer(data, features, fail_limit)
        
        if retry:
            # specifying to search for dog on failure
            search = breed + " (dog)"
            data = scrape_features(search, features)
            retry, cln_data = feature_analyzer(data, features, fail_limit)
            
            if retry:
                failed.append(breed)
        dog[breed] = data
        
        time.sleep(randrange(5, 15, 1))
        
    return failed, dog

In [3]:
ipath = "C:\\Users\\jatat\\Desktop\\Images"

features = ["weight", "height", "coat", "color", "life span", "common nicknames"]

imagefolders = os.listdir(ipath)

breed_names = pull_breed_name(imagefolders)

failed, dog_data = go(breed_names, features, 2)

100%|██████████| 120/120 [21:33<00:00, 10.78s/it]


In [4]:
len(failed)

55

In [5]:
df = pd.DataFrame(dog_data.values(), index = dog_data.keys())

In [6]:
df.head()

Unnamed: 0,common nicknames,weight,height,coat,color,life span
chihuahua,"Chi, Chi-chi, Hua-hua",1.8–2.7 kg (4–6 lb),15–25 cm (6–10 in),Short-haired (smooth coat) and long-coat,Any colour except merle,12 – 20
japanese spaniel,0,0,0,0,0,0
maltese dog,0,Male1.4–3.6 kg (3–8 lb),Male20–25 cm (8–10 in),white,0,12-15 years
pekinese,0,0,0,0,0,0
shih-tzu,0,Male4–7.25 kilograms (8.8–16.0 lb),Male20–28 centimetres (7.9–11.0 in),See section below,0,"10–16 years, average is 12 years old"


In [7]:
df.to_csv("C:\\Users\\jatat\\Desktop\\dogs.csv")

In [26]:
new_df = pd.read_excel("C:\\Users\\jatat\\Desktop\\dog1.xlsx")

In [33]:
new_df.head()

Unnamed: 0,breed,common nicknames,weight_kg,weight_lb,height_cm,height_in,coat,color,life span
0,chihuahua,"chi,chi-chi,hua-hua",1.8_2.7,4_6,15_25,6_10,"short,long",any,12_20
1,japanese spaniel,japanese chin,1.4_6.8,3_15,20_27,8_11,long,"white/black,white/red,tricolor",10_12
2,maltese dog,0,1.4_3.6,3_8,20_25,8_10,long,white,12_15
3,pekinese,pekingese,3.2_6.4,7_14,15_23,6_9,long,multi,12_14
4,shih-tzu,0,4_7.25,9_16,20_28,8_10,long,multi,10_16
