In [50]:
from urllib.request import urlopen, urlparse 
from urllib.error import URLError, HTTPError 
from bs4 import BeautifulSoup
from typing import List, Tuple
import regex as re 
import datetime, random, validators

In [38]:
def get_soup(url: str) -> BeautifulSoup:
    """Download `url` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The document parsed with the 'lxml' parser.

    Raises:
        HTTPError: The server replied with an error status. 404 and 403 are
            re-raised with a friendlier message; any other status propagates
            unchanged.
        URLError: The server could not be reached.
    """
    try:
        # Read the body inside a context manager so the connection is closed
        # even if parsing fails later.
        with urlopen(url) as response:
            html = response.read()
    # HTTPError is a subclass of URLError, so it must be handled first;
    # otherwise the generic URLError clause would swallow every HTTP status.
    except HTTPError as err:
        friendly = {404: "Page doesn't exist", 403: "Forbidden!"}
        if err.code in friendly:
            # HTTPError's constructor takes (url, code, msg, hdrs, fp).
            raise HTTPError(url, err.code, friendly[err.code], err.headers, None) from err
        raise
    except URLError as err:
        raise URLError(f"Server wasn't found at {url}") from err

    return BeautifulSoup(html, 'lxml')

In [39]:
# Fetch and parse the Kevin Bacon article once; `wiki` is reused further down.
wiki = get_soup(r"http://en.wikipedia.org/wiki/Kevin_Bacon")

# Walk every anchor on the page; only anchors carrying an href are of
# interest, and printing them is currently switched off.
for link in wiki.find_all('a'):
    if 'href' in link.attrs:
        pass  # print(link.attrs['href'])

How to isolate article links? 
Features of article links: 
- inside `div` with `id=bodyContent`
- URLs do not have colons -> `(?!:)`
- URLs begin with `/wiki/` -> `^(/wiki/)`
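  - `()` groups a sequence that must match in the given order, whereas `[]` is a character class that matches any single one of the listed characters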

In [40]:
def filter_wiki_article_links(bs: BeautifulSoup, pattern='^(/wiki/)((?!:).)*$'):
    """Return the anchors inside the page's `bodyContent` div whose href matches.

    Args:
        bs: Parsed page to search.
        pattern: Regular expression an anchor's `href` must match. The default
            keeps hrefs that start with `/wiki/` and contain no colon after
            that prefix.

    Returns:
        The matching `<a>` tags, or an empty list when the page has no
        `div` with `id="bodyContent"`.
    """
    found = bs.find('div', attrs={'id': 'bodyContent'})
    if found is None:
        # `find` returns None when the container is absent; report "no links"
        # instead of failing with AttributeError on `.find_all`.
        return []
    return found.find_all('a', href=re.compile(pattern))

# Show the href of every article link found on the parsed `wiki` page.
for link in filter_wiki_article_links(wiki):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia,_Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/Streaming_television
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy
/wiki/The_Guardian
/wi

Avoid crawling the same page twice = keep discovered links formatted consistently and store in running **set**
`s = set(); s.add(...)`

In [41]:
pages = set()  # hrefs discovered so far; shared by every recursive call


def getLinks(url: str, limit=5):
    """Recursively crawl Wikipedia from `url`, printing each new `/wiki/` href.

    Every href seen for the first time is printed, recorded in the
    module-level `pages` set, and then crawled in turn. The crawl stops once
    `pages` holds `limit` entries.

    Args:
        url: Path on en.wikipedia.org to start from, e.g. '/wiki/Foo'
            ('' for the front page).
        limit: Maximum number of distinct hrefs to collect overall.
    """
    # hrefs found on the page already start with '/', so drop it before
    # joining with the host to avoid requesting 'http://host//wiki/...'.
    bs = get_soup('http://en.wikipedia.org/' + url.lstrip('/'))

    # The href filter guarantees every returned anchor has an href attribute.
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):

        if len(pages) >= limit:
            break

        next_link = link.attrs['href']
        if next_link in pages:
            continue

        print(next_link)
        pages.add(next_link)

        # Forward the caller's limit; without it nested calls silently fall
        # back to the default of 5.
        getLinks(next_link, limit)


getLinks('')

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Protection_policy#extended


In [42]:
pages = set()  # hrefs discovered so far; reset so this crawl starts empty


def getLinks2(url: str, limit=5):
    """Crawl Wikipedia like `getLinks`, also printing a summary of each page.

    For every page visited, the function tries to print the `<h1>` text, the
    first `<p>` inside `#mw-content-text`, and the href of the edit link under
    `#ca-edit`. If any of those pieces is absent a notice is printed instead
    and the crawl carries on.

    Args:
        url: Path on en.wikipedia.org to start from, e.g. '/wiki/Foo'
            ('' for the front page).
        limit: Maximum number of distinct hrefs to collect overall.
    """
    # hrefs found on the page already start with '/', so drop it before
    # joining with the host to avoid requesting 'http://host//wiki/...'.
    bs = get_soup('http://en.wikipedia.org/' + url.lstrip('/'))
    try:
        print(
            bs.h1.get_text(),
            bs.find(id='mw-content-text').find_all('p')[0],
            bs.find(id='ca-edit').find('span')
                .find('a').attrs['href']
        )
    # AttributeError: a lookup returned None; IndexError: no <p> in the body;
    # KeyError: the edit anchor has no href.
    except (AttributeError, IndexError, KeyError):
        print("This page is missing something. Continuing")

    # The href filter guarantees every returned anchor has an href attribute.
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):

        if len(pages) >= limit:
            break

        next_link = link.attrs['href']
        if next_link in pages:
            continue

        print(next_link)
        pages.add(next_link)
        # Forward the caller's limit; without it nested calls silently fall
        # back to the default of 5.
        getLinks2(next_link, limit)


getLinks2('')

This page is missing something. Continuing
/wiki/Wikipedia
This page is missing something. Continuing
/wiki/Wikipedia:Protection_policy#semi
This page is missing something. Continuing
/wiki/Wikipedia:Requests_for_page_protection
This page is missing something. Continuing
/wiki/Wikipedia:Requests_for_permissions
This page is missing something. Continuing
/wiki/Wikipedia:Protection_policy#extended
This page is missing something. Continuing


Redirects = point domain name or URL to different location
i.e. the page being crawled may not be the page that was loaded (URLs don't match)

Types
- server side = URL is changed before page loads
- client side = URL changes after the page loads

Handling
- `urllib` handles redirects automatically
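- `requests` also follows redirects by default (`allow_redirects=True`) for every method except `HEAD`; pass `allow_redirects=False` to turn this off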

## Searching an entire webpage

In [45]:
# Seed the generator from the current time. `random.seed` only accepts
# None, int, float, str, bytes or bytearray, so hand it the POSIX timestamp
# (a float) rather than the datetime object itself.
random.seed(datetime.datetime.now().timestamp())

def getInternalLinks(bs: BeautifulSoup, includeUrl: str, limit: int) -> List[str]:
    """Collect up to `limit` absolute URLs on `bs` that stay on the given site.

    Args:
        bs: Parsed page to search for anchors.
        includeUrl: The site to stay on, either as a full URL
            ('http://example.com/some/page') or as a bare host
            ('www.example.com'). Only its scheme and host are used.
        limit: Maximum number of links to return.

    Returns:
        A list of distinct absolute URLs, in no particular order.
        Site-relative hrefs are prefixed with the site root; a bare host is
        assumed to be reachable over http.
    """
    parsed = urlparse(includeUrl)
    if not parsed.netloc:
        # Without a leading '//' urlparse files a bare host under `path`,
        # which would leave the site root as just '://'. Re-parse as a host.
        parsed = urlparse('//' + includeUrl.lstrip('/'))

    if parsed.scheme:
        site_pattern = re.escape(f"{parsed.scheme}://{parsed.netloc}")
    else:
        # Scheme unknown: accept absolute links to this host over either one.
        site_pattern = "https?://" + re.escape(parsed.netloc)
    scheme = parsed.scheme or 'http'
    includeUrl = f"{scheme}://{parsed.netloc}"

    internalLinks = set()

    # hrefs that are site-relative (leading '/') or that contain the site root
    pattern = f"^(/|.*{site_pattern})"
    for link in bs.find_all('a', href=re.compile(pattern)):

        if len(internalLinks) >= limit:
            break

        next_link = link.attrs['href']

        if next_link.startswith('//'):
            # Protocol-relative link: it names its own host, so it is only
            # internal when that host is ours.
            if urlparse(next_link).netloc != parsed.netloc:
                continue
            next_link = f"{scheme}:{next_link}"
        elif next_link.startswith('/'):
            next_link = includeUrl + next_link

        # The set discards duplicates, so no explicit membership test is needed.
        internalLinks.add(next_link)

    return list(internalLinks)

def getExternalLinks(bs: BeautifulSoup, excludeUrl: str, limit: int) -> List[str]:
    """Collect up to `limit` links on `bs` that do not mention `excludeUrl`.

    Args:
        bs: Parsed page to search for anchors.
        excludeUrl: Literal text (typically the current site's host) that an
            href must not contain. An empty string excludes nothing.
        limit: Maximum number of links to return.

    Returns:
        A list of distinct hrefs that start with 'http' or 'www', in no
        particular order.
    """
    if excludeUrl:
        # start -> http or www -> never excludeUrl -> end.
        # Escape excludeUrl so its dots are matched literally rather than as
        # regex wildcards.
        pattern = f"^(http|www)((?!{re.escape(excludeUrl)}).)*$"
    else:
        # An empty negative lookahead can never succeed, which would reject
        # nearly every link; with nothing to exclude, only check the prefix.
        pattern = "^(http|www).*$"

    externalLinks = set()

    for link in bs.find_all('a', href=re.compile(pattern)):

        if len(externalLinks) >= limit:
            break

        # The href filter guarantees a non-empty href; the set drops duplicates.
        externalLinks.add(link.attrs['href'])

    return list(externalLinks)

def getRandomLink(startingPage: str, finder, **kwargs):
    """Pick one link at random from those `finder` locates on `startingPage`.

    Args:
        startingPage: URL of the page to download and search.
        finder: Callable invoked as `finder(soup, host, **kwargs)` that returns
            a list of links; `host` is the network location of `startingPage`.
        **kwargs: Extra keyword arguments forwarded to `finder`.

    Returns:
        A randomly chosen link, or `startingPage` itself when `finder` comes
        back empty-handed (a notice is printed in that case).
    """
    soup = get_soup(startingPage)
    host = urlparse(startingPage).netloc
    candidates = finder(soup, host, **kwargs)

    if len(candidates) > 0:
        return candidates[random.randrange(len(candidates))]

    print("No links were found")
    return startingPage


In [109]:
class InvalidLinkMessage():
    """Announces an unusable link on stdout as a side effect of construction."""

    def __init__(self, startURL: str, foundURL: str) -> None:
        """Print which link was rejected and which URL the search resumes from."""
        notice = f"{foundURL} is not a valid link. Next search from: {startURL}."
        print(notice)
        
def goUpNDirectories(url: str, num: int) -> str:
    """Move up `num` path segments ('slashes') from `url`, if possible.

    Args:
        url: The URL to shorten.
        num: How many trailing path segments to drop. Values larger than the
            number of segments stop at the site root; values <= 0 drop nothing.

    Returns:
        The scheme and host of `url` (when present) followed by the shortened
        path. Empty segments, including one produced by a trailing slash, are
        ignored, and any query string or fragment is discarded.
    """
    parsed = urlparse(url)

    segments = [part for part in parsed.path.split('/') if part]
    keep = max(len(segments) - max(num, 0), 0)
    segments = segments[:keep]

    # Preserve whether the original path was absolute; scheme-less input such
    # as 'example.com/a/b' keeps everything in `path` with no leading slash.
    leading = '/' if parsed.path.startswith('/') else ''
    path = leading + '/'.join(segments) if segments else ''

    if parsed.netloc:
        prefix = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme else parsed.netloc
    else:
        prefix = ''

    return prefix + path
    
def iterateUnidirectionalSearch(startingSite: str, finder, **kwargs):
    """Take one random hop from `startingSite` and return the URL to visit next.

    Args:
        startingSite: URL of the page the hop starts from.
        finder: Link finder handed to `getRandomLink`.
        **kwargs: Extra keyword arguments forwarded to `finder`.

    Returns:
        The randomly chosen link when it validates as a URL; `startingSite`
        when the chosen link is shorter than four characters; otherwise the
        result of moving one directory up from the rejected link.
    """
    candidate = getRandomLink(startingSite, finder, **kwargs)

    # Too short to be inspected below: report it and stay where we are.
    if len(candidate) < 4:
        InvalidLinkMessage(startingSite, candidate)
        return startingSite

    # A link that begins with a bare ':///' is re-rooted on the starting site,
    # inserting a single '/' between the two parts when one is missing.
    if candidate[:4] == r":///":
        joiner = "" if startingSite[-1] == "/" else "/"
        candidate = f"{startingSite}{joiner}{candidate[4:]}"

    if validators.url(candidate):
        print(f"Random internal link is: {candidate}")
        return candidate

    InvalidLinkMessage(startingSite, candidate)
    return goUpNDirectories(candidate, 1)
    
def unidirectionalSearch(max_iterations: int, startingSite: str, *args, **kwargs):
    """Chain `max_iterations` random hops, each starting where the last ended.

    Args:
        max_iterations: Number of hops to perform.
        startingSite: URL the first hop starts from.
        *args: Positional arguments forwarded to `iterateUnidirectionalSearch`
            (the link finder).
        **kwargs: Keyword arguments forwarded to `iterateUnidirectionalSearch`.
    """
    current = startingSite
    remaining = max_iterations

    while remaining > 0:
        current = iterateUnidirectionalSearch(current, *args, **kwargs)
        remaining -= 1

    print(f"{max_iterations} iterations completed.")

In [110]:
# Ten random hops starting at Stack Overflow, following external links and
# considering at most five of them per page.
unidirectionalSearch(
    10,
    'http://www.stackoverflow.com',
    getExternalLinks,
    limit=5,
)

Random internal link is: https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f
Random internal link is: https://stackexchange.com/sites
Random internal link is: https://stackoverflow.blog
Random internal link is: https://stackoverflow.com/users/email/settings/current?__hstc=188987252.a2a4dcb0f856fe0e4513d47d380b0d26.1568193729712.1580289678034.1580299209983.129&__hssc=188987252.1.1580299209983&__hsfp=1774503237
Random internal link is: https://stackoverflow.blog
Random internal link is: https://twitter.com/stackoverflow
No links were found
Random internal link is: https://twitter.com/stackoverflow
No links were found
Random internal link is: https://twitter.com/stackoverflow
No links were found
Random internal link is: https://twitter.com/stackoverflow
No links were found
Random internal link is: https://twitter.com/stackoverflow
10 iterations completed.


# Web scraping models
Approach
- identify what information is needed first; may not be on a given site
- decoupled design for when multiple websites use different instructions, but similar content
  - `Content` class to hold relevant content
  - `Website` class to hold instructions on how to make `BeautifulSoup` object (tag, attributes)
  - `Crawler` that opens a URL, opens a `Website`, and then creates `Content`

## crawler structures
### searcher
- when searching a site, many sites store search argument in URL, e.g. `.com?search=[...]`. The prefix can be saved as a property in `Website` so that search pages can be saved in terms of just the `[...]` part
- search results are usually a list such as `<span class='result'>`, which can also be stored in `Website`
- important to know whether links on search results are absolute or relative -> this can be stored in `Website`

### through links
- unlike searcher, need to provide `Website` with target pattern and bool of whether links are absolute or relative
- `Crawler` parses each internal link found 

### multiple page types 
- use URL, presence/lack of certain fields (e.g. date, product image), or tags to classify type of page
- for many different web sites that have common properties, might make subclass of `Website`, e.g. for article pages vs product pages

In [None]:
class WebsiteBaseClass:
    """Plain record of how to locate content on one website.

    Stores the values it is given unchanged. The `*Tag` fields presumably name
    the tag/selector used to find each piece of content - confirm against the
    crawler that consumes them.
    """

    def __init__(self, name: str, url: str,
                 titleTag: str, bodyTag: str, pageType: str) -> None:
        """Store the site's name, URL, title/body tags and page type."""
        # Site identity
        self.name, self.url = name, url
        # Where to find the title and the body
        self.titleTag, self.bodyTag = titleTag, bodyTag
        # Label for the kind of page this describes
        self.pageType = pageType
        
class Article(WebsiteBaseClass):
    """Website description that additionally records a `dateTag`."""

    def __init__(self, name: str, url: str, titleTag: str, bodyTag: str, dateTag: str, pageType: str) -> None:
        """Initialise the shared fields through the base class, then `dateTag`."""
        super().__init__(
            name=name,
            url=url,
            titleTag=titleTag,
            bodyTag=bodyTag,
            pageType=pageType,
        )
        # Article-specific addition; presumably locates the publication date.
        self.dateTag = dateTag
    
"""
class Product(WebsiteBaseClass):
    def __init__(self, name: str, url: str, titleTag: str, bodyTag: str, priceTag: str, pageType: str) -> None:
        super().__init__(name, url, titleTag, bodyTag, pageType)
        self.priceTag = priceTag 
"""

# scrapy

## basics
- spider = Scrapy project
- 