In [None]:
import urllib.request as ul #url.request lib for handling the url
from bs4 import BeautifulSoup #bs for parsing the page

url = "http://www.slate.com"

#Fetch the page. The with block closes the HTTP response as soon as the
#document has been parsed, even if parsing raises
with ul.urlopen(url,timeout=5) as url_response:
    #Name the parser explicitly: otherwise bs4 picks whichever parser happens
    #to be installed (and warns), so results can differ between machines
    soup = BeautifulSoup(url_response,'html.parser') #Soup stores the data in a structured way to make retrieval easy
#Soup also automatically decodes the page correctly (most of the time!)

print(soup.prettify()) #Prints page contents 

In [None]:
#Soup exposes the parsed page through tag names, so a tag can be retrieved as an attribute.
#For example, the first div tag on the page is:
soup.div

In [None]:
#Or, to get the first link (the first a tag) out of the page
soup.a

In [None]:
#To find all links on the page (every a tag, not just the first one)
soup.find_all('a')

In [None]:
#Keep the result of find_all in a variable, then print the whole collection
links=soup.find_all('a')
print(links)

In [None]:
#Can we get all the links from finance.google.com using bs4?
#Try it here

In [None]:
#The result of find_all can be filtered like any other sequence
for anchor in soup.find_all('a'): #visit every link in turn
    href=anchor.get('href') #the href property of this link (may be missing)
    if not href or 'articles' not in href: #skip links without 'articles' in them
        continue
    print(href)

In [None]:
#find_all(True) matches every tag in the document; print the name of each one
every_tag=soup.find_all(True)
for element in every_tag:
    print(element.name)

In [None]:
#That's a long list with many repeats. Let's reduce it to the unique tag names
#A set holds each element only once, so building a set from the names
#drops the duplicates for us
y={element.name for element in soup.find_all(True)}
print(y)

In [None]:
#Hmm. There is a tag called 'article'. Wonder what that is about?
#Look at the first article tag in the document
soup.article

In [None]:
#Slate apparently marks its content with the article tag.
#Count the article tags: if there is just one, it holds all the material we need
article_tags=soup.find_all('article')
len(article_tags)

In [None]:
#There is only one article tag, so work with that one.
#soup is the full document; article_soup is the extract rooted at its first article tag
article_soup=soup.find('article')
print(article_soup)

In [None]:
#Collect the article links whose address mentions a particular name
import pprint

search_name = 'trump'
#Some a tags have no href, so get can hand back an empty value; those are filtered out below
candidate_hrefs = [anchor.get('href') for anchor in article_soup.find_all('a')]
found_articles = [href for href in candidate_hrefs if href and search_name in href.lower()]
pprint.pprint(found_articles)

In [None]:
#Google Finance market news address (no cell shown here uses url_2 yet)
url_2='http://www.google.com/finance/market_news'


In [None]:
#has_attr reports whether a tag carries a given attribute.
#Show only the links that have a class attribute
links_with_class=[anchor for anchor in article_soup.find_all('a') if anchor.has_attr('class')]
for anchor in links_with_class:
    print(anchor)

In [None]:
#To check if an attribute has a certain value, use an equal sign for the attribute
#The obvious spelling, class='primary', is not valid Python. The bad line is
#compiled from a string so that the SyntaxError is displayed here instead of
#aborting the cell (and any "Run All" of the notebook)
class_keyword_error = None
try:
    compile("article_soup.find_all('a',class='primary')", '<class-keyword-demo>', 'eval')
except SyntaxError as err:
    class_keyword_error = err
    print('SyntaxError:', err.msg)

In [None]:
#Unfortunately, class is a reserved word (it means something) in python
#So, put an _ after class and it will work!
#class_='primary' asks find_all for the a tags with the class 'primary'
article_soup.find_all('a',class_='primary')

In [None]:
#Now we can get exactly what we want
for link in article_soup.find_all('a',class_='primary'):
    h_link=link.get('href')
    #An a tag may have no href, in which case get returns None; print only real links
    if h_link:
        print(h_link)    

In [None]:
#We can now functionalize the code so that given a word, we get a list of all the articles that have that
#word in the url

def find_links(search_term,url):
    """Return the unique hrefs on the page at `url` that contain `search_term`.

    Args:
        search_term: Substring to look for in each link's href (case-sensitive).
        url: Address of the page to download and scan.

    Returns:
        List of href strings in page order, each one listed once.

    Raises:
        urllib.error.URLError: Propagated from urlopen when the page cannot
            be fetched.
    """
    import urllib.request as ur
    from bs4 import BeautifulSoup
    #Close the response once it is parsed; name the parser so bs4 does not
    #guess based on what happens to be installed
    with ur.urlopen(url) as response:
        soup_stuff = BeautifulSoup(response,'html.parser')
    found_links=list()
    seen=set() #constant-time duplicate check; found_links keeps the page order
    for link in soup_stuff.find_all('a'):
        href=link.get('href')
        #a tags without an href give None, which cannot be searched
        if href is None or search_term not in href:
            continue
        if href not in seen:
            seen.add(href)
            found_links.append(href)
    return found_links
find_links('trump','http://www.slate.com')

In [None]:
#Another useful thing you can do with soup
#Limit the number of responses. Useful if you need to control the search
#Especially important when reading a large page and you only want a sampling
#Here limit=5 is passed to find_all together with the class filter
article_soup.find_all('a',class_='primary',limit=5)

In [None]:
#Do it here

In [None]:
#Functionalizing the search so that we can search any url for a search term

def find_titles(search_term,url):
    """Return the links on the page at `url` whose text contains `search_term`.

    The match ignores case, and only the first link with any given text is
    kept.

    Args:
        search_term: Word or phrase to look for in each link's text.
        url: Address of the page to download and scan.

    Returns:
        List of the matching a tags (the tags themselves, not just their
        text), in page order.

    Raises:
        urllib.error.URLError: Propagated from urlopen when the page cannot
            be fetched.
    """
    import urllib.request as ur
    from bs4 import BeautifulSoup
    #Close the response once it is parsed; name the parser so bs4 does not
    #guess based on what happens to be installed
    with ur.urlopen(url) as response:
        soup_stuff = BeautifulSoup(response,'html.parser')
    wanted=search_term.lower() #the link text is lowercased too, so compare like with like
    found_links=list()
    seen_titles=set() #texts already collected; the list holds tags, so it cannot be searched by text
    for link in soup_stuff.find_all('a'):
        title=link.get_text()
        if wanted not in title.lower() or title in seen_titles:
            continue
        seen_titles.add(title)
        found_links.append(link)
    return found_links


In [None]:
#Try it out: links on news.yahoo.com whose text mentions 'clinton'
find_titles('clinton','http://news.yahoo.com')