In [2]:
# Instructor Turn Activity Soup Intro
# Dependencies
from bs4 import BeautifulSoup as bs

In [3]:
html_string = """
<html>
<head>
<title>
A Simple HTML Document
</title>
</head>
<body>
<p>This is a very simple HTML document</p>
<p>It only has two paragraphs</p>
</body>
</html>
"""

In [4]:
# Create a Beautiful Soup object
soup = bs(html_string, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [5]:
# Print formatted version of the soup
print(soup.prettify())

<html>
 <head>
  <title>
   A Simple HTML Document
  </title>
 </head>
 <body>
  <p>
   This is a very simple HTML document
  </p>
  <p>
   It only has two paragraphs
  </p>
 </body>
</html>



In [6]:
# Extract the title of the HTML document
soup.title

<title>
A Simple HTML Document
</title>

In [7]:
# Extract the text of the title
soup.title.text

'\nA Simple HTML Document\n'

In [8]:
# Clean up the text
soup.title.text.strip()

'A Simple HTML Document'

In [9]:
# Extract the contents of the HTML body
soup.body

<body>
<p>This is a very simple HTML document</p>
<p>It only has two paragraphs</p>
</body>

In [10]:
# Extract the text of the body
soup.body.text

'\nThis is a very simple HTML document\nIt only has two paragraphs\n'

In [11]:
# Text of the first paragraph
soup.body.p.text

'This is a very simple HTML document'

In [12]:
# Extract all paragraph elements
soup.body.find_all('p')

[<p>This is a very simple HTML document</p>, <p>It only has two paragraphs</p>]

In [13]:
# Extract paragraph by index
soup.body.find_all('p')[0]

<p>This is a very simple HTML document</p>

In [14]:
soup.body.find_all('p')[1]

<p>It only has two paragraphs</p>

In [15]:
# The text of the first paragraph
soup.body.find('p').text

'This is a very simple HTML document'

# Students Turn A Soup Starter

## Instructions

* Believe it or not, CNN's website for **1996: Year in Review** is still alive on the web: <http://edition.cnn.com/EVENTS/1996/year.in.review/>

* We have, however, stored the HTML document as a string in your starter file.

* Your task, should you accept it (and you should), is to use Beautiful Soup to scrape and print the following pieces of information:

1. The **title**

2. All **paragraph** texts

3. The top 10 headlines (warning: this one is a bit tricky, and may not come out perfectly!)

## Hints

* For the third task, you will need a means of filtering the data, perhaps over multiple iterations.

## Bonus

* If you finish early, head over to the [Beautful Soup documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to read up on accessing `attributes` and navigating the DOM.

In [16]:
# Dependencies
import os
from bs4 import BeautifulSoup as bs

In [17]:
# Read HTML from file
filepath = os.path.join(".", "Resources", "template.html")
with open(filepath) as file:
    html = file.read()

In [19]:
# Create a Beautiful Soup object
soup2 = bs(html, 'html.parser')
type(soup2)
print(soup2.prettify())

<html>
 <head>
  <title>
   Top Ten Stories From 1996
  </title>
 </head>
 <body alink="#FFFFCE" bgcolor="#FFFFCC" link="#162323" vlink="#162323">
  <center>
   <p>
    <br/>
    <table border="0" cellpadding="0" cellspacing="0">
     <tr>
      <td>
       <img align="TOP" height="60" src="logos.gif" width="112"/>
      </td>
      <td>
       <img align="TOP" height="60" src="banner.gif" width="360"/>
      </td>
     </tr>
    </table>
   </p>
  </center>
  <blockquote>
   <center>
    <table border="0" cellpadding="2">
     <tr>
      <td rowspan="11" valign="TOP" width="90">
       <p align="RIGHT">
        <b>
         <tt>
          What were the biggest stories of the year?
         </tt>
        </b>
        <br/>
        <br/>
        <font size="2">
         It's a question journalists like to ask themselves at the end of every
                year. Now you can join in the process. Here are our selections for the top ten news
                stories of 1996.
         <br/>
 

In [20]:
# Extract title text
soup2.title

<title>Top Ten Stories From 1996</title>

In [35]:
# Print all paragraph texts
paragraphs = soup2.find_all('p')
for item in paragraphs:
    print(item.text.strip())


What were the biggest stories of the year?

It's a question journalists like to ask themselves at the end of every
                year. Now you can join in the process. Here are our selections for the top ten news
                stories of 1996.

                Disagree with our choices? Then tell us what stories you think were most compelling
                in the poll below.

What makes a big
                story BIG?

It depends on your criteria, of course, and your perspective. That's why we offered
                a poll to find out what you think.
For our list, we polled producers throughout the CNN/Pathfinder family of networks
                and publications, and weighed such criteria as a story's long-term implications,
                geopolitical significance, user interest, amount of coverage, and old-fashioned newsworthiness.
                All these things help make a "big" story big.
By no means do we think our lists are the final word. Even our polls among CNN
 

In [26]:
# Print all ten headlines
tds = soup2.find_all("td")
headlines = []
for td in tds:
    if(td.a):
        if(td.a.text):
            headlines.append(td)

In [31]:
# Print only the headlines
for item in range(len(headlines)):
    print(headlines[item].text)

Israel elects Netanyahu
Crash of TWA Flight 800
Russia elects Yeltsin
U.S. elects Clinton
Hutu-Tutsi conflict in central Africa
Peace, elections in Bosnia
U.S. base bombed in Saudi Arabia
Centennial Olympic Games
Advances against AIDS
Unabomb suspect Ted Kaczynski arrested
The top 10 stories according to our users
Tell us what you think
You said it...


# Instructor Turn Activity 3 

In [36]:
# Dependencies
from bs4 import BeautifulSoup
import requests

In [37]:
# URL of page to be scraped
url = 'https://newjersey.craigslist.org/search/sss?sort=rel&query=guitar'

In [38]:
# Retrieve page with the requests module
response = requests.get(url)

In [39]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [40]:
# Examine the results, then determine element that contains sought info
print(soup.prettify())

﻿
<!DOCTYPE html>
<html class="no-js">
 <head>
  <title>
   north jersey for sale "guitar" - craigslist
  </title>
  <meta content='north jersey for sale "guitar" - craigslist' name="description"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible">
   <link href="https://newjersey.craigslist.org/search/sss" rel="canonical"/>
   <link href="https://newjersey.craigslist.org/search/sss?format=rss&amp;query=guitar&amp;sort=rel" rel="alternate" title='RSS feed for craigslist | north jersey for sale "guitar" - craigslist' type="application/rss+xml"/>
   <link href="https://newjersey.craigslist.org/search/sss?s=120&amp;query=guitar&amp;sort=rel" rel="next"/>
   <meta content="width=device-width,initial-scale=1" name="viewport"/>
   <link href="//www.craigslist.org/styles/cl.css?v=38ee99f34dcd77a615cd2d3c32114559" media="all" rel="stylesheet" type="text/css"/>
   <link href="//www.craigslist.org/styles/search.css?v=84cf86bc094026e12fa066bbbab154ac" media="all" rel="stylesheet" type="text/

# Students Turn Activity 4
# Reddit Scraper

## Instructions

* In this activity, you will scrape the Programmer-Humor.html file provided

* Use Beautiful Soup to scrape only threads that have two or more comments, then print the thread's title, number of comments, and the URL to the thread.

## Bonus

* If you finish early, try to display each thread's top comment in your output!

In [71]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import os

In [72]:
# URL of Python reddit
url = 'https://www.reddit.com/r/Python/'
url = 'https://www.reddit.com/r/ProgrammingHumor/'

In [73]:
# Retrieve page with the requests module
html = requests.get(url,headers={'User-Agent':'test'})
print(html)

<Response [200]>


In [74]:
# filepath = os.path.join('.', 'Resources', "Programmer-Humor.html")
# with open(filepath, encoding = 'utf8') as file:
#     html = file.read()

In [76]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(html.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Whatever in regards to programming/computing that is somehow funny.
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <style>
   /* http://meyerweb.com/eric/tools/css/reset/
    v2.0 | 20110126
    License: none (public domain)
  */

  html, body, div, span, applet, object, iframe,
  h1, h2, h3, h4, h5, h6, p, blockquote, pre,
  a, abbr, acronym, address, big, button, cite, code,
  del, dfn, em, img, input, ins, kbd, q, s, samp,
  small, strike, strong, sub, sup, tt, var,
  b, u, i, center,
  dl, dt, dd, ol, ul, li,
  fieldset, form, label, legend,
  table, caption, tbody, tfoot, thead, tr, th, td,
  article, aside, canvas, details, embed,
  figure, figcaption, footer, header, hgroup,
  menu, nav, output, ruby, section, summary,
  time, mark, audio, video {
    margin: 0;
    padding: 0;
    border: 0;
    font-size: 100%;
    font: inherit;
    vertical-align: baseline;
 

In [77]:
# Find the number of subscribers
number_subscribers = soup.find("p",class_="s8wuzb8-12 jWHDlZ")
print(number_subscribers.text)

3.1k


In [81]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all("div",class_="")
print(results)

[<div id="2x-container"><div class="b8sij2-0 jsUmpv" data-reactroot="" theme="[object Object]"><div><div></div></div><div tabindex="-1"><div class="_1gsAk1ihQliBnDybgyjghy" id="SHORTCUT_FOCUSABLE_DIV" tabindex="-1"><div><div><header class="_2GyPfdsi-MbQFyHRECo9GO s1r1zaa-0 AwBkR s60wwc0-0 bLxhYn" data-redditstyle="true"><div class="_3dnbqz69WJTFCss8wl7Wlk"><span class="s1dqr9jy-0 imyGpC">Press J to jump to the feed. Press question mark to learn the rest of the keyboard shortcuts</span><a aria-label="Home" class="_30BbATRhFv3V83DHNDjJAO" href="/"><svg class="_1O4jTk-dZ-VIxsCuYB6OR8" viewbox="0 0 20 20" xmlns="http://www.w3.org/2000/svg"><g><circle cx="10" cy="10" fill="#FF4500" r="10"></circle><path d="M16.67,10A1.46,1.46,0,0,0,14.2,9a7.12,7.12,0,0,0-3.85-1.23L11,4.65,13.14,5.1a1,1,0,1,0,.13-0.61L10.82,4a0.31,0.31,0,0,0-.37.24L9.71,7.71a7.14,7.14,0,0,0-3.9,1.23A1.46,1.46,0,1,0,4.2,11.33a2.87,2.87,0,0,0,0,.44c0,2.24,2.61,4.06,5.83,4.06s5.83-1.82,5.83-4.06a2.87,2.87,0,0,0,0-.44A1.46,1.46,

In [82]:
# Loop through returned results
for result in results:
    # Retrieve the thread title
    title = result.find('p',class_="title")
    
    # Access the thread's text content
    title_text = title.a.text
    try:
        # Access the thread with CSS selectors
        thread = result.find("li",class_="first")
        # The number of comments made in the thread
        comments = thread.text.lstrip()
        # Parse string, e.g. '47 comments' for possible numeric manipulation
        comments_num = int(comments.split()[0])
        # Access the href attribute with bracket notation
        link = thread.a['href']
        # Run if the thread has comments
        if (comments_num):
            print('\n-----------------\n')
            print(title_text)
            print('Comments:', comments_num)
            print(link)
    except AttributeError as e:
        print(e)

AttributeError: 'NoneType' object has no attribute 'a'

# Instructor Turn Activity 5 Mongo Scraping

In [83]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [84]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [85]:
# Define database and collection
db = client.craigslist_db
collection = db.items

In [86]:
# URL of page to be scraped
url = 'https://newjersey.craigslist.org/search/sss?sort=rel&query=guitar'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'html.parser')

In [87]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('li', class_='result-row')

# Loop through returned results
for result in results:
    # Error handling
    try:
        # Identify and return title of listing
        title = result.find('a', class_='result-title').text
        # Identify and return price of listing
        price = result.a.span.text
        # Identify and return link to listing
        link = result.a['href']

        # Run only if title, price, and link are available
        if (title and price and link):
            # Print results
            print('-------------')
            print(title)
            print(price)
            print(link)

            # Dictionary to be inserted as a MongoDB document
            post = {
                'title': title,
                'price': price,
                'url': link
            }

            collection.insert_one(post)

    except Exception as e:
        print(e)

-------------
2 or Pair of On Stage Music Intrument Guitar Stands
$15
https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6639085733.html
-------------
2 or Pair of On Stage Music Intrument Guitar Stands
$15
https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6640945098.html
-------------
Digitech GSP 7 Guitar Effects Processor
$100
https://newjersey.craigslist.org/msg/d/digitech-gsp-7-guitar-effects/6624260076.html
'NoneType' object has no attribute 'text'
-------------
Epiphone Riviera Electric Guitar
$400
https://newjersey.craigslist.org/msg/d/epiphone-riviera-electric/6647907210.html
-------------
Vintage 1973 Led Zeppelin houses of the holy guitar tab book
$40
https://newjersey.craigslist.org/msg/d/vintage-1973-led-zeppelin/6647900696.html
-------------
GUITAR CASES FOR SALE
$125
https://newjersey.craigslist.org/msg/d/guitar-cases-for-sale/6647888891.html
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
-------------

In [88]:
# Display items in MongoDB collection
listings = db.items.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5b514d2c7a0c9a13500bf201'), 'title': '2 or Pair of On Stage Music Intrument Guitar Stands', 'price': '$15', 'url': 'https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6639085733.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf202'), 'title': '2 or Pair of On Stage Music Intrument Guitar Stands', 'price': '$15', 'url': 'https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6640945098.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf203'), 'title': 'Digitech GSP 7 Guitar Effects Processor', 'price': '$100', 'url': 'https://newjersey.craigslist.org/msg/d/digitech-gsp-7-guitar-effects/6624260076.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf204'), 'title': 'Epiphone Riviera Electric Guitar', 'price': '$400', 'url': 'https://newjersey.craigslist.org/msg/d/epiphone-riviera-electric/6647907210.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf205'), 'title': 'Vintage 1973 Led Zeppelin houses of the holy guitar tab book', 'price': '$40', 'url': 'https://ne

# Students Turn Activity 6 Hockey Headers

Teamwork! Speed! Mental and physical toughness! Passion! Excitement! Unpredictable matchups down to the wire! What could be better? While these terms could easily be applied to a data science hackathon, we're talking about the magnificent sport of hockey.

Your assignment is to scrape the articles on the front page of the NHL website - which is frequently updated - and then post the results of your scraping to MongoDB.

## Instructions

* Use Beautiful Soup and requests to scrape the title and heading of each article on the front page.

* Post the above information as a MongoDB document and then print all of the documents on the database to the console.

* In addition to the above, post the time and date of the article publication as well.

In [92]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [93]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [94]:
# Define database and collection
db = client.nhl_db
collection = db.items

In [95]:
# URL of page to be scraped
url = 'https://www.nhl.com/'

In [96]:
# Retrieve page with the requests module
response = requests.get(url)

In [97]:
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text,'lxml')

In [101]:
# Retrieve the parent divs for all articles
results = soup.find_all('li',class_="mixed-feed__item--article")
# Loop through results to retrieve article title, header, and timestamp of article
for result in results:
    title = result.find('h4', class_='mixed-feed__header').text

    lede = result.find('h5', class_='mixed-feed__subheader').text

    # The time and date of article publication
    date = result.find('time')["datetime"]
    # Slice the datetime string for the date
    article_date = date[:10]
    # Slice the datetime string for the time
    time = date[11:16]
    # Determine whether article was published in AM or PM
    if (int(time[:2]) >= 13):
        meridiem = ' pm'
    else:
        meridiem = ' am'
    meridiem = ''
    # Concatenate time string
    time = time + meridiem
    print('-----------------')
    print(title)
    print(lede)
    print(article_date)
    print(time)

    # Dictionary to be inserted into MongoDB
    post = {
        "title":title,
        "lede":lede,
        "date":article_date,
        "time_published":time
    }
    
    # Insert dictionary into MongoDB as a document
    collection.insert_one(post)

-----------------
Panarin sets deadline in contract talks with Blue Jackets: report
Forward won't sign extension once training camp begins in order to focus on hockey, agent says
2018-07-19
13:44
-----------------
Ovechkin wins Best Male Athlete at ESPYs, first NHL player to do so
Capitals captain honored after Stanley Cup win, 600th goal
2018-07-18
22:45
-----------------
Donato learning about offseason commitment from Bruins veterans
Forward prospect began workouts shortly after playoff elimination
2018-07-20
00:00
-----------------
Beaudin determined to keep building two-way game for Blackhawks
Defenseman prospect working to improve physical presence
2018-07-20
00:00
-----------------
Anderson could use versatility to make Devils roster
Forward prospect eager to start 'climbing up the ranks'
2018-07-20
00:00
-----------------
NHL Free Agent Tracker
Complete list of signings by team, available players
2018-07-19
17:37
-----------------
Lowry agrees to three-year contract with Jets
Fo

In [105]:
# Display the MongoDB records created above
articles = db.items.find()
for article in articles:
    print(article)

{'_id': ObjectId('5b5158547a0c9a13500bf268'), 'title': 'Panarin sets deadline in contract talks with Blue Jackets: report', 'lede': "Forward won't sign extension once training camp begins in order to focus on hockey, agent says", 'date': '2018-07-19', 'time_published': '13:44pm'}
{'_id': ObjectId('5b5158557a0c9a13500bf269'), 'title': 'Ovechkin wins Best Male Athlete at ESPYs, first NHL player to do so', 'lede': 'Capitals captain honored after Stanley Cup win, 600th goal', 'date': '2018-07-18', 'time_published': '22:45pm'}
{'_id': ObjectId('5b5158557a0c9a13500bf26a'), 'title': 'Donato learning about offseason commitment from Bruins veterans', 'lede': 'Forward prospect began workouts shortly after playoff elimination', 'date': '2018-07-20', 'time_published': '00:00am'}
{'_id': ObjectId('5b5158557a0c9a13500bf26b'), 'title': 'Beaudin determined to keep building two-way game for Blackhawks', 'lede': 'Defenseman prospect working to improve physical presence', 'date': '2018-07-20', 'time_publ

# Instructor Turn Activity 7

In [102]:
!pip install splinter

Collecting splinter
  Downloading https://files.pythonhosted.org/packages/ed/de/429f0f97703289cc5285bd0616c787b6a84fbf172bc4a3e73b99a5c8b352/splinter-0.8.0.tar.gz
Collecting selenium>=3.8.1 (from splinter)
  Downloading https://files.pythonhosted.org/packages/41/c6/78a9a0d0150dbf43095c6f422fdf6f948e18453c5ebbf92384175b372ca2/selenium-3.13.0-py2.py3-none-any.whl (946kB)
Building wheels for collected packages: splinter
  Running setup.py bdist_wheel for splinter: started
  Running setup.py bdist_wheel for splinter: finished with status 'done'
  Stored in directory: C:\Users\KashS\AppData\Local\pip\Cache\wheels\3a\85\ba\909cadf37974d51fb83ce938ffe4651c49120afe55adad6dac
Successfully built splinter
Installing collected packages: selenium, splinter
Successfully installed selenium-3.13.0 splinter-0.8.0


In [106]:
from splinter import Browser
from bs4 import BeautifulSoup

# Mac User

In [None]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [None]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

# PC User

In [107]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [108]:
url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [109]:
for x in range(1, 6):

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    quotes = soup.find_all('span', class_='text')

    for quote in quotes:
        print('page:', x, '-------------')
        print(quote.text)

    browser.click_link_by_partial_text('Next')

page: 1 -------------
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
page: 1 -------------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
page: 1 -------------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
page: 1 -------------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
page: 1 -------------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
page: 1 -------------
“Try not to become a man of success. Rather become a man of value.”
page: 1 -------------
“It is better to be hated for what you are than to be loved for what you are not.”
page: 1 -------------
“I have not failed. I've just found 10,000 ways that won't work.”
page: 1 -------------
“A woman is like a tea bag; you ne

# Students Turn Activity 8
# Bookscraper

## Instructions

* Go to <http://books.toscrape.com/>

* Scrape the titles and the URLs to all books on this fictional online bookstore. Display the results in console.

* That's it!

* If you're craving extra challenge, try scraping all books by **category**. Good luck!

In [131]:
from splinter import Browser
from bs4 import BeautifulSoup

# Mac Users

In [None]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [None]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

# PC Users

In [128]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [129]:
url = 'http://books.toscrape.com/'
browser.visit(url)

In [134]:
url = 'http://books.toscrape.com/'
browser.visit(url)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

sidebar = soup.find('ul', class_='nav-list')
categories = sidebar.find_all('li')

category_list = []
url_list = []
book_url_list = []

for category in categories:
    title = category.text.strip()
    category_list.append(title)
    book_url = category.find('a')['href']
    url_list.append(book_url)
    
book_url_list = ['http://books.toscrape.com/' + url for url in url_list]
titles_and_urls = zip(category_list, book_url_list)

try:
    for title_url in titles_and_urls:
        browser.click_link_by_partial_text('next')
except Exception:
    print('Scraping Completed')

Scraping Completed


In [136]:
for title_url in titles_and_urls:
    print(title_url)

In [137]:
book_url_list

['http://books.toscrape.com/catalogue/category/books_1/index.html',
 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html',
 'http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html',
 'http://books.toscrape.com/catalogue/category/books/romance_8/index.html',
 'http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html',
 'http://books.toscrape.com/catalogue/category/books/fiction_10/index.html',
 'http://books.toscrape.com/catalogue/category/books/childrens_11/index.html',
 'http://books.toscrape.com/catalogue/category/books/religion_12/index.html',
 'http://books.toscrape.com/catalogue/category/books/nonfictio

# Instructor Turn Activity 9 
# Scraping with Pandas

In [138]:
import pandas as pd

###### We can use the read_html function in Pandas to automatically scrape any tabular data from a page.

In [139]:
url = 'https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States'

In [140]:
tables = pd.read_html(url)
tables

[                                           0   \
 0                                       State   
 1   Municipal (Within city proper boundaries)   
 2                                     Alabama   
 3                                      Alaska   
 4                                     Arizona   
 5                                    Arkansas   
 6                                  California   
 7                                    Colorado   
 8                                 Connecticut   
 9                                    Delaware   
 10                                    Florida   
 11                                    Georgia   
 12                                     Hawaii   
 13                                      Idaho   
 14                                   Illinois   
 15                                    Indiana   
 16                                       Iowa   
 17                                     Kansas   
 18                                   Kentucky   


In [141]:
type(tables)

list

In [142]:
df = tables[0]
df.columns = ['State', 'Abr.', 'State-hood Rank', 'Capital', 
              'Capital Since', 'Area (sq-mi)', 'Municipal Population', 'Metropolitan', 
              'Metropolitan Population', 'Population Rank', 'Notes']
df.head()

Unnamed: 0,State,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
0,State,Abr.,State-hood,Capital,Capital since,Area (mi²),Population (2010),Notes,,,
1,Municipal (Within city proper boundaries),Metropolitan (Both within the capital city pro...,Rank in state,Rank in US,,,,,,,
2,Alabama,AL,1819,Montgomery,1846,155.4,205764,374536,2.0,115.0,Birmingham is the state's largest city.
3,Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
4,Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887,1.0,5.0,Most populous U.S. state capital and the only ...


In [143]:
df = df.iloc[2:]
df.head()

Unnamed: 0,State,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
2,Alabama,AL,1819,Montgomery,1846,155.4,205764,374536.0,2.0,115.0,Birmingham is the state's largest city.
3,Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
4,Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887.0,1.0,5.0,Most populous U.S. state capital and the only ...
5,Arkansas,AR,1836,Little Rock,1821,116.2,193524,877091.0,1.0,117.0,
6,California,CA,1850,Sacramento,1854,97.2,466488,2527123.0,6.0,35.0,Supreme Court of California is headquartered i...


In [144]:
df.set_index('State', inplace=True)
df.head()

Unnamed: 0_level_0,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,AL,1819,Montgomery,1846,155.4,205764,374536.0,2.0,115.0,Birmingham is the state's largest city.
Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887.0,1.0,5.0,Most populous U.S. state capital and the only ...
Arkansas,AR,1836,Little Rock,1821,116.2,193524,877091.0,1.0,117.0,
California,CA,1850,Sacramento,1854,97.2,466488,2527123.0,6.0,35.0,Supreme Court of California is headquartered i...


In [145]:
df.loc['Alabama']

Abr.                                                            AL
State-hood Rank                                               1819
Capital                                                 Montgomery
Capital Since                                                 1846
Area (sq-mi)                                                 155.4
Municipal Population                                        205764
Metropolitan                                                374536
Metropolitan Population                                          2
Population Rank                                                115
Notes                      Birmingham is the state's largest city.
Name: Alabama, dtype: object

DataFrames as HTML
Pandas also had a to_html method that we can use to generate HTML tables from DataFrames.

In [146]:
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Abr.</th>\n      <th>State-hood Rank</th>\n      <th>Capital</th>\n      <th>Capital Since</th>\n      <th>Area (sq-mi)</th>\n      <th>Municipal Population</th>\n      <th>Metropolitan</th>\n      <th>Metropolitan Population</th>\n      <th>Population Rank</th>\n      <th>Notes</th>\n    </tr>\n    <tr>\n      <th>State</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Alabama</th>\n      <td>AL</td>\n      <td>1819</td>\n      <td>Montgomery</td>\n      <td>1846</td>\n      <td>155.4</td>\n      <td>205764</td>\n      <td>374536</td>\n      <td>2.0</td>\n      <td>115.0</td>\n      <td>Birmingham is the state\'s largest city.</td>\n    </tr>\n    <tr>\n      <th>Alaska</th>\n      <td>AK</td>\n 

In [147]:
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Abr.</th>      <th>State-hood Rank</th>      <th>Capital</th>      <th>Capital Since</th>      <th>Area (sq-mi)</th>      <th>Municipal Population</th>      <th>Metropolitan</th>      <th>Metropolitan Population</th>      <th>Population Rank</th>      <th>Notes</th>    </tr>    <tr>      <th>State</th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Alabama</th>      <td>AL</td>      <td>1819</td>      <td>Montgomery</td>      <td>1846</td>      <td>155.4</td>      <td>205764</td>      <td>374536</td>      <td>2.0</td>      <td>115.0</td>      <td>Birmingham is the state\'s largest city.</td>    </tr>    <tr>      <th>Alaska</th>      <td>AK</td>      <td>1959</td>      <td>Juneau</td>      <td>1906</td>      <td>2716.7</td>      <td>312

In [148]:
df.to_html('table.html')

In [None]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html

# Students Turn Activity 10 # Doctor Decoder

In this activity, you will use `read_html` from Pandas to scrape a Wikipedia article. You will then use the resulting DataFrame to convert a list of medical abbreviations to their full description.

## Instructions

* Use Panda's `read_html` to parse the URL.

* Find the medical abbreviations DataFrame in the list of DataFrames as assign it to `df`.

  * Assign the columns `['abb', 'full_name', 'other']`

* Drop the `other` column from the DataFrame.

* Drop the header row (the first row) and set the index to the `abb` column.

* Loop through the list of medical abbreviations and print the abbreviation along with the full description.

  * Use the DataFrame to perform the lookup.

- - -

In [149]:
import pandas as pd

In [150]:
url = 'https://en.wikipedia.org/wiki/List_of_medical_abbreviations'
med_abbreviations = ['BMR', 'BP', 'ECG', 'MRI', 'qid', 'WBC']

In [151]:
# Use Panda's `read_html` to parse the url
# YOUR CODE HERE
raise NotImplementedError()

NotImplementedError: 

In [None]:
# Find the medical abbreviations DataFrame in the list of DataFrames as assign it to `df`
# Assign the columns `['abb', 'full_name', 'other']`
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# drop the `other` column
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Drop the first row and set the index to the `abb` column
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Loop through the list of medical abbreviations and print the abbreviation
# along with the full description.
# Use the DataFrame to perform the lookup.
# YOUR CODE HERE
raise NotImplementedError()