In [1]:
# Instructor Turn Activity Soup Intro
# Dependencies
from bs4 import BeautifulSoup as bs

In [2]:
html_string = """
<html>
<head>
<title>
A Simple HTML Document
</title>
</head>
<body>
<p>This is a very simple HTML document</p>
<p>It only has two paragraphs</p>
</body>
</html>
"""

In [3]:
# Create a Beautiful Soup object
soup = bs(html_string, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [4]:
# Print formatted version of the soup
print(soup.prettify())

<html>
 <head>
  <title>
   A Simple HTML Document
  </title>
 </head>
 <body>
  <p>
   This is a very simple HTML document
  </p>
  <p>
   It only has two paragraphs
  </p>
 </body>
</html>



In [5]:
# Extract the title of the HTML document
soup.title

<title>
A Simple HTML Document
</title>

In [6]:
# Extract the text of the title
soup.title.text

'\nA Simple HTML Document\n'

In [7]:
# Clean up the text
soup.title.text.strip()

'A Simple HTML Document'

In [8]:
# Extract the contents of the HTML body
soup.body

<body>
<p>This is a very simple HTML document</p>
<p>It only has two paragraphs</p>
</body>

In [9]:
# Extract the text of the body
soup.body.text

'\nThis is a very simple HTML document\nIt only has two paragraphs\n'

In [10]:
# Text of the first paragraph
soup.body.p.text

'This is a very simple HTML document'

In [11]:
# Extract all paragraph elements
soup.body.find_all('p')

[<p>This is a very simple HTML document</p>, <p>It only has two paragraphs</p>]

In [12]:
# Extract paragraph by index
soup.body.find_all('p')[0]

<p>This is a very simple HTML document</p>

In [13]:
soup.body.find_all('p')[1]

<p>It only has two paragraphs</p>

In [14]:
# The text of the first paragraph
soup.body.find('p').text

'This is a very simple HTML document'

# Students Turn A Soup Starter

## Instructions

* Believe it or not, CNN's website for **1996: Year in Review** is still alive on the web: <http://edition.cnn.com/EVENTS/1996/year.in.review/>

* We have, however, stored the HTML document as a string in your starter file.

* Your task, should you accept it (and you should), is to use Beautiful Soup to scrape and print the following pieces of information:

1. The **title**

2. All **paragraph** texts

3. The top 10 headlines (warning: this one is a bit tricky, and may not come out perfectly!)

## Hints

* For the third task, you will need a means of filtering the data, perhaps over multiple iterations.

## Bonus

* If you finish early, head over to the [Beautful Soup documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to read up on accessing `attributes` and navigating the DOM.

In [15]:
# Dependencies
import os
from bs4 import BeautifulSoup as bs

In [16]:
# Read HTML from file
filepath = os.path.join(".", "Resources", "template.html")
with open(filepath) as file:
    html = file.read()

In [17]:
# Create a Beautiful Soup object
soup = bs(html, 'html.parser')

In [18]:
# Extract title text
title = soup.title.text
print(title)

Top Ten Stories From 1996


In [19]:
# Print all paragraph texts
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
    print(paragraph.text.strip())


What were the biggest stories of the year?

It's a question journalists like to ask themselves at the end of every
                year. Now you can join in the process. Here are our selections for the top ten news
                stories of 1996.

                Disagree with our choices? Then tell us what stories you think were most compelling
                in the poll below.

What makes a big
                story BIG?

It depends on your criteria, of course, and your perspective. That's why we offered
                a poll to find out what you think.
For our list, we polled producers throughout the CNN/Pathfinder family of networks
                and publications, and weighed such criteria as a story's long-term implications,
                geopolitical significance, user interest, amount of coverage, and old-fashioned newsworthiness.
                All these things help make a "big" story big.
By no means do we think our lists are the final word. Even our polls among CNN
 

In [20]:
# Print all ten headlines
tds = soup.find_all('td')
print(tds)
headlines = []
for td in tds:
    if(td.a):
        if(td.a.text):
            headlines.append(td.a.text)

[<td><img align="TOP" height="60" src="logos.gif" width="112"/></td>, <td><img align="TOP" height="60" src="banner.gif" width="360"/></td>, <td rowspan="11" valign="TOP" width="90">
<p align="RIGHT"><b><tt>What were the biggest stories of the year?</tt></b><br/>
<br/>
<font size="2">It's a question journalists like to ask themselves at the end of every
                year. Now you can join in the process. Here are our selections for the top ten news
                stories of 1996.<br/>
<br/>
                Disagree with our choices? Then tell us what stories you think were most compelling
                in the poll below.</font>
</p></td>, <td rowspan="11" width="4"></td>, <td rowspan="11" valign="MIDDLE"><img align="MIDDLE" height="250" src="generic/dot.gif" width="1"/></td>, <td rowspan="11" width="10"></td>, <td colspan="4" valign="TOP">
<p align="CENTER"><img align="MIDDLE" height="24" src="generic/topten.gif" vspace="5" width="263"/>
</p></td>, <td><a href="topten/israel/israe

In [21]:
# Print only the headlines
for x in range(10):
    print(headlines[x])

Israel elects Netanyahu
Crash of TWA Flight 800
Russia elects Yeltsin
U.S. elects Clinton
Hutu-Tutsi conflict in central Africa
Peace, elections in Bosnia
U.S. base bombed in Saudi Arabia
Centennial Olympic Games
Advances against AIDS
Unabomb suspect Ted Kaczynski arrested


# Instructor Turn Activity 3 

In [22]:
# Dependencies
from bs4 import BeautifulSoup
import requests

In [23]:
# URL of page to be scraped
url = 'https://newjersey.craigslist.org/search/sss?sort=rel&query=guitar'

In [24]:
# Retrieve page with the requests module
response = requests.get(url)
print(response)

<Response [200]>


In [25]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [26]:
# Examine the results, then determine element that contains sought info
print(soup.prettify())

﻿
<!DOCTYPE html>
<html class="no-js">
 <head>
  <title>
   north jersey for sale "guitar" - craigslist
  </title>
  <meta content='north jersey for sale "guitar" - craigslist' name="description"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible">
   <link href="https://newjersey.craigslist.org/search/sss" rel="canonical"/>
   <link href="https://newjersey.craigslist.org/search/sss?format=rss&amp;query=guitar&amp;sort=rel" rel="alternate" title='RSS feed for craigslist | north jersey for sale "guitar" - craigslist' type="application/rss+xml"/>
   <link href="https://newjersey.craigslist.org/search/sss?s=120&amp;query=guitar&amp;sort=rel" rel="next"/>
   <meta content="width=device-width,initial-scale=1" name="viewport"/>
   <link href="//www.craigslist.org/styles/cl.css?v=38ee99f34dcd77a615cd2d3c32114559" media="all" rel="stylesheet" type="text/css"/>
   <link href="//www.craigslist.org/styles/search.css?v=84cf86bc094026e12fa066bbbab154ac" media="all" rel="stylesheet" type="text/

# Students Turn Activity 4
# Reddit Scraper

## Instructions

* In this activity, you will scrape the Programmer-Humor.html file provided

* Use Beautiful Soup to scrape only threads that have two or more comments, then print the thread's title, number of comments, and the URL to the thread.

## Bonus

* If you finish early, try to display each thread's top comment in your output!

In [27]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import os

In [28]:
# URL of Python reddit
#url = 'https://www.reddit.com/r/Python/'
url = 'https://www.reddit.com/r/ProgrammingHumor/'

In [29]:
# Retrieve page with the requests module
html = requests.get(url, headers={'User-Agent': 'test'})
print(html)

<Response [200]>


In [30]:
#filepath = os.path.join('.', 'Resources', "Programmer-Humor.html")
#with open(filepath) as file:
#    html = file.read()

In [31]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(html.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Whatever in regards to programming/computing that is somehow funny.
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <style>
   /* http://meyerweb.com/eric/tools/css/reset/
    v2.0 | 20110126
    License: none (public domain)
  */

  html, body, div, span, applet, object, iframe,
  h1, h2, h3, h4, h5, h6, p, blockquote, pre,
  a, abbr, acronym, address, big, button, cite, code,
  del, dfn, em, img, input, ins, kbd, q, s, samp,
  small, strike, strong, sub, sup, tt, var,
  b, u, i, center,
  dl, dt, dd, ol, ul, li,
  fieldset, form, label, legend,
  table, caption, tbody, tfoot, thead, tr, th, td,
  article, aside, canvas, details, embed,
  figure, figcaption, footer, header, hgroup,
  menu, nav, output, ruby, section, summary,
  time, mark, audio, video {
    margin: 0;
    padding: 0;
    border: 0;
    font-size: 100%;
    font: inherit;
    vertical-align: baseline;
 

In [37]:
# Find the number of subscribers
number_subscribers = soup.find('p', class_="s1rdie1g-12 qKTMJ")
print(number_subscribers.text)

3.2k


In [38]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('div', class_='top-matter')
print(results)

[]


In [None]:
# Loop through returned results
for result in results:    
    # Retrieve the thread title
    title = result.find('p', class_='title')
    
    # Access the thread's text content
    title_text = title.a.text
    try:
        # Access the thread with CSS selectors
        thread = result.find('li', class_='first')
        # The number of comments made in the thread
        comments = thread.text.lstrip()
        # Parse string, e.g. '47 comments' for possible numeric manipulation
        comments_num = int(comments.split()[0])
        # Access the href attribute with bracket notation
        link = thread.a['href']
        # Run if the thread has comments
        if (comments_num):
            print('\n-----------------\n')
            print(title_text)
            print('Comments:', comments_num)
            print(link)
    except AttributeError as e:
        print(e)

# Instructor Turn Activity 5 Mongo Scraping

In [39]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [40]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [41]:
# Define database and collection
db = client.craigslist_db
collection = db.items

In [42]:
# URL of page to be scraped
url = 'https://newjersey.craigslist.org/search/sss?sort=rel&query=guitar'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'html.parser')

In [43]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('li', class_='result-row')

# Loop through returned results
for result in results:
    # Error handling
    try:
        # Identify and return title of listing
        title = result.find('a', class_='result-title').text
        # Identify and return price of listing
        price = result.a.span.text
        # Identify and return link to listing
        link = result.a['href']

        # Run only if title, price, and link are available
        if (title and price and link):
            # Print results
            print('-------------')
            print(title)
            print(price)
            print(link)

            # Dictionary to be inserted as a MongoDB document
            post = {
                'title': title,
                'price': price,
                'url': link
            }

            collection.insert_one(post)

    except Exception as e:
        print(e)

-------------
2 or Pair of On Stage Music Intrument Guitar Stands
$15
https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6639085733.html
-------------
2 or Pair of On Stage Music Intrument Guitar Stands
$15
https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6640945098.html
-------------
[New] Guitar Hero Live - PlayStation 3 Guitar PS3 Bundle Activision
$80
https://newjersey.craigslist.org/vgm/d/new-guitar-hero-live/6647231931.html
-------------
Guitar Hero III: Legends of Rock Bundle
$250
https://newjersey.craigslist.org/vgm/d/guitar-hero-iii-legends-of/6647234348.html
-------------
Guitar
$30
https://newjersey.craigslist.org/bab/d/guitar/6647551039.html
-------------
***** SEKORA ACOUSTIC GUITAR ** MODEL F1K **
$50
https://newjersey.craigslist.org/msg/d/sekora-acoustic-guitar-model/6643754814.html
-------------
Guitar Hangers
$25
https://newjersey.craigslist.org/msg/d/guitar-hangers/6655931938.html
-------------
Fender Acoustic Guitar Pack (including B

In [44]:
# Display items in MongoDB collection
listings = db.items.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5b514d2c7a0c9a13500bf201'), 'title': '2 or Pair of On Stage Music Intrument Guitar Stands', 'price': '$15', 'url': 'https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6639085733.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf202'), 'title': '2 or Pair of On Stage Music Intrument Guitar Stands', 'price': '$15', 'url': 'https://newjersey.craigslist.org/msg/d/2-or-pair-of-on-stage-music/6640945098.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf203'), 'title': 'Digitech GSP 7 Guitar Effects Processor', 'price': '$100', 'url': 'https://newjersey.craigslist.org/msg/d/digitech-gsp-7-guitar-effects/6624260076.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf204'), 'title': 'Epiphone Riviera Electric Guitar', 'price': '$400', 'url': 'https://newjersey.craigslist.org/msg/d/epiphone-riviera-electric/6647907210.html'}
{'_id': ObjectId('5b514d2d7a0c9a13500bf205'), 'title': 'Vintage 1973 Led Zeppelin houses of the holy guitar tab book', 'price': '$40', 'url': 'https://ne

# Students Turn Activity 6 Hockey Headers

Teamwork! Speed! Mental and physical toughness! Passion! Excitement! Unpredictable matchups down to the wire! What could be better? While these terms could easily be applied to a data science hackathon, we're talking about the magnificent sport of hockey.

Your assignment is to scrape the articles on the front page of the NHL website - which is frequently updated - and then post the results of your scraping to MongoDB.

## Instructions

* Use Beautiful Soup and requests to scrape the title and heading of each article on the front page.

* Post the above information as a MongoDB document and then print all of the documents on the database to the console.

* In addition to the above, post the time and date of the article publication as well.

In [45]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo

In [46]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [47]:
# Define database and collection
db = client.nhl_db
collection = db.articles

In [48]:
# URL of page to be scraped
url = 'https://www.nhl.com/'

In [49]:
# Retrieve page with the requests module
response = requests.get(url)

In [50]:
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [51]:
# Retrieve the parent divs for all articles
results = soup.find_all('li', class_="mixed-feed__item--article")

# Loop through results to retrieve article title, header, and timestamp of article
for result in results:
    title = result.find('h4', class_='mixed-feed__header').text

    lede = result.find('h5', class_='mixed-feed__subheader').text

    # The time and date of article publication
    date = result.find('time')['datetime']

    # Slice the datetime string for the date
    article_date = date[:10]
    # Slice the datetime string for the time
    time = date[11:16]
    # Determine whether article was published in AM or PM
    if (int(time[:2]) >= 13):
        meridiem = 'pm'
    else:
        meridiem = 'am'

    # Concatenate time string
    time = time + meridiem
    print('-----------------')
    print(title)
    print(lede)
    print(article_date)
    print(time)

    # Dictionary to be inserted into MongoDB
    post = {
        'title': title,
        'lede': lede,
        'date': article_date,
        'time_published': time
    }
    collection.insert_one(post)
    

    # Insert dictionary into MongoDB as a document

-----------------
Buffalo Sabres fantasy preview for 2018-19
Reinhart can break out on line with Eichel, Skinner; Hutton potential sleeper with heavier workload on improved roster
2018-08-04
00:00am
-----------------
Skinner eager to join forces with Eichel, Sabres
'Happy to be on his team instead of against him,' forward says after trade from Hurricanes
2018-08-03
17:06pm
-----------------
Fantasy top 250 rankings for 2018-19
Skinner's value skyrockets after trade to Sabres; Neal, Zykov, Kase rise with potential first-line roles; Sprong, Okposo join list
2018-08-03
14:41pm
-----------------
Olczyk, cancer-free since March, has much to celebrate
'NHL on NBC' analyst to attend son's wedding one year after diagnosis
2018-08-03
12:00am
-----------------
Hughes thriving for United States at World Junior Summer Showcase
Forward prospect showing why he's projected No. 1 pick at 2019 NHL Draft
2018-08-03
09:48am
-----------------
2018-19 NHL Trade Tracker
Official deals since June 8
2018-08-0

In [52]:
# Display the MongoDB records created above
articles = db.articles.find()
for article in articles:
    print(article)

{'_id': ObjectId('5b6536127a0c9a0b844e14c9'), 'title': 'Buffalo Sabres fantasy preview for 2018-19', 'lede': 'Reinhart can break out on line with Eichel, Skinner; Hutton potential sleeper with heavier workload on improved roster', 'date': '2018-08-04', 'time_published': '00:00am'}
{'_id': ObjectId('5b6536127a0c9a0b844e14ca'), 'title': 'Skinner eager to join forces with Eichel, Sabres', 'lede': "'Happy to be on his team instead of against him,' forward says after trade from Hurricanes", 'date': '2018-08-03', 'time_published': '17:06pm'}
{'_id': ObjectId('5b6536127a0c9a0b844e14cb'), 'title': 'Fantasy top 250 rankings for 2018-19', 'lede': "Skinner's value skyrockets after trade to Sabres; Neal, Zykov, Kase rise with potential first-line roles; Sprong, Okposo join list", 'date': '2018-08-03', 'time_published': '14:41pm'}
{'_id': ObjectId('5b6536127a0c9a0b844e14cc'), 'title': 'Olczyk, cancer-free since March, has much to celebrate', 'lede': "'NHL on NBC' analyst to attend son's wedding one

# Instructor Turn Activity 7

In [53]:
!pip install splinter



You are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [54]:
from splinter import Browser
from bs4 import BeautifulSoup

# Mac User

In [None]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [None]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

# PC User

In [55]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [56]:
url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [57]:
for x in range(1, 6):

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    quotes = soup.find_all('span', class_='text')

    for quote in quotes:
        print('page:', x, '-------------')
        print(quote.text)

    browser.click_link_by_partial_text('Next')

page: 1 -------------
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
page: 1 -------------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
page: 1 -------------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
page: 1 -------------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
page: 1 -------------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
page: 1 -------------
“Try not to become a man of success. Rather become a man of value.”
page: 1 -------------
“It is better to be hated for what you are than to be loved for what you are not.”
page: 1 -------------
“I have not failed. I've just found 10,000 ways that won't work.”
page: 1 -------------
“A woman is like a tea bag; you ne

# Students Turn Activity 8
# Bookscraper

## Instructions

* Go to <http://books.toscrape.com/>

* Scrape the titles and the URLs to all books on this fictional online bookstore. Display the results in console.

* That's it!

* If you're craving extra challenge, try scraping all books by **category**. Good luck!

In [58]:
from splinter import Browser
from bs4 import BeautifulSoup

# Mac Users

In [None]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [None]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

# PC Users

In [59]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [62]:
url = 'http://books.toscrape.com/'
browser.visit(url)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

sidebar = soup.find('ul', class_='nav-list')

categories = sidebar.find_all('li')

category_list = []
url_list = []
book_url_list = []
for category in categories:
    title = category.text.strip()
    category_list.append(title)
    book_url = category.find('a')['href']
    url_list.append(book_url)

book_url_list = ['http://books.toscrape.com/' + url for url in url_list]
titles_and_urls = zip(category_list, book_url_list)

try:
    for title_url in titles_and_urls:
        browser.click_link_by_partial_text('next')
except Exception:
    print('Scraping Complete')


Scraping Complete


In [65]:
#book_url_list
for title_url in titles_and_urls:
    print(title_url)

# Instructor Turn Activity 9 
# Scraping with Pandas

In [66]:
import pandas as pd

###### We can use the read_html function in Pandas to automatically scrape any tabular data from a page.

In [67]:
url = 'https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States'

In [68]:
tables = pd.read_html(url)
tables

[                                           0   \
 0                                       State   
 1   Municipal (Within city proper boundaries)   
 2                                     Alabama   
 3                                      Alaska   
 4                                     Arizona   
 5                                    Arkansas   
 6                                  California   
 7                                    Colorado   
 8                                 Connecticut   
 9                                    Delaware   
 10                                    Florida   
 11                                    Georgia   
 12                                     Hawaii   
 13                                      Idaho   
 14                                   Illinois   
 15                                    Indiana   
 16                                       Iowa   
 17                                     Kansas   
 18                                   Kentucky   


In [69]:
type(tables)

list

In [70]:
df = tables[0]
df.columns = ['State', 'Abr.', 'State-hood Rank', 'Capital', 
              'Capital Since', 'Area (sq-mi)', 'Municipal Population', 'Metropolitan', 
              'Metropolitan Population', 'Population Rank', 'Notes']
df.head()

Unnamed: 0,State,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
0,State,Abr.,State-hood,Capital,Capital since,Area (mi²),Population (2010),Notes,,,
1,Municipal (Within city proper boundaries),Metropolitan (Both within the capital city pro...,Rank in state,Rank in US,,,,,,,
2,Alabama,AL,1819,Montgomery,1846,155.4,205764,374536,2.0,115.0,Birmingham is the state's largest city.
3,Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
4,Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887,1.0,5.0,Most populous U.S. state capital and the only ...


In [71]:
df = df.iloc[2:]
df.head()

Unnamed: 0,State,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
2,Alabama,AL,1819,Montgomery,1846,155.4,205764,374536.0,2.0,115.0,Birmingham is the state's largest city.
3,Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
4,Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887.0,1.0,5.0,Most populous U.S. state capital and the only ...
5,Arkansas,AR,1836,Little Rock,1821,116.2,193524,877091.0,1.0,117.0,
6,California,CA,1850,Sacramento,1854,97.2,466488,2527123.0,6.0,35.0,Supreme Court of California is headquartered i...


In [72]:
df.set_index('State', inplace=True)
df.head()

Unnamed: 0_level_0,Abr.,State-hood Rank,Capital,Capital Since,Area (sq-mi),Municipal Population,Metropolitan,Metropolitan Population,Population Rank,Notes
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,AL,1819,Montgomery,1846,155.4,205764,374536.0,2.0,115.0,Birmingham is the state's largest city.
Alaska,AK,1959,Juneau,1906,2716.7,31275,,3.0,,Largest capital by municipal land area. Anchor...
Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887.0,1.0,5.0,Most populous U.S. state capital and the only ...
Arkansas,AR,1836,Little Rock,1821,116.2,193524,877091.0,1.0,117.0,
California,CA,1850,Sacramento,1854,97.2,466488,2527123.0,6.0,35.0,Supreme Court of California is headquartered i...


In [73]:
df.loc['Alabama']

Abr.                                                            AL
State-hood Rank                                               1819
Capital                                                 Montgomery
Capital Since                                                 1846
Area (sq-mi)                                                 155.4
Municipal Population                                        205764
Metropolitan                                                374536
Metropolitan Population                                          2
Population Rank                                                115
Notes                      Birmingham is the state's largest city.
Name: Alabama, dtype: object

DataFrames as HTML
Pandas also had a to_html method that we can use to generate HTML tables from DataFrames.

In [74]:
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Abr.</th>\n      <th>State-hood Rank</th>\n      <th>Capital</th>\n      <th>Capital Since</th>\n      <th>Area (sq-mi)</th>\n      <th>Municipal Population</th>\n      <th>Metropolitan</th>\n      <th>Metropolitan Population</th>\n      <th>Population Rank</th>\n      <th>Notes</th>\n    </tr>\n    <tr>\n      <th>State</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Alabama</th>\n      <td>AL</td>\n      <td>1819</td>\n      <td>Montgomery</td>\n      <td>1846</td>\n      <td>155.4</td>\n      <td>205764</td>\n      <td>374536</td>\n      <td>2.0</td>\n      <td>115.0</td>\n      <td>Birmingham is the state\'s largest city.</td>\n    </tr>\n    <tr>\n      <th>Alaska</th>\n      <td>AK</td>\n 

In [75]:
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Abr.</th>      <th>State-hood Rank</th>      <th>Capital</th>      <th>Capital Since</th>      <th>Area (sq-mi)</th>      <th>Municipal Population</th>      <th>Metropolitan</th>      <th>Metropolitan Population</th>      <th>Population Rank</th>      <th>Notes</th>    </tr>    <tr>      <th>State</th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Alabama</th>      <td>AL</td>      <td>1819</td>      <td>Montgomery</td>      <td>1846</td>      <td>155.4</td>      <td>205764</td>      <td>374536</td>      <td>2.0</td>      <td>115.0</td>      <td>Birmingham is the state\'s largest city.</td>    </tr>    <tr>      <th>Alaska</th>      <td>AK</td>      <td>1959</td>      <td>Juneau</td>      <td>1906</td>      <td>2716.7</td>      <td>312

In [76]:
df.to_html('table.html')

In [77]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html

'open' is not recognized as an internal or external command,
operable program or batch file.


# Students Turn Activity 10 # Doctor Decoder

In this activity, you will use `read_html` from Pandas to scrape a Wikipedia article. You will then use the resulting DataFrame to convert a list of medical abbreviations to their full description.

## Instructions

* Use Panda's `read_html` to parse the URL.

* Find the medical abbreviations DataFrame in the list of DataFrames as assign it to `df`.

  * Assign the columns `['abb', 'full_name', 'other']`

* Drop the `other` column from the DataFrame.

* Drop the header row (the first row) and set the index to the `abb` column.

* Loop through the list of medical abbreviations and print the abbreviation along with the full description.

  * Use the DataFrame to perform the lookup.

- - -

In [20]:
import pandas as pd

In [21]:
url = 'https://en.wikipedia.org/wiki/List_of_medical_abbreviations'
med_abbreviations = ['BMR', 'BP', 'ECG', 'MRI', 'qid', 'WBC']

In [22]:
# Use Panda's `read_html` to parse the url
# YOUR CODE HERE
tables = pd.read_html(url)
tables 

[    0                                                  1
 0 NaN  This article includes a list of references, re...,
                                                    0
 0                        v t e Medical abbreviations
 1  0–9 A B C D E F G H I J K L M N O P Q R S T U ...
 2  Latin abbreviations Prescription abbreviations...,
           0                                       1  \
 0    EG abb                            EG full name   
 1       ABG                      arterial blood gas   
 2       ACE           angiotensin-converting enzyme   
 3      ACTH             adrenocorticotropic hormone   
 4       ADH                    antidiuretic hormone   
 5       AED        automated external defibrillator   
 6      AIDS      acquired immunodeficiency syndrome   
 7       ALP                    alkaline phosphatase   
 8       ALT                alanine aminotransferase   
 9       ASA                    acetylsalicylic acid   
 10      AST              aspartate aminotransfera

In [23]:
# Find the medical abbreviations DataFrame in the list of DataFrames as assign it to `df`
# Assign the columns `['abb', 'full_name', 'other']`
# YOUR CODE HERE
df = tables[2]
df.columns = ['abb', 'full_name', 'other']
df.head()

Unnamed: 0,abb,full_name,other
0,EG abb,EG full name,"Other (ver change, need to know...etc.)"
1,ABG,arterial blood gas,
2,ACE,angiotensin-converting enzyme,
3,ACTH,adrenocorticotropic hormone,
4,ADH,antidiuretic hormone,


In [None]:
# drop the `other` column
# YOUR CODE HERE
del df['other']

In [24]:
# Drop the first row and set the index to the `abb` column
# YOUR CODE HERE
df = df.iloc[1:]
df.set_index('abb', inplace=True)
df.head()

Unnamed: 0_level_0,full_name,other
abb,Unnamed: 1_level_1,Unnamed: 2_level_1
ABG,arterial blood gas,
ACE,angiotensin-converting enzyme,
ACTH,adrenocorticotropic hormone,
ADH,antidiuretic hormone,
AED,automated external defibrillator,


In [26]:
# Loop through the list of medical abbreviations and print the abbreviation
# along with the full description.
# Use the DataFrame to perform the lookup.
# YOUR CODE HERE
for abb in med_abbreviations:
    print(abb, df.loc[abb].full_name)

BMR basal metabolic rate
BP blood pressure
ECG electrocardiogram
MRI magnetic resonance imaging
qid 4 times a day
WBC white blood cell
