# Introduction to Python Data Analytics
# Part 5. Web Scraping

Instructor: Kang P. Lee <br>

In [1]:
import requests                     # HTTP library for Python 
from bs4 import BeautifulSoup       # web scraping library

## Extracting a Title and an Author from an Article

In [2]:
# Fetch the raw HTML of a single FiveThirtyEight article.
url = "https://fivethirtyeight.com/features/trump-weak-president/"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx reply instead of parsing an error page later.
response.raise_for_status()
# Preview only the first bytes of the body rather than dumping the whole page.
response.content[:1000]

b'<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n\t<meta charset="UTF-8">\n\t\t<meta name="viewport" content="width=device-width, initial-scale=1.0">\n\t\t<title>Trump Came In As A Weak President, And He\xe2\x80\x99s Made Himself Weaker | FiveThirtyEight</title>\n<meta name="google-site-verification" content="u4S1gkGQ7IEq1C4S6i3KdYNnPhtuqfsaSIeP0Qqin68" />\n\t\t<script src=\'https://r-login.wordpress.com/remote-login.php?action=js&amp;host=fivethirtyeight.com&amp;id=64146350&amp;t=1501722033&amp;back=https%3A%2F%2Ffivethirtyeight.com%2Ffeatures%2Ftrump-weak-president%2F\' type="text/javascript"></script>\n\t\t<script type="text/javascript">\n\t\t/* <![CDATA[ */\n\t\t\tif ( \'function\' === typeof WPRemoteLogin ) {\n\t\t\t\tdocument.cookie = "wordpress_test_cookie=test; path=/";\n\t\t\t\tif ( document.cookie.match( /(;|^)\\s*wordpress_test_cookie\\=/ ) ) {\n\t\t\t\t\tWPRemoteLogin();\n\t\t\t\t}\n\t\t\t}\n\t\t/* ]]> */\n\t\t</script>\n\t\t<link rel=\'dns-prefetch\' href=\'//s2.

In [3]:
# Build a BeautifulSoup tree from the downloaded bytes, using the "html5lib" parser.
soup = BeautifulSoup(markup=response.content, features="html5lib")
soup

<!DOCTYPE html>
<html class="no-js" lang="en"><head>
	<meta charset="utf-8"/>
		<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
		<title>Trump Came In As A Weak President, And He’s Made Himself Weaker | FiveThirtyEight</title>
<meta content="u4S1gkGQ7IEq1C4S6i3KdYNnPhtuqfsaSIeP0Qqin68" name="google-site-verification"/>
		<script src="https://r-login.wordpress.com/remote-login.php?action=js&amp;host=fivethirtyeight.com&amp;id=64146350&amp;t=1501722033&amp;back=https%3A%2F%2Ffivethirtyeight.com%2Ffeatures%2Ftrump-weak-president%2F" type="text/javascript"></script>
		<script type="text/javascript">
		/* <![CDATA[ */
			if ( 'function' === typeof WPRemoteLogin ) {
				document.cookie = "wordpress_test_cookie=test; path=/";
				if ( document.cookie.match( /(;|^)\s*wordpress_test_cookie\=/ ) ) {
					WPRemoteLogin();
				}
			}
		/* ]]> */
		</script>
		<link href="//s2.wp.com" rel="dns-prefetch"/>
<link href="//s1.wp.com" rel="dns-prefetch"/>
<link href="//secure.esp

In [4]:
# Look up the headline <h1> by its exact class attribute value.
title_tag = soup.find("h1", {"class": "article-title article-title-single entry-title"})
if title_tag is None:
    # find() returns None when nothing matches; report that clearly instead of
    # failing with an AttributeError on `.text`.
    raise ValueError("article title <h1> not found; the page markup may have changed")
# The tag's text is padded with newlines and tabs, so strip it before printing.
title = title_tag.text.strip()
print(title)


							Trump Came In As A Weak President, And He’s Made Himself Weaker						


In [5]:
# The author's name is the text of the <a> tag whose class attribute is "author url fn".
author = soup.find(name="a", attrs={"class": "author url fn"}).get_text()
print(author)

Julia Azari


## Extracting a List of Article Titles

In [6]:
# Download and parse the FiveThirtyEight "features" listing page.
url = "https://fivethirtyeight.com/features/"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx reply instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html5lib")

In [7]:
# Collect every <h2> tag carrying the "article-title" class.
h2s = soup.find_all(name="h2", attrs={"class": "article-title"})
h2s

[<h2 class="article-title">
 			<a href="https://fivethirtyeight.com/features/chris-sale-still-has-nothing-on-pedro/" name="&amp;lpos=fivethirtyeightFeature&amp;lid=Feature1">
 				Chris Sale Still Has Nothing On Pedro			</a>
 		</h2>, <h2 class="article-title">
 			<a href="https://fivethirtyeight.com/features/lots-of-people-in-cities-still-cant-afford-broadband/" name="&amp;lpos=fivethirtyeightFeature&amp;lid=Feature2">
 				Lots Of People In Cities Still Can’t Afford Broadband			</a>
 		</h2>, <h2 class="article-title">
 			<a href="https://fivethirtyeight.com/features/medicine-is-getting-more-precise-for-white-people/" name="&amp;lpos=fivethirtyeightFeature&amp;lid=Feature3">
 				Medicine Is Getting More Precise … For White People			</a>
 		</h2>, <h2 class="article-title">
 			<a href="https://fivethirtyeight.com/features/significant-digits-for-wednesday-aug-2-2017/" name="&amp;lpos=fivethirtyeightFeature&amp;lid=Feature5">
 				Significant Digits For Wednesday, August 2, 2017			

In [8]:
# Print the headline of every article heading collected in `h2s`.
for h2 in h2s:
    # The link text is padded with newlines and tabs; strip it so each title
    # prints on a single clean line.
    title = h2.find("a").text.strip()
    print(title)


				Chris Sale Still Has Nothing On Pedro			

				Lots Of People In Cities Still Can’t Afford Broadband			

				Medicine Is Getting More Precise … For White People			

				Significant Digits For Wednesday, August 2, 2017			

				How To Know When The GOP Is Serious About Tax Reform			

				Opioid Prescriptions Across The U.S.			

				Trump Came In As A Weak President, And He’s Made Himself Weaker			

				Women Are Making Over The Beauty Industry’s Boy’s Club			

				Trump’s Opioid Commission Listened To Public Health Experts			
