# Web Scraping for Data

In [2]:
# import pandas & numpy library
import pandas as pd
import numpy as np

# Import seaborn and apply its plotting styles
import seaborn as sns
sns.set(font_scale=2, style="white")

# import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style
# set plotting size parameter
plt.rcParams['figure.figsize'] = (12, 5)

# packages helpful for webscraping
import requests
import bs4
from bs4 import BeautifulSoup

#improve resolution
%config InlineBackend.figure_format ='retina'

## Get Presidential Approval Ratings

In [4]:
# Scrape the presidential approval-rating summary table from Wikipedia.
wiki = 'https://en.wikipedia.org/wiki/United_States_presidential_approval_rating'
req = requests.get(wiki, timeout=30)  # requests has no default timeout; don't hang forever
req.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Name the parser explicitly: the bare 'html' feature makes bs4 guess a parser,
# so the parse (and the result) can differ between machines.
soup = BeautifulSoup(req.content, 'html.parser')

wikitables = soup.find_all("table", class_='wikitable')  # every table styled as a "wikitable"

# The table we want is the third wikitable on the page.
# NOTE(review): this position depends on the current page layout -- confirm if the page changes.
if len(wikitables) < 3:
    raise ValueError(f"Expected at least 3 wikitables on the page, found {len(wikitables)}")
tbl = wikitables[2]

table_rows = tbl.find_all('tr')

# Column names live in <th> cells; drop the newline characters embedded in them.
cols_list = [
    head.get_text().replace('\n', '')
    for row in table_rows
    for head in row.find_all('th')
]

# Table contents live in <td> cells; rows without any <td> (the header row) are skipped.
body_rows = []
for row in table_rows:
    cells = [cell.get_text() for cell in row.find_all('td')]
    if cells:
        body_rows.append(cells)

# Build the frame from the rows actually found rather than from a pre-sized
# empty frame, so a table with more or fewer rows neither fails nor leaves
# phantom all-NaN rows. A column-count mismatch raises a clear ValueError here.
approval_ratings = pd.DataFrame(body_rows, columns=cols_list)

approval_ratings

Unnamed: 0,Order,President,Highest approval,Lowest approval,High –Low,Highest disapproval,Highest margin,Lowest margin,Final poll,Approvalaverage[12],Pollsper year
0,46\n,Biden\n,"57 (2021-02-02, 2021-04-21) [15]\n",38 (2022-07-26)[16]\n,16\n,59 (2022-07-26)[17]\n,20 (2021-02-02)\n,"−16 (2022-01-16, 2022-06-20)\n",N/A\n,N/A\n,N/A\n
1,45[18][19][20]\n,Trump\n,"49 (2020-01-16, 2020-02-03, 2020-03-13) [18]\n",34 (2021-01-15)[18]\n,15\n,62 (2021-01-15)[18]\n,4 (2020-03-13)[18]\n,−28 (2021-01-15)[18]\n,34 (2021-01-15)[18]\n,41[18]\n,40\n
2,44[21][22]\n,Obama\n,68 (2009-01-23)\n,38 (2011-08-29)[23]\n,31\n,"55 (2014-06-22, 2014-10-12)\n","56 (2009-01-23, 2009-01-24)\n",−18 (2014-10-10)\n,59 (2017-01-19)\n,47.9\n,48.4\n
3,43[24]\n,G. W. Bush\n,90 (2001-09-21)\n,"25 (2008-10-05, 2008-10-12, 2008-11-02)\n",65\n,71 (2008-10-10)\n,83 (2001-09-22)\n,−46 (2008-10-12)\n,34 (2009-01-11)\n,49.4\n,33.7\n
4,42[25]\n,Clinton\n,73 (1998-12-19)\n,37 (1993-06-06)\n,36\n,54 (1994-09-07)\n,48 (2000-12-19)\n,−14 (1994-09-07)\n,66 (2001-01-14)\n,55.1\n,28.5\n
5,41[26]\n,G. H. W. Bush\n,89 (1991-02-28)\n,"29 (1992-08-02, 1992-10-13)\n",60\n,60 (1992-07-31)\n,82 (1991-03-03)\n,"−30 (1992-08-02, 1992-10-13)\n",56 (1993-01-11)\n,60.9\n,39.5\n
6,40[27]\n,Reagan\n,74 (1986-01-30)\n,35 (1983-01-31)\n,36\n,56 (1983-01-28)\n,52 (1986-01-30)\n,−21(1983-01-31)\n,63 (1988-12-29)\n,52.8\n,37.0\n
7,39[28]\n,Carter\n,74 (1977-03-15)\n,"28 (1979-06-26, 1979-10-02)\n",46\n,59 (1979-06-26)\n,66 (1977-03-15)\n,−31 (1979-06-26)\n,34 (1980-12-08)\n,45.5\n,22.7\n
8,38[29]\n,Ford\n,73 (1974-08-13)\n,36 (1975-03-25)\n,34\n,"46 (1975-04-15, 1975-11-18)\n",67 (1974-08-13)\n,"−7 (1975-02-25, 1975-03-25, 1975-04-15)\n",53 (1976-12-13)\n,47.2\n,14.7\n
9,37[30]\n,Nixon\n,67 (1973-01-23)\n,24 (1974-01-02)\n,42\n,66 (1974-08-05)\n,57 (1969-03-17)\n,−42 (1974-08-05)\n,24 (1974-08-05)\n,49.1\n,17.7\n


In [5]:
# Data Cleaning
# Patterns match a single bracketed/parenthesised group at a time. The greedy
# forms (r'\[.*\]', r'\(.*\)') would also delete any real content sitting
# between two separate groups in the same cell.
citation_pattern = r'\[[^\]]*\]'   # citation markers, e.g. [12] or [18][19][20]
parenthetical_pattern = r'\([^)]*\)'  # parenthetical dates, e.g. (2022-07-26)

# Strip citation markers from the column names first, so the rename below does
# not depend on the exact citation number Wikipedia currently shows.
approval_ratings = approval_ratings.set_axis(
    approval_ratings.columns.str.replace(citation_pattern, '', regex=True),
    axis=1,
)
approval_ratings = approval_ratings.rename(columns={'Approvalaverage': 'Approval average'})

approval_ratings = (
    approval_ratings
    .replace(r'\n', '', regex=True)                 # Remove newline characters (\n)
    .replace(citation_pattern, '', regex=True)      # Remove citation symbols
    .replace(parenthetical_pattern, '', regex=True) # Remove parenthesised dates
    .replace(r'^\s+|\s+$', '', regex=True)          # Trim whitespace left behind by the removals
)

approval_ratings

Unnamed: 0,Order,President,Highest approval,Lowest approval,High –Low,Highest disapproval,Highest margin,Lowest margin,Final poll,Approval average,Pollsper year
0,46,Biden,57,38,16,59,20,−16,,,
1,45,Trump,49,34,15,62,4,−28,34.0,41.0,40.0
2,44,Obama,68,38,31,55,56,−18,59.0,47.9,48.4
3,43,G. W. Bush,90,25,65,71,83,−46,34.0,49.4,33.7
4,42,Clinton,73,37,36,54,48,−14,66.0,55.1,28.5
5,41,G. H. W. Bush,89,29,60,60,82,−30,56.0,60.9,39.5
6,40,Reagan,74,35,36,56,52,−21,63.0,52.8,37.0
7,39,Carter,74,28,46,59,66,−31,34.0,45.5,22.7
8,38,Ford,73,36,34,46,67,−7,53.0,47.2,14.7
9,37,Nixon,67,24,42,66,57,−42,24.0,49.1,17.7


In [38]:
# approval_ratings.to_csv('Presidents Approval Ratings.csv', index=False)

## Get Texts From Each President's Wikipedia Page

### Demo

In [5]:
# Download a single president's Wikipedia page to demonstrate the parsing steps.
wiki = 'https://en.wikipedia.org/wiki/Joe_Biden'
req = requests.get(wiki, timeout=30)  # requests has no default timeout
req.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Name the parser explicitly: the bare 'html' feature makes bs4 guess a parser,
# so the parse can differ between machines.
soup = BeautifulSoup(req.content, 'html.parser')

In [6]:
# Preview only the beginning of the page: the full prettified HTML is far too
# large to be useful as cell output and bloats the saved notebook.
print(soup.prettify()[:3000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Joe Biden - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"7ae03750-03ff-49e0-a35a-3121f1943317","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Joe_Biden","wgTitle":"Joe Biden","wgCurRevisionId":1117756551,"wgRevisionId":1117756551,"wgArticleId":145422,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Hungarian-language sources (hu)","Source attribution","All articles with dead external links","Articles with dead external links from January 20

In [7]:
# One CSS selector that matches body paragraphs (<p>) and section headlines
# (<span class="mw-headline">); select() returns the matches in document order.
# NOTE(review): depends on Wikipedia's page markup -- confirm the selector still matches.
section_selector = 'p, span.mw-headline'
headlines_and_text = soup.select(section_selector)

In [8]:
# Look at the raw markup of the first few matched tags only; printing every
# paragraph and headline on the page floods the cell output.
for tag in headlines_and_text[:5]:
    print(tag)
    print("---------------------")

<p class="mw-empty-elt">
</p>
---------------------
<p><b>Joseph Robinette Biden Jr.</b> (<span class="rt-commentedText nowrap"><span class="IPA nopopups noexcerpt" lang="en-fonipa"><a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="'b' in 'buy'">b</span><span title="/aɪ/: 'i' in 'tide'">aɪ</span><span title="'d' in 'dye'">d</span><span title="/ən/: 'on' in 'button'">ən</span></span>/</a></span> <span class="nowrap" style="font-size:85%">(<span class="unicode haudio"><span class="fn"><span style="white-space:nowrap;margin-right:.25em;"><a href="/wiki/File:En-us-Biden2.ogg" title="About this sound"><img alt="" data-file-height="20" data-file-width="20" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/11px-Loudspeaker.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8a/Loudspeaker.svg/17px-Loudspeaker.svg.png 1

In [10]:
# Group the page's paragraphs under the headline they appear beneath.
page_dict = {}
headline = "Introduction"  # text before the first headline has no heading of its own
for element in headlines_and_text:
    text = element.text.strip()

    # A "span" is a section headline: it becomes the current key.
    if element.name == 'span':
        headline = text
        continue

    # Everything else is a "p" paragraph; skip the ones with no text.
    if not text:
        continue

    # Start a new list for an unseen headline, otherwise extend the existing one.
    page_dict.setdefault(headline, []).append(text)

# Collapse each headline's list of paragraphs into one string,
# with a blank line between paragraphs.
page_dict = {title: "\n\n".join(paragraphs) for title, paragraphs in page_dict.items()}

print(page_dict['Introduction'])

Joseph Robinette Biden Jr. (/ˈbaɪdən/ (listen) BY-dən; born November 20, 1942) is an American politician who is the 46th and current president of the United States. A member of the Democratic Party, he previously served as the 47th vice president from 2009 to 2017 under President Barack Obama and represented Delaware in the United States Senate from 1973 to 2009.

Biden was born and raised in Scranton, Pennsylvania, and moved with his family to Delaware in 1953 when he was ten years old. He studied at the University of Delaware before earning his law degree from Syracuse University. He was elected to the New Castle County Council in 1970 and became the sixth-youngest senator in U.S. history after he was elected to the United States Senate from Delaware in 1972, at age 29. Biden was the chair or ranking member of the Senate Foreign Relations Committee for 12 years. He also chaired the Senate Judiciary Committee from 1987 to 1995; led the effort to pass the Violent Crime Control and Law 

In [11]:
# One row per section: the dict keys become "Headline", the values become "Text",
# and every row is labelled with the president the page belongs to.
df = (
    pd.DataFrame.from_dict(data=page_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'Headline', 0: 'Text'})
    .assign(President='Joe Biden')
)

df

Unnamed: 0,Headline,Text,President
0,Introduction,Joseph Robinette Biden Jr. (/ˈbaɪdən/ (listen)...,Joe Biden
1,Early life (1942–1965),Joseph Robinette Biden Jr. was born on Novembe...,Joe Biden
2,"Marriages, law school, and early career (1966–...","On August 27, 1966, Biden married Neilia Hunte...",Joe Biden
3,1972 U.S. Senate campaign in Delaware,"In 1972, Biden defeated Republican incumbent J...",Joe Biden
4,Death of wife and daughter,"On December 18, 1972, a few weeks after Biden ...",Joe Biden
5,Second marriage,Biden met the teacher Jill Tracy Jacobs in 197...,Joe Biden
6,Teaching,"From 1991 to 2008, as an adjunct professor, Bi...",Joe Biden
7,Senate activities,"In January 1973, secretary of the Senate Franc...",Joe Biden
8,Opposition to busing,"In the mid-1970s, Biden was one of the Senate'...",Joe Biden
9,Brain surgeries,"In February 1988, after several episodes of in...",Joe Biden


### Put Everything in the Same Dataset

In [6]:
# Presidents to scrape.
# Key: the number stored in the "Order" column of the scraped dataset.
# Value: the page title appended to https://en.wikipedia.org/wiki/ to build the
# page URL (underscores are later replaced by spaces for the "President" column).
presidents = {
    46: 'Joe_Biden',
    45: 'Donald_Trump',
    44: 'Barack_Obama',
    43: 'George_W._Bush',
    42: 'Bill_Clinton',
    41: 'George_H._W._Bush',
    40: 'Ronald_Reagan',
    39: 'Jimmy_Carter',
    38: 'Gerald_Ford',
    37: 'Richard_Nixon',
    36: 'Lyndon_B._Johnson',
    35: 'John_F._Kennedy',
    34: 'Dwight_D._Eisenhower',
    33: 'Harry_S._Truman',
    32: 'Franklin_D._Roosevelt'
}

In [7]:
# Scrape one president's page end to end, as a dry run for the full loop.
president = 'Donald_Trump'
order = 45

wiki = f'https://en.wikipedia.org/wiki/{president}' # indicate the Wikipedia page we want to scrape
req = requests.get(wiki, timeout=30)  # requests has no default timeout
req.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Name the parser explicitly: the bare 'html' feature makes bs4 guess a parser,
# so the parse can differ between machines.
soup = BeautifulSoup(req.content, 'html.parser')
headlines_and_text = soup.select('p, span.mw-headline') # all headlines and body paragraphs, in page order

page_dict = {} # headline -> list of paragraphs found under it
headline = "Introduction" # text before the first headline has no heading of its own
for tag in headlines_and_text:
    text = tag.text.strip()

    # Headlines are "span" tags: they switch the current key.
    if tag.name == 'span':
        headline = text

    # Paragraphs are "p" tags: keep the non-empty ones under the current headline.
    elif text:
        page_dict.setdefault(headline, []).append(text)

# Join each headline's paragraphs into a single string, separated by blank lines.
page_dict = { k: "\n\n".join(v) for (k, v) in page_dict.items()}

# Create the dataframe
# Each row is a section in a president's Wikipedia page. Each section has a different headline.
# Building from explicit columns keeps both "Headline" and "Text" present even
# when no sections were found.
df = pd.DataFrame({'Headline': list(page_dict.keys()), 'Text': list(page_dict.values())})
df['President'] = president
df['Order'] = order

df

Unnamed: 0,Headline,Text,President,Order
0,Introduction,"Donald John Trump (born June 14, 1946) is an A...",Donald_Trump,45
1,Early life,"Donald John Trump was born on June 14, 1946, a...",Donald_Trump,45
2,Family,"In 1977, Trump married Czech model Ivana Zelní...",Donald_Trump,45
3,Religion,Trump went to Sunday school and was confirmed ...,Donald_Trump,45
4,Health habits,"Trump has called golfing his ""primary form of ...",Donald_Trump,45
...,...,...,...,...
77,Promotion of conspiracy theories,"Before and throughout his presidency, Trump ha...",Donald_Trump,45
78,Racial views,Many of Trump's comments and actions have been...,Donald_Trump,45
79,Misogyny and allegations of sexual misconduct,Trump has a history of insulting and belittlin...,Donald_Trump,45
80,Incitement of violence,Research suggests Trump's rhetoric caused an i...,Donald_Trump,45


In [34]:
def fetch_wiki_soup(page_title):
    """Download an English Wikipedia article and return it parsed.

    Parameters
    ----------
    page_title : str
        Article title as it appears in the URL, e.g. 'Joe_Biden'.

    Returns
    -------
    BeautifulSoup
        The parsed page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(f'https://en.wikipedia.org/wiki/{page_title}', timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the parse does not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(response.content, 'html.parser')


def extract_sections(soup, separator=" "):
    """Group a page's paragraphs under the headline they appear beneath.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed Wikipedia page.
    separator : str, default " "
        String placed between the paragraphs of one section.

    Returns
    -------
    dict
        Maps each headline to the joined text of its non-empty paragraphs.
        Text before the first headline is filed under "Introduction".
    """
    sections = {}
    headline = "Introduction"
    # Headlines are "span" tags and paragraphs are "p" tags, returned in page order.
    for tag in soup.select('p, span.mw-headline'):
        text = tag.text.strip()
        if tag.name == 'span':
            headline = text
        elif text:  # skip empty paragraphs
            sections.setdefault(headline, []).append(text)
    return {title: separator.join(paragraphs) for title, paragraphs in sections.items()}


# Collect one frame per president and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all previous rows on every iteration.
scraped_frames = []
for order, president in presidents.items():
    page_dict = extract_sections(fetch_wiki_soup(president))

    # Each row is a section in a president's Wikipedia page.
    scraped_frames.append(pd.DataFrame({
        'President': president.replace('_', ' '),
        'Order': order,
        'Headline': list(page_dict.keys()),
        'Text': list(page_dict.values()),
    }))

presidents_scraped_df = pd.concat(scraped_frames, ignore_index=True)

# Data Cleaning
# Remove numeric citation markers such as [12] from the section text.
presidents_scraped_df['Text'] = presidents_scraped_df['Text'].replace(r'\[\d+\]','', regex=True)

In [35]:
# Display the combined dataset: one row per section of each president's page.
presidents_scraped_df

Unnamed: 0,President,Order,Headline,Text
0,Joe Biden,46,Introduction,Joseph Robinette Biden Jr. (/ˈbaɪdən/ (listen)...
1,Joe Biden,46,Early life (1942–1965),Joseph Robinette Biden Jr. was born on Novembe...
2,Joe Biden,46,"Marriages, law school, and early career (1966–...","On August 27, 1966, Biden married Neilia Hunte..."
3,Joe Biden,46,1972 U.S. Senate campaign in Delaware,"In 1972, Biden defeated Republican incumbent J..."
4,Joe Biden,46,Death of wife and daughter,"On December 18, 1972, a few weeks after Biden ..."
...,...,...,...,...
645,Franklin D. Roosevelt,32,Lynching,Roosevelt stopped short of joining NAACP leade...
646,Franklin D. Roosevelt,32,Japanese-Americans,The attack on Pearl Harbor raised concerns in ...
647,Franklin D. Roosevelt,32,Jews,There is controversy among historians about Ro...
648,Franklin D. Roosevelt,32,Historical reputation,Roosevelt is widely considered to be one of th...


In [32]:
# presidents_scraped_df.iloc[0]['Text']

In [36]:
# Spot-check the cleaned text of the second row (position 1).
presidents_scraped_df['Text'].iloc[1]

'Joseph Robinette Biden Jr. was born on November 20, 1942, at St. Mary\'s Hospital in Scranton, Pennsylvania, to Catherine Eugenia "Jean" Biden (née Finnegan) and Joseph Robinette Biden Sr. The oldest child in a Catholic family, he has a sister, Valerie, and two brothers, Francis and James. Jean was of Irish descent, while Joseph Sr. had English, Irish, and French Huguenot ancestry. Biden\'s paternal line has been traced to stonemason William Biden, who was born in 1789 in Westbourne, England, and emigrated to Maryland in the United States by 1820. Biden\'s father had been wealthy and the family purchased a home in the affluent Long Island suburb of Garden City in the fall of 1946, but he suffered business setbacks around the time Biden was seven years old, and for several years the family lived with Biden\'s maternal grandparents in Scranton. Scranton fell into economic decline during the 1950s and Biden\'s father could not find steady work. Beginning in 1953 when Biden was ten, the f

In [37]:
# presidents_scraped_df.to_csv('Presidents Scraped Wiki Text.csv', index=False)

# END