# Web scraping example



## Imports and setup

In [1]:

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## scraping and processing
import itertools
import re
import urllib
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

# Example 1: use html parser to bulk download files and write them locally (e.g., data files; pdfs)


## Step 1: parse the content of main url

In [2]:
main_link = "https://osse.dc.gov/service/hearing-officer-determinations"

## similar to an API call, use a requests.get call
## to fetch the raw HTML content
## - timeout: requests waits indefinitely by default, so cap the wait (in seconds)
## - raise_for_status: stop here on a 4xx/5xx response instead of
##   parsing an error page as if it were the real content
main_response = requests.get(main_link, timeout=30)
main_response.raise_for_status()
raw_content = main_response.text

## print the content- not very meaningful
# raw_content

## parse that content using beautiful soup
## constructor
## here, we're using the html parser
parsed_content = BeautifulSoup(raw_content, "html.parser")
#print(parsed_content.prettify())


## Step 2: look for the links you want in that content

In [18]:

## first, we try extracting all links on the page with hearing-officer
## in the link
## using the find_all method: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
## which takes the name of a tag (a)
## and any attributes to filter on (href, matched against a compiled regex)
hearoff_pattern = re.compile("hearing-officer")
links_hearoff = parsed_content.find_all("a", attrs={"href": hearoff_pattern})
links_hearoff

## see that the matches are a combination of a few different patterns:
### 1. relative links with no base url - eg /service/hearing-officer-determinations
### 2. base url + year - eg https://osse.dc.gov/page/2009-hearing-officer-determinations
###    (the page name can differ slightly by year - eg https://osse.dc.gov/page/2019-hearing-officers-determinations)

## let's follow one and see what it contains
## use the .get("href") to extract
## what belongs to the href attribute
hearoff_hrefs = (anchor.get("href") for anchor in links_hearoff)
example_link_hearoff = [href for href in hearoff_hrefs
                        if "2009-hearing-officer-determinations" in href]
example_link_hearoff



[<a class="active-trail active-trail active" href="/service/hearing-officer-determinations">Hearing Officer Determinations</a>,
 <a class="active-trail active" href="/service/hearing-officer-determinations">Hearing Officer Determinations</a>,
 <a accesskey="L" class="rsbtn_play" href="//app-na.readspeaker.com/cgi-bin/rsent?customerid=6295&amp;lang=en_us&amp;readid=content-start&amp;url=https://osse.dc.gov/service/hearing-officer-determinations" rel="nofollow" title="Listen to this page using ReadSpeaker webReader">
 <span class="rsbtn_left rsimg rspart"><span class="rsbtn_text"><span>Listen</span></span></span>
 <span class="rsbtn_right rsimg rsplay rspart"></span>
 </a>,
 <a href="https://osse.dc.gov/page/2009-hearing-officer-determinations">2009</a>,
 <a href="https://osse.dc.gov/page/2010-hearing-officer-determinations">2010</a>,
 <a href="https://osse.dc.gov/page/2011-hearing-officer-determinations">2011</a>,
 <a href="https://osse.dc.gov/page/2012-hearing-officer-determinations">2

['https://osse.dc.gov/page/2009-hearing-officer-determinations']

## Step 3: follow the link to one year of data

In [43]:
## fetch the one-year page found in step 2 and parse it
## (timeout caps the wait in seconds; raise_for_status stops on a 4xx/5xx response)
oneyear_response = requests.get(example_link_hearoff[0], timeout=30)
oneyear_response.raise_for_status()
soupified_link_oneyear = BeautifulSoup(oneyear_response.text, "html.parser")
#print(soupified_link_oneyear.prettify())

## see that the individual month links are stored with pattern node
## get those links
month_links = soupified_link_oneyear.find_all("a", attrs={"href": re.compile("node")})
month_links

## example where we want to get the link
## with March in title
## see from inspecting the page that the month hrefs are relative
## (eg /node/540262), so we need to add the site url back;
## urljoin resolves each href against the url of the page it came from
## and leaves an href that is already absolute (eg https://oca.dc.gov/node/...)
## unchanged, where plain string concatenation would produce a broken url
march_link = [urljoin(oneyear_response.url, link.get("href"))
              for link in month_links
              if link.get("title") is not None
              and "March" in link.get("title")]
march_link


[<a href="https://oca.dc.gov/node/160652" target="_blank" title="Agency Performance">Agency Performance</a>,
 <a href="/node/1269671">Amharic (አማርኛ)</a>,
 <a href="/node/1269696">Korean (한국어)</a>,
 <a href="/node/1269736">Spanish (Español)</a>,
 <a href="/node/1269741">Vietnamese (Tiếng Việt)</a>,
 <a href="/node/484662">January 2009</a>,
 <a href="/node/484682">February 2009</a>,
 <a href="/node/540262" title="March 2009">March 2009</a>,
 <a href="/node/540312" title="April 2009">April 2009</a>,
 <a href="/node/540372" title="June 2009">June 2009</a>,
 <a href="/node/540402" title="July 2009">July 2009</a>,
 <a href="/node/540452" title="August 2009">August 2009</a>,
 <a href="/node/540502" title="September 2009">September 2009</a>,
 <a href="/node/540522" title="October 2009">October 2009</a>,
 <a href="/node/540542" title="November 2009">November 2009</a>,
 <a href="/node/540602" title="December 2009">December 2009</a>]

['https://osse.dc.gov/node/540262']

## Step 4: follow the link to one month of data

In [44]:
## follow that march link
## (timeout caps the wait in seconds; raise_for_status stops on a 4xx/5xx response)
onemonth_response = requests.get(march_link[0], timeout=30)
onemonth_response.raise_for_status()
soupified_link_oneyear_onemonth = BeautifulSoup(onemonth_response.text, "html.parser")

#print(soupified_link_oneyear_onemonth.prettify())

## approach one: search the href attributes for links with HOD and "pdf" in the name
## the pattern is a raw string (r"...") so the backslash in \. is passed to the
## regex engine as written; in a normal string "\." is an invalid escape sequence
pdfs_oneyear_onemonth = soupified_link_oneyear_onemonth.find_all(
    "a", attrs={"href": re.compile(r".*HOD.*\.pdf$")})
pdfs_oneyear_onemonth[0:5]

## approach two: search the type attributes for ones that are application/pdf type
pdfs_oneyear_onemonth_approach2 = soupified_link_oneyear_onemonth.find_all(
    "a", attrs={"type": re.compile("pdf")})
pdfs_oneyear_onemonth_approach2[0:5]

[<a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.01_0.pdf" type="application/pdf; length=931802">HOD.0309.01.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.02.pdf" type="application/pdf; length=628539">HOD.0309.02.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.03r.pdf" type="application/pdf; length=1010847">HOD.0309.03r.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.04r.pdf" type="application/pdf; length=1432454">HOD.0309.04r.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.05.pdf" type="application/pdf; length=955708">HOD.0309.05.pdf</a>]

[<a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.01_0.pdf" type="application/pdf; length=931802">HOD.0309.01.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.02.pdf" type="application/pdf; length=628539">HOD.0309.02.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.03r.pdf" type="application/pdf; length=1010847">HOD.0309.03r.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.04r.pdf" type="application/pdf; length=1432454">HOD.0309.04r.pdf</a>,
 <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/HOD.0309.05.pdf" type="application/pdf; length=955708">HOD.0309.05.pdf</a>]

## Step 5: write the results

In [41]:
## write one pdf
### first, need to create connection to the pdf
### - certificate verification stays on (the requests default): the pdf is served
###   from the same https://osse.dc.gov host as the pages fetched above
### - timeout caps the wait in seconds; raise_for_status stops on a 4xx/5xx
###   response so an error page is never saved as if it were the pdf
first_pdf = requests.get(pdfs_oneyear_onemonth_approach2[0].get("href"), timeout=30)
first_pdf.raise_for_status()

### then, tell it where to put the file
### and that we're writing rather than reading ('wb' = write bytes)
### the with block closes the file for us, even if the write fails
with open("../public_data/firstpdf.pdf", 'wb') as pdf_file:
    ### write the file
    n_bytes_written = pdf_file.write(first_pdf.content)

### number of bytes written to disk
n_bytes_written

### can make more general by writing it with the name of the file
### (raw string for the regex; everything up to and including "attachments/" is dropped)
more_specific_fname = "../public_data/" + re.sub(r".*attachments/", "",
                                                 pdfs_oneyear_onemonth_approach2[0].get("href"))
more_specific_fname

### then would feed that to the open step above

931802

'../public_data/HOD.0309.01_0.pdf'

# Example 2: using the lxml parser to pull DOJ press releases

- Based on code in: https://github.com/jbencina/dojreleases
- See here for a discussion of html parser versus lxml parser: https://stackoverflow.com/questions/25714417/beautiful-soup-and-table-scraping-lxml-vs-html-parser

## Step 1: parse the content of the main url

In [73]:
doj_news = "https://www.justice.gov/news"

## fetch the news landing page and parse it with the lxml parser
## (timeout caps the wait in seconds; raise_for_status stops on a 4xx/5xx response)
doj_news_response = requests.get(doj_news, timeout=30)
doj_news_response.raise_for_status()
doj_news_soupified = BeautifulSoup(doj_news_response.content, "lxml")
# print(doj_news_soupified.prettify())


## Step 2: from that content, extract links to the press releases

In [54]:
### Approach 1: pull the div containers that wrap each release title,
### selected by their class attribute
release_title_attrs = {'class': 'views-field views-field-title'}
all_releases = doj_news_soupified.find_all('div', release_title_attrs)
all_releases[:5]

### Approach 2: pull the links themselves, using the a tag
### and an href that contains opa/pr
release_href_pattern = re.compile("opa/pr")
all_releases_v2 = doj_news_soupified.find_all("a", {'href': release_href_pattern})
all_releases_v2[:5]

[<div class="views-field views-field-title"> <span class="field-content"><a href="/opa/pr/joint-press-statement-us-attorney-general-merrick-garland-and-european-commissioner-justice">Joint Press Statement by U.S. Attorney General Merrick Garland and European Commissioner for Justice Didier Reynders</a></span> </div>,
 <div class="views-field views-field-title"> <span class="field-content"><a href="/opa/pr/former-minister-government-bolivia-owner-florida-based-company-and-three-others-charged">Former Minister of Government of Bolivia, Owner of Florida-Based Company, and Three Others Charged in Bribery and Money Laundering Scheme</a></span> </div>,
 <div class="views-field views-field-title"> <span class="field-content"><a href="/opa/pr/attorney-general-merrick-b-garland-announces-new-effort-reduce-violent-crime">Attorney General Merrick B. Garland Announces New Effort to Reduce Violent Crime</a></span> </div>,
 <div class="views-field views-field-title"> <span class="field-content"><a h

[<a href="/opa/pr/joint-press-statement-us-attorney-general-merrick-garland-and-european-commissioner-justice">Joint Press Statement by U.S. Attorney General Merrick Garland and European Commissioner for Justice Didier Reynders</a>,
 <a href="/opa/pr/former-minister-government-bolivia-owner-florida-based-company-and-three-others-charged">Former Minister of Government of Bolivia, Owner of Florida-Based Company, and Three Others Charged in Bribery and Money Laundering Scheme</a>,
 <a href="/opa/pr/attorney-general-merrick-b-garland-announces-new-effort-reduce-violent-crime">Attorney General Merrick B. Garland Announces New Effort to Reduce Violent Crime</a>,
 <a href="/opa/pr/doj-announces-coordinated-law-enforcement-action-combat-health-care-fraud-related-covid-19">DOJ Announces Coordinated Law Enforcement Action to Combat Health Care Fraud Related to COVID-19</a>,
 <a href="/opa/pr/two-bank-executives-charged-conspiring-launder-hundreds-millions-dollars-through-us-financial">Two Bank E

## Step 3: visit a link and extract the text

In [72]:
## from those results, get the link content
link_follow = [release.get("href") for release in all_releases_v2
               if "bolivia" in release.get("href")]
link_follow

## add to base url
link_withbase = "https://justice.gov" + link_follow[0]
link_withbase

## visit that page and soupify
## (timeout caps the wait in seconds; raise_for_status stops on a 4xx/5xx response)
one_pr_response = requests.get(link_withbase, timeout=30)
one_pr_response.raise_for_status()
one_pr_page_soupified = BeautifulSoup(one_pr_response.content, 'lxml')
#one_pr_page_soupified.prettify()

## find the div (generic container for content)
## that has the class attribute we can inspect on the page
one_pr_body = one_pr_page_soupified.find(
    'div',
    {'class':
     'field field--name-field-pr-body field--type-text-long field--label-hidden'})

## find returns None when nothing matches; fail with a clear message
## rather than an AttributeError on the next line
if one_pr_body is None:
    raise ValueError("press release body not found at " + link_withbase)

## get all paragraphs inside that div
one_pr_text_list = [p.text for p in one_pr_body.find_all('p')]
one_pr_text_list

## join into a single string
one_pr_text_str = " ".join(one_pr_text_list)
one_pr_text_str

## can then write to text file
## name the file after the last piece of the link (the part after /opa/pr/)
pr_fname = "../public_data/" + link_follow[0].rsplit("/", 1)[-1] + ".txt"
pr_fname

## the text contains non-ascii characters (eg \xa0), so set the encoding
## explicitly instead of relying on the platform default;
## the with block closes the file for us, even if the write fails
with open(pr_fname, "w", encoding="utf-8") as pr_write:
    n_chars_written = pr_write.write(one_pr_text_str)

## number of characters written
n_chars_written

['/opa/pr/former-minister-government-bolivia-owner-florida-based-company-and-three-others-charged']

'https://justice.gov/opa/pr/former-minister-government-bolivia-owner-florida-based-company-and-three-others-charged'

['Two Bolivian nationals and three U.S. citizens were arrested on May 21, and May\xa022, in Florida and Georgia on criminal charges related to their alleged roles in a bribery and money laundering scheme. The former Minister of Government of Bolivia and another former Bolivian official are accused of receiving bribes paid by a U.S. company and individuals to secure a Bolivian government contract, and then using the U.S. financial system to launder those bribes.',
 'According to court documents, Arturo Carlos Murillo Prijic, 57, Sergio Rodrigo Mendez Mendizabal, 51, Luis Berkman, 58, Bryan Berkman, 36, and Philip Lichtenfeld, 48, engaged in the bribery scheme between approximately November 2019 and April 2020.\xa0During that time, Luis Berkman, Bryan Berkman, and Lichtenfeld paid $602,000 in bribes to Bolivian government officials for the benefit of Murillo, the former Minister of Government of Bolivia, Mendez, the former Chief of Staff of the Ministry of Government of Bolivia, and anot

'Two Bolivian nationals and three U.S. citizens were arrested on May 21, and May\xa022, in Florida and Georgia on criminal charges related to their alleged roles in a bribery and money laundering scheme. The former Minister of Government of Bolivia and another former Bolivian official are accused of receiving bribes paid by a U.S. company and individuals to secure a Bolivian government contract, and then using the U.S. financial system to launder those bribes. According to court documents, Arturo Carlos Murillo Prijic, 57, Sergio Rodrigo Mendez Mendizabal, 51, Luis Berkman, 58, Bryan Berkman, 36, and Philip Lichtenfeld, 48, engaged in the bribery scheme between approximately November 2019 and April 2020.\xa0During that time, Luis Berkman, Bryan Berkman, and Lichtenfeld paid $602,000 in bribes to Bolivian government officials for the benefit of Murillo, the former Minister of Government of Bolivia, Mendez, the former Chief of Staff of the Ministry of Government of Bolivia, and another B

'../public_data/former-minister-government-bolivia-owner-florida-based-company-and-three-others-charged.txt'

2833