In [56]:
import json
import requests
import time
import pandas as pd

# importing beautifulsoup
from bs4 import BeautifulSoup

# importing re
import re

In [2]:
# root that every wayback machine capture url is built on
base_url = "https://web.archive.org/web/"


def ret_url(x):
    """Build the capture URL for one record.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide a "timestamp" entry (converted with ``str``) and an
        "original" entry (must already be a string).

    Returns
    -------
    str
        ``base_url`` followed by the timestamp, a slash, and the original url.
    """
    pieces = [base_url, str(x["timestamp"]), "/", x["original"]]
    return "".join(pieces)

### Retrieving the JSON object of URLs to scrape

In [3]:
%%time
# starting url: cdx query over everything under the PressandMedia prefix
# (json output, max 1000 rows per page, resume key requested, status 200 only, collapsed on digest)
start_url = "https://web.archive.org/cdx/search/cdx?url=http://www.chinese-embassy.org.uk/eng/PressandMedia/&matchType=prefix&output=json&limit=1000&showResumeKey=true&filter=statuscode:200&collapse=digest" 

# using requests to pull the json webpage
r = requests.get(start_url)

# failing loudly on an http error instead of trying to parse an error page as json
r.raise_for_status()

# setting the scrape response to the json from the request
scrape_response = r.json()

# as long as the last element is a resume key (a single-item list)
# the emptiness check keeps an empty response from raising an IndexError here
while scrape_response and len(scrape_response[-1]) == 1:
    # popping the last item (resume key) & second to last item (blank list)
    resume_key = scrape_response.pop(-1)[0]
    if scrape_response and len(scrape_response[-1]) == 0:
        scrape_response.pop(-1)

    # updating the wayback url
    # the cdx api documents this parameter as "resumeKey" (lower-case r); the previous
    # "ResumeKey" spelling risks being ignored, which would re-fetch page one forever
    wayback_url = start_url + "&resumeKey=" + resume_key

    # sleeping 5 seconds before each follow-up request to be nice to the server
    time.sleep(5)

    # pulling the resumed query
    resumed_page = requests.get(wayback_url)
    resumed_page.raise_for_status()
    resumed_rows = resumed_page.json()

    # if the resumed page repeats the header row, drop it so it isn't stored as a capture
    if resumed_rows and resumed_rows[0] == scrape_response[0]:
        resumed_rows = resumed_rows[1:]

    # extending the list of responses
    scrape_response.extend(resumed_rows)

Wall time: 542 ms


In [4]:
# building the df from the cdx response:
# row 0 supplies the column names, every later row becomes a record
cdx_header = scrape_response[0]
cdx_records = scrape_response[1:]
scrape_df = pd.DataFrame(cdx_records, columns=cdx_header)

In [5]:
# dropping rows that repeat an "original" url (the first occurrence of each is kept)
# done in place so scrape_df stays the same object for the cells below
scrape_df.drop_duplicates(subset=["original"], keep="first", inplace=True)

In [6]:
# building one storage url per row with ret_url, then saving them as a new column
storage_urls = scrape_df.apply(ret_url, axis="columns")
scrape_df["storage_url"] = storage_urls

### Pulling the elements from a single webpage (to start)

In [8]:
# The URL we'll practice on: the storage url of the first row of scrape_df
# (just the plain value, not a Series)
url = scrape_df["storage_url"].iloc[0]

In [9]:
# displaying the practice url as a sanity check
url

'https://web.archive.org/web/20211130031323/http://www.chinese-embassy.org.uk/eng/PressandMedia/202008/t20200824_3278215.htm'

In [65]:
# running the full scrape on the practice url
# NOTE: scraper() and its child functions are defined in cells further down this
# notebook, so those cells have to be run before this one on a fresh kernel
scraper(url)

Wall time: 2.14 s


["Embassy Spokesperson's Remarks on Chinese operating overseas fishing vessels",
 '2020-08-24 23:45',
 "Question: According to British media's report, Ecuador on alert over huge Chinese fishing fleet off Galapagos Islands. What is the comment of the Chinese Embassy in the UK?Embassy Spokesperson: China and Ecuador are in friendly communication through bilateral channels. On August 6, the fishery authorities of the two countries held a productive video teleconference, and reached positive consensus. Meanwhile, as a contribution to the protection of fishery resources in the region, China's fishery authority has decided to ban fishing in the high seas west of the Galapagos Islands Marine Reserve from September to November this year, which has been appreciated by Ecuador and other relevant countries.China, as a responsible and big fishing country, attaches great importance to the protection of the marine environment and resources and implements the strictest possible monitoring and control

In [50]:
# child function
def title_pull(soup):
    """Retrieve the title of the page.

    Looks up every ``td`` tag carrying the ``bigtitle`` class and returns the
    text of the first one. Raises ``IndexError`` when the page has no such tag.
    """
    title_cells = soup.find_all("td", class_="bigtitle")
    first_cell = title_cells[0]

    return first_cell.text

In [51]:
# child function
def date_pull(soup):
    """Retrieve the date of release.

    Looks up every ``td`` tag with ``height="28"`` and ``align="center"`` and
    returns the text of the first one. Raises ``IndexError`` when nothing matches.
    """
    date_cells = soup.find_all("td", height="28", align="center")
    first_cell = date_cells[0]

    return first_cell.text

In [58]:
# child function
def body_pull(soup, sep=""):
    """Retrieve the body text of the page.

    Collects every ``p`` tag in the page and joins their text into one string.

    Parameters
    ----------
    soup : parsed page exposing ``find_all``
    sep : str, optional
        String placed between consecutive paragraphs. The default ``""`` keeps
        the previous behaviour, where paragraphs run straight into each other
        (e.g. "...in the UK?Embassy Spokesperson: ..."); pass ``"\\n"`` or
        ``" "`` to keep paragraph boundaries visible.

    Returns
    -------
    str
        The joined paragraph text; an empty string when the page has no ``p`` tags.
    """
    # pulling all elements that are p tags
    paragraphs = soup.find_all("p")

    # joining the paragraph texts with the requested separator
    return sep.join(paragraph.text for paragraph in paragraphs)

In [60]:
# child function
def amb_pull(soup):
    """Retrieve the ambassador name by following the page's biography link.

    Finds the first element whose ``href`` matches "/ambassador/dsjl", fetches
    that biography page, and returns the text running from "Ambassador" to the
    end of that line in the page's first ``bigtitle`` element.

    Raises ``IndexError`` when either page lacks the expected element and
    ``AttributeError`` when the title contains no "Ambassador" text.
    """
    # locating the ambassador bio link on the press-release page
    bio_links = soup.find_all(href=re.compile("/ambassador/dsjl"))
    bio_href = bio_links[0]["href"]

    # dropping the first five characters before prefixing base_url
    # NOTE(review): presumably the href begins with "/web/", which base_url already ends with - confirm
    bio_url = base_url + bio_href[5:]

    # fetching the biography page and parsing it
    bio_response = requests.get(bio_url)
    bio_soup = BeautifulSoup(bio_response.content, "html.parser")

    # the first bigtitle element is expected to hold the ambassador's name line
    name_line = bio_soup.find_all(class_="bigtitle")[0]

    # keeping everything from the word "Ambassador" onwards
    name_match = re.search(r"(Ambassador.+)", name_line.text)

    return name_match.group()

In [64]:
# parent function
def scraper(url):
    """Scrape one page by calling each of the child functions on it.

    Fetches ``url``, parses it, and pulls the title, date, body, and
    ambassador. Before returning, sleeps for twice the time the scrape took
    to be nice to the server being queried.

    Returns
    -------
    list
        ``[title, date, body, ambassador]`` (a list, not a dict).
    """
    # noting when we began so the polite pause can scale with the work done
    began_at = time.time()

    # fetching the page and parsing it into a soup object
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # running the child functions in order: title, date, body, ambassador
    pulled = [
        title_pull(soup),
        date_pull(soup),
        body_pull(soup),
        amb_pull(soup),
    ]

    # pausing for twice the elapsed time
    elapsed = time.time() - began_at
    time.sleep(elapsed * 2)

    return pulled