## Importing libraries and setting up functions

In [1]:
import json
import requests
import time
import pandas as pd

# importing beautifulsoup
from bs4 import BeautifulSoup

# importing re
import re

In [2]:
# child function
# retrieves the title of the page
def title_pull(soup):
    # pulling all the elements with the bigtitle class that are also td tags (set consists of single element)
    title = soup.find_all("td", class_="bigtitle")[0].text
    
    return(title)

In [3]:
# child function
# retrieves date of release
def date_pull(soup):
    # pulling all elements that are td tags with set attributes (set consists of single element)
    date = soup.find_all("td", height="28", align="center")[0].text
    
    return(date)

In [4]:
# child function
# retrieves body of text
def body_pull(soup):
    # pulling all elements that are p tags (set consists solely of the body of the press release)
    paragraphs = soup.find_all("p")
    
    # concatenates the paragraphs present in the page. Assumes individual paragraphs are unimportant (may be untrue)
    body = "".join([x.text for x in paragraphs])
    
    return(body)

In [18]:
# child function
# retrieves ambassador name
def amb_pull(soup):
    # pulling the ambassador bio link (correcting to ensure full url)
    bio_url = base_url + soup.find_all(href=re.compile("/ambassador/dsjl"))[0]["href"][5:]
    
    # pulling the biography page and setting up a soup object
    bio_page = requests.get(bio_url)
    bio_soup = BeautifulSoup(bio_page.content, "html.parser")
    
    # pulling out the ambassadors name line from the bio
    bio = bio_soup.find_all(class_="bigtitle")
    
    # running a regex on the title (which will be followed by their name)
    ambassador = re.search(r"(Ambassador.+)", bio[0].text).group()
    
    return(ambassador)

In [68]:
# parent function
# calls scraping sub-functions
# returns dict w/vals for each
def scraper(url):    
    # starting time
    start_time = time.time()
    
    # pulling the page of the url and creating a soup object from it
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    
    # temp try/except block
    try:
        # pulling the title, date, body, and ambassador
        title = title_pull(soup)
        date = date_pull(soup)
        body = body_pull(soup)
        ambassador = amb_pull(soup)
        
    except:
        print(url)
        return(float("NaN"))

    # calculating the delay
    delay = time.time() - start_time
    
    # pausing for the total time * 2 to be nice to the server we're querying from
    time.sleep(delay * 2)
    
    # returning the pulled stuff from the page(s)
    return([title, date, body, ambassador])

In [7]:
# setting the base url
base_url = "https://web.archive.org/web/"

# function to go ahead and construct the urls we'll need here
def ret_url(x):
    specific_url = base_url + str(x["timestamp"]) + "/" + x["original"]
    
    return(specific_url)

In [52]:
# function to retrieve the type of page from the url (i.e. htm, pdf, docx)
def ret_page_type(x):
    url_split = x.split(".")
    
    # pulling the last element in the url split on "." (the page type)
    page_type = "." + url_split[-1]
    
    # returning the page type
    return(page_type)

### Retrieving the JSON object of URLS to scrape

In [8]:
%%time
# starting url
start_url = "https://web.archive.org/cdx/search/cdx?url=http://www.chinese-embassy.org.uk/eng/PressandMedia/&matchType=prefix&output=json&limit=1000&showResumeKey=true&filter=statuscode:200&collapse=digest" 

# using requests to pull the json webpage
r = requests.get(start_url)

# setting the scrape response to the json from the request
scrape_response = r.json()

# as long as the last element is a resume key
while len(scrape_response[-1]) == 1:
    # popping the last item (resume key) & second to last item (blank list)
    resume_key = scrape_response.pop(-1)[0]
    if len(scrape_response[-1]) == 0:
        scrape_response.pop(-1)
        
    # updating the wayback url
    wayback_url = start_url + "&ResumeKey=" + resume_key   
    
    # pulling the resumed query
    r = requests.get(wayback_url).json()
    
    # extending the list of responses
    scrape_response.extend(r)
    
    # sleeping 5 seconds each time to be nice to the server
    time.sleep(5)

Wall time: 1.06 s


In [9]:
# converting the cdx response to a df
scrape_df = pd.DataFrame(scrape_response[1:], columns=scrape_response[0])

In [10]:
# removing the duplicates
scrape_df.drop_duplicates(subset="original", inplace=True)

In [11]:
# assigning the storage url
scrape_df["storage_url"] = scrape_df.apply(ret_url, axis=1)

In [58]:
%%time
# applying the function to figure out the types of the pages
scrape_df["page_type"] = scrape_df["storage_url"].apply(ret_page_type)

Wall time: 1 ms


In [61]:
scrape_df["page_type"].value_counts()

.htm                                               374
.pdf                                                51
.doc                                                 2
.uk/eng/PressandMedia/ChinaNewsletter201701/         1
.uk:80/eng/PressandMedia/ChinaNewsletter201701/      1
.uk/eng/PressandMedia/Spokepersons/                  1
.uk/eng/PressandMedia/Information/                   1
.uk:80/eng/PressandMedia/Spokepersons/               1
.uk:80/eng/PressandMedia/MediaService/               1
.uk/eng/PressandMedia/MediaService/                  1
.uk:80/eng/PressandMedia/Information/                1
Name: page_type, dtype: int64

In [64]:
# filtering out where the page type isn't htm
scrape_df = scrape_df[scrape_df["page_type"] == ".htm"]

### Pulling the elements from full set

In [69]:
%%time
# testing out how this works with the full dataset...
scrape_df["scraped_data"] = scrape_df["storage_url"].apply(scraper)

https://web.archive.org/web/20100819023353/http://www.chinese-embassy.org.uk:80/eng/PressandMedia/default.htm
Wall time: 31min 58s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Looking at the url that was returned above, we can safely ignore it. So we'll go ahead and drop the single nan value.

In [82]:
scrape_df = scrape_df.dropna()

In [91]:
scrape_df["storage_url"]

0      https://web.archive.org/web/20211130031323/htt...
1      https://web.archive.org/web/20211130052058/htt...
2      https://web.archive.org/web/20211130025452/htt...
3      https://web.archive.org/web/20211130042447/htt...
4      https://web.archive.org/web/20211130054122/htt...
                             ...                        
818    https://web.archive.org/web/20150501185000/htt...
822    https://web.archive.org/web/20150501181307/htt...
823    https://web.archive.org/web/20151019074049/htt...
826    https://web.archive.org/web/20150501180024/htt...
827    https://web.archive.org/web/20151019091307/htt...
Name: storage_url, Length: 373, dtype: object

In [93]:
pd.DataFrame(scrape_df["scraped_data"].to_list(), columns=['title', 'date', "body", "ambassador"])

Unnamed: 0,title,date,body,ambassador
0,Embassy Spokesperson's Remarks on Chinese oper...,2020-08-24 23:45,"Question: According to British media's report,...",Ambassador Zheng Zeguang
1,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:03,"Question: In an open letter, MPs will be urged...",Ambassador Zheng Zeguang
2,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:05,Question: A letter signed by religious leaders...,Ambassador Zheng Zeguang
3,Embassy Spokesperson's Remarks on The Times' R...,2020-08-29 07:08,"Question: On 28 August, The Times carried a re...",Ambassador Zheng Zeguang
4,Embassy Spokesperson's Remarks on the Claim by...,2020-09-01 23:40,"Question: On August 30th, Tom Tugendhat, Chair...",Ambassador Zheng Zeguang
...,...,...,...,...
368,The Chinese Embassy in the UK Responds to Comm...,2010/09/17,The following is a letter from Mrs. Dai Qingli...,Ambassador Liu Xiaoming
369,Chinese Embassy in the UK Refutes Jeremy Clark...,2012/01/13,"On 7th January 2012, The Sun carried Jeremy Cl...",Ambassador Liu Xiaoming
370,Chinese Embassy in the UK Refutes Jeremy Clark...,2012/01/13,"On 7th January 2012, The Sun carried Jeremy Cl...",Ambassador Liu Xiaoming
371,The Chinese Embassy refutes Guardian's report ...,2012/08/01,"On July 25th, The Guardian carried an article ...",Ambassador Liu Xiaoming


In [94]:
# creating a new df of the split scraped data & the storage url
# reseting the index on the storage url in transition to fix the issue with the indicies mismatch
scraped = pd.concat([scrape_df["storage_url"].reset_index(drop=True), 
                     pd.DataFrame(scrape_df["scraped_data"].to_list(), columns=['title', 'date', "body", "ambassador"])], 
                    axis=1)

In [99]:
# converting the date column to a datetime format
scraped["date"] = pd.to_datetime(scraped["date"])

In [100]:
scraped

Unnamed: 0,storage_url,title,date,body,ambassador
0,https://web.archive.org/web/20211130031323/htt...,Embassy Spokesperson's Remarks on Chinese oper...,2020-08-24 23:45:00,"Question: According to British media's report,...",Ambassador Zheng Zeguang
1,https://web.archive.org/web/20211130052058/htt...,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:03:00,"Question: In an open letter, MPs will be urged...",Ambassador Zheng Zeguang
2,https://web.archive.org/web/20211130025452/htt...,Embassy Spokesperson's Remarks on issues relat...,2020-08-25 21:05:00,Question: A letter signed by religious leaders...,Ambassador Zheng Zeguang
3,https://web.archive.org/web/20211130042447/htt...,Embassy Spokesperson's Remarks on The Times' R...,2020-08-29 07:08:00,"Question: On 28 August, The Times carried a re...",Ambassador Zheng Zeguang
4,https://web.archive.org/web/20211130054122/htt...,Embassy Spokesperson's Remarks on the Claim by...,2020-09-01 23:40:00,"Question: On August 30th, Tom Tugendhat, Chair...",Ambassador Zheng Zeguang
...,...,...,...,...,...
368,https://web.archive.org/web/20150501185000/htt...,The Chinese Embassy in the UK Responds to Comm...,2010-09-17 00:00:00,The following is a letter from Mrs. Dai Qingli...,Ambassador Liu Xiaoming
369,https://web.archive.org/web/20150501181307/htt...,Chinese Embassy in the UK Refutes Jeremy Clark...,2012-01-13 00:00:00,"On 7th January 2012, The Sun carried Jeremy Cl...",Ambassador Liu Xiaoming
370,https://web.archive.org/web/20151019074049/htt...,Chinese Embassy in the UK Refutes Jeremy Clark...,2012-01-13 00:00:00,"On 7th January 2012, The Sun carried Jeremy Cl...",Ambassador Liu Xiaoming
371,https://web.archive.org/web/20150501180024/htt...,The Chinese Embassy refutes Guardian's report ...,2012-08-01 00:00:00,"On July 25th, The Guardian carried an article ...",Ambassador Liu Xiaoming


In [102]:
scraped.to_json("PRC-UK_Embassy_press_releases.ndjson",orient="records",lines=True)