# Web Scrape

### Import packages

In [1]:
%matplotlib inline
import pandas as pd
import urllib
from urllib import parse
from urllib import request
from matplotlib import pyplot as plt
from datetime import timedelta, date
import bs4
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import defaultdict

### Generate the request url

In [2]:
def mk_xwlb_url(base='http://mrxwlb.com/{}', y=2016, m=1, d=1):
    '''
    Build the url of the transcript page for one calendar day.

    A page is addressed by a Chinese title, e.g. http://mrxwlb.com/2016年1月1日新闻联播文字版,
    so the title is percent-encoded before it is substituted into the base template.

    input: base: url template containing one {} placeholder for the page name
           y, m, d: year, month and day that are written into the page name
    output: the complete percent-encoded url as a string
    '''
    raw_title = '{}年{}月{}日新闻联播文字版'.format(y, m, d)
    return base.format(parse.quote(raw_title))


# Smoke test: print the percent-encoded url generated for 2016-01-01
print(mk_xwlb_url(m=1))

http://mrxwlb.com/2016%E5%B9%B41%E6%9C%881%E6%97%A5%E6%96%B0%E9%97%BB%E8%81%94%E6%92%AD%E6%96%87%E5%AD%97%E7%89%88


### Define dates of interest

In [3]:
def inclusive_daterange(start_date, end_date):
    '''
    List every calendar day from start_date through end_date, both ends included.

    input: start_date, end_date: datetime.date objects
    output: list of datetime.date objects in ascending order;
            the list is empty when end_date is earlier than start_date
    '''
    day_count = int((end_date - start_date).days) + 1
    return [start_date + timedelta(offset) for offset in range(day_count)]

# First and last day of the period to scrape
start_date = date(2016, 1, 1)
end_date = date(2017, 12, 31)

# Sanity check: show the first ten generated dates
inclusive_daterange(start_date, end_date)[:10]

[datetime.date(2016, 1, 1),
 datetime.date(2016, 1, 2),
 datetime.date(2016, 1, 3),
 datetime.date(2016, 1, 4),
 datetime.date(2016, 1, 5),
 datetime.date(2016, 1, 6),
 datetime.date(2016, 1, 7),
 datetime.date(2016, 1, 8),
 datetime.date(2016, 1, 9),
 datetime.date(2016, 1, 10)]

### Start scraping
Note: the dates printed between the progress-bar updates are the days whose transcript page could not be fetched.

In [4]:
# `soups` maps a date string ('YYYY-MM-DD') to the parsed page of that day
soups = {}

for this_date in tqdm(inclusive_daterange(start_date, end_date)):
    # Build the percent-encoded url of this day's transcript page
    url = mk_xwlb_url(y=this_date.year, m=this_date.month, d=this_date.day)

    # Some web pages are missing; fetching them raises HTTPError, which we catch and report
    try:
        # The `with` block closes the HTTP response as soon as the page has been parsed,
        # so connections are not left open across the several hundred requests
        with request.urlopen(url) as html_doc:
            # Parse while the response is still open
            soup = BeautifulSoup(html_doc, "lxml")
    except urllib.error.HTTPError:
        # The request failed with an HTTP error: print the date and move on to the next day
        print(this_date)
        continue

    # Keep the parsed page, keyed by its date
    soups[this_date.strftime('%Y-%m-%d')] = soup

 22%|██▏       | 162/731 [05:26<19:06,  2.02s/it]

2016-06-10


 36%|███▌      | 263/731 [08:28<15:04,  1.93s/it]

2016-09-19


 37%|███▋      | 269/731 [08:38<14:50,  1.93s/it]

2016-09-25


100%|██████████| 731/731 [23:08<00:00,  1.90s/it]


As shown in the output, the transcripts for 2016-06-10, 2016-09-19, and 2016-09-25 were not found. Given that we have more than 700 transcripts to go through, skipping three should not undermine the quality of the analysis.

### Convert the texts into a formatted data frame
1. Data cleaning;
2. Assign unique section titles to each paragraph of the respective section;
3. Assign dates to each paragraph;
4. Note: each section (under a unique title) may have multiple paragraphs; each daily transcript may have multiple sections.

In [5]:
def soup_to_df(date_of_record, soup):
    '''
    Convert a soup object into a pandas dataframe for easy storage and export
    input: date_of_record: the date of the webpage
           soup: the soup object of the webpage of the corresponding date
    output: The pandas dataframe of the relevant information.
            Each row has 3 fields: title, content and date
            date is the same for the entire dataframe,
            title is the text of the most recent <strong> element found inside a p tag
            content is the stripped text of one child of a p tag
            A title is also emitted as a row of its own, whose content equals the title.
            Text that appears before the first title, and the text
            'Comments are disabled.', are left out.
    '''
    # find_all is the current bs4 name of this search; findAll is a deprecated alias.
    # The first 2 p tags are skipped as they are placeholders in the webpage
    p_tags = soup.find_all('p')[2:]
    title = '<UNKNOWN>'
    results = []

    for tag in p_tags:
        # Some p-tagged paragraphs hold nothing but an html comment, which is not useful
        if isinstance(tag.string, bs4.element.Comment):
            continue

        # Some titles are hidden inside a paragraph next to ordinary text,
        # so every child of the paragraph is inspected separately
        for child in tag.children:
            # Children that expose no text through .string are skipped
            if not child.string:
                continue

            content = child.string.strip()
            # A <strong> child starts a new section: its text becomes the current title
            # (and, by convention here, also the content of its own row)
            if child.name and child.name == 'strong':
                title = content

            # Deal with some irregularities: drop text that precedes the first title
            # and the footer notice of the page
            if title != '<UNKNOWN>' and content != 'Comments are disabled.':
                results.append({'title': title, 'content': content, 'date': date_of_record})

    # return the pandas dataframe
    return pd.DataFrame(results)

# Build one dataframe per scraped day, stack them into a single frame with a fresh index,
# write the result to disk and preview the first rows
content_df = pd.concat(
    [soup_to_df(day, page) for day, page in tqdm(soups.items())],
    ignore_index=True,
)
content_df.to_csv(
    "../data/1_xwlb_content_title_daily.csv",
    index=False,
    encoding='gb18030',
)
content_df.head()

100%|██████████| 728/728 [00:00<00:00, 1010.29it/s]


Unnamed: 0,content,date,title
0,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...
1,中国人民解放军陆军领导机构、中国人民解放军火箭军、中国人民解放军战略支援部队成立大会2015...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...
2,下午4时，成立大会开始，全场高唱国歌。仪仗礼兵护卫着鲜艳军旗，正步行进到主席台前。习近平将军...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...
3,授旗仪式后，习近平致训词。他指出：“成立陆军领导机构、火箭军、战略支援部队，是党中央和中央军...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...
4,习近平强调，陆军是党最早建立和领导的武装力量，历史悠久，敢打善战，战功卓著，为党和人民建立了...,2016-01-01,陆军领导机构火箭军战略支援部队成立大会在京举行 习近平向中国人民解放军陆军火箭军战略支援部队...
