In [1]:
import jieba
import pinyin
import requests
import traceback
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm
from bs4 import BeautifulSoup as bs

In [2]:
content_folder = Path('./content')
content_folder.mkdir(exist_ok=True)


def get_name_by_url(url):
    '''
    Get name from $url

    :param: url: input url

    :return: the name parsed from url
    '''
    return url.split('/')[-1]


def content_from_url(url, override=False):
    '''
    Get content from $url.
    It checks content_folder first,
    if not found, it requests the content and saves.
    
    :param: url: input url
    :param: override: whether to overwrite existing contents, default is False

    :return: content: the content being requested from url
    '''
    name = get_name_by_url(url)
    p = content_folder.joinpath(name)

    if not p.is_file() or override:
        resp = requests.get(url)
        content = resp.content
        with open(p, 'wb') as f:
            f.write(content)
    else:
        content = open(p, 'rb').read()

    return content
        

In [3]:
url = 'http://www.gov.cn/guoqing/2009-10/09/content_2582666.htm'

content = content_from_url(url)

soup = bs(content)
type(soup)

bs4.BeautifulSoup

In [4]:

lst = []

_table = soup.find('table')
for td in tqdm(_table.find_all('td'), 'Search table'):
    try:
        text = td.text
        href = td.find('a').attrs['href']
        lst.append(dict(
            year=text.strip().replace(' ', ''),
            url=href
        ))
    except AttributeError:
        traceback.print_exc()
        continue

df = pd.DataFrame(lst)
df['name'] = df['url'].map(get_name_by_url)
raw_table = df
raw_table

Search table:   0%|          | 0/63 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "C:\Users\zcc\AppData\Local\Temp\ipykernel_26108\2796673242.py", line 7, in <module>
    href = td.find('a').attrs['href']
AttributeError: 'NoneType' object has no attribute 'attrs'
Traceback (most recent call last):
  File "C:\Users\zcc\AppData\Local\Temp\ipykernel_26108\2796673242.py", line 7, in <module>
    href = td.find('a').attrs['href']
AttributeError: 'NoneType' object has no attribute 'attrs'
Traceback (most recent call last):
  File "C:\Users\zcc\AppData\Local\Temp\ipykernel_26108\2796673242.py", line 7, in <module>
    href = td.find('a').attrs['href']
AttributeError: 'NoneType' object has no attribute 'attrs'


Unnamed: 0,year,url,name
0,1949,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
1,1950,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434234.htm
2,1951,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434235.htm
3,1952,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434236.htm
4,1953,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434238.htm
5,1954,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434242.htm
6,1955,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434243.htm
7,1956,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434249.htm
8,1957,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434252.htm
9,1958,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434254.htm


In [5]:
df = raw_table.copy()

contents = df['url'].map(content_from_url)

lst = []

for j, content in tqdm(enumerate(contents), 'Parse contents'):
    soup = bs(content)
    ps = soup.find_all('font', attrs=dict(id='Zoom'))[0].find_all('p')
    lst.append([e.text for e in ps])

    if '下一页' in soup.text:
        url = df.iloc[j]['url']
        url2 = url.replace('.htm', '_2.htm')
        print('----', url, url2)
        content2 = content_from_url(url2)
        soup2 = bs(content2)
        ps2 = soup2.find_all('font', attrs=dict(id='Zoom'))[0].find_all('p')
        lst[-1] += [e.text for e in ps2]

df['events'] = lst
df['num_events'] = df['events'].map(len)

detail_table = df
detail_table.iloc[:4]

Parse contents: 0it [00:00, ?it/s]

---- http://www.gov.cn/test/2009-10/09/content_1434220.htm http://www.gov.cn/test/2009-10/09/content_1434220_2.htm
---- http://www.gov.cn/test/2009-10/09/content_1434234.htm http://www.gov.cn/test/2009-10/09/content_1434234_2.htm
---- http://www.gov.cn/test/2009-10/09/content_1434297.htm http://www.gov.cn/test/2009-10/09/content_1434297_2.htm
---- http://www.gov.cn/test/2009-10/09/content_1434365.htm http://www.gov.cn/test/2009-10/09/content_1434365_2.htm


Unnamed: 0,year,url,name,events,num_events
0,1949,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm,"[ １９４９年, １０月１日　中华人民共和国中央人民政府成立。此前，中国人民政...",16
1,1950,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434234.htm,"[ １９５０年, １月６日　北京市军事管制委员会颁发布告，宣布收回在京的外国兵...",21
2,1951,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434235.htm,"[ １９５１年, ２月２３日　政务院通过《中华人民共和国劳动保险条例》，自本年...",7
3,1952,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434236.htm,"[ １９５２年, １月２６日　中共中央发出《关于首先在大中城市开展“五反”斗争...",8


In [6]:
df.columns

Index(['year', 'url', 'name', 'events', 'num_events'], dtype='object')

In [9]:
df = detail_table.copy()

lst = []
drop_titles = []
for j in tqdm(df.index, 'Build long table'):
    year = df.loc[j, 'year']
    url = df.loc[j, 'url']
    name = df.loc[j, 'name']
    events = df.loc[j, 'events']

    for event in events:
        event = event.strip()
        if len(event) == 5 and '年' in event:
            drop_titles.append(event)
            continue

        lst.append(dict(
            year=year,
            date=event.split()[0],
            event=event,
            url=url,
            name=name
        ))

print('Ignore the year titles:')
print(drop_titles)

df = pd.DataFrame(lst)

long_table = df
long_table.to_csv('long_table.csv')
long_table

Build long table:   0%|          | 0/60 [00:00<?, ?it/s]

Ignore the year titles:
['１９４９年', '１９５０年', '１９５１年', '１９５２年', '１９５３年', '１９５４年', '１９５５年', '１９５６年', '１９５７年', '１９５８年', '１９５９年', '１９６０年', '１９６１年', '１９６２年', '１９６３年', '１９６４年', '１９６５年', '１９６６年', '１９６７年', '１９６８年', '１９６９年', '１９７０年', '１９７１年', '１９７２年', '１９７３年', '１９７４年', '１９７５年', '１９７６年', '１９７７年', '１９７８年', '１９７９年', '１９８０年', '１９８１年', '１９８２年', '１９８３年', '１９８４年', '１９８５年', '１９８６年', '１９８７年', '１９８８年', '１９８９年', '１９９０年', '１９９１年', '１９９２年', '１９９３年', '１９９４年', '１９９５年', '１９９６年', '１９９７年', '１９９８年', '１９９９年', '２０００年', '２００１年', '２００２年', '２００３年', '２００４年', '２００５年', '２００６年', '２００８年', '２００９年']


Unnamed: 0,year,date,event,url,name
0,1949,１０月１日,１０月１日　中华人民共和国中央人民政府成立。此前，中国人民政治协商会议第一届全体会议于９月２...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
1,1949,１０月２日,１０月２日　苏联政府决定同新中国建立外交关系。３日，周恩来复电表示热忱欢迎，并互派大使。自１...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
2,1949,１０月９日,１０月９日　中国人民政治协商会议第一届全国委员会第一次会议召开，选举毛泽东为政协第一届全国委...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
3,1949,１０月１３日,１０月１３日　中国新民主主义青年团中央召开常委扩大会议，通过建立中国少年儿童队的决议。１９５...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
4,1949,１０月２１日,１０月２１日　中央人民政府政务院宣告成立。政务院设政治法律委员会、财政经济委员会、文化教育委...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434220.htm
...,...,...,...,...,...
572,2009,８月２５日,８月２５日　胡锦涛出席新疆维吾尔自治区干部大会并发表讲话，强调始终坚持一手抓改革发展，一手抓...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434432.htm
573,2009,９月１４日,９月１４日　胡锦涛等党和国家领导人会见“１００位为新中国成立作出突出贡献的英雄模范人物和１０...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434432.htm
574,2009,９月１５日－１８日,９月１５日－１８日　中共十七届四中全会召开，通过《中共中央关于加强和改进新形势下党的建设若干...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434432.htm
575,2009,９月２０日,９月２０日　胡锦涛等党和国家领导人出席庆祝中国人民政治协商会议成立６０周年大会，胡锦涛在会上...,http://www.gov.cn/test/2009-10/09/content_1434...,content_1434432.htm


In [42]:
words = []

def _parse_event(se):
    idx = se.name
    year = se['year']
    sentence = se['event'].split(maxsplit=1)[1]

    for word in jieba.cut(sentence):
        if len(word.strip()) < 2:
            continue
        words.append(dict(idx=idx, year=year, word=word))

long_table.apply(_parse_event, axis=1)

words = pd.DataFrame(words)
words['pinyin'] = words['word'].map(lambda e: pinyin.get(e, format='strip'))
words.to_csv('words.csv')
words


Unnamed: 0,idx,year,word,pinyin
0,0,1949,中华人民共和国中央人民政府,zhonghuarenmingongheguozhongyangrenminzhengfu
1,0,1949,成立,chengli
2,0,1949,此前,ciqian
3,0,1949,中国人民政治协商会议,zhongguorenminzhengzhixieshanghuiyi
4,0,1949,第一届,diyijie
...,...,...,...,...
12147,576,2009,十国集团,shiguojituan
12148,576,2009,领导人,lingdaoren
12149,576,2009,第三次,disanci
12150,576,2009,金融,jinrong
