In [110]:
from bs4 import BeautifulSoup
from bs4 import NavigableString
import urllib.request as RequestLib
from string import Template
import re
import sys

In [2]:
def fetch(wiki_url):
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = RequestLib.Request(wiki_url, headers=headers)
    html = RequestLib.urlopen(req).read()
    return BeautifulSoup(html, 'html.parser')

In [159]:
def extract_birth(bsObj):
    bday = bsObj.select('span.bday')
    birthplace = bsObj.select('div.birthplace')
    if bday:
        bday = bday[0].text
    else:
        bday = bsObj.text
        
    if birthplace:
        birthplace = birthplace[0].text
    else:
        birthplace = None

    return {
        'date': bday,
        'place': birthplace
    }

In [160]:
def extract_death(bsObj):
    dday = bsObj.select('span')
    place = bsObj.select('div.deathplace')
    if dday:
        dday = dday[0].text
    else:
        bday = bsObj.text
        
    if place:
        place = place[0].text
    else:
        place = None

    return {
        'date': dday,
        'place': place
    }

In [114]:
def extract_awards(bsObj, base):
    val = []
    if bsObj.div:
        for li in bsObj.div.ul.select('li'):
            val.append({
                "title": li.a.text,
                "url": base+li.a.attrs['href'],
                'year': li.small.text[1:-1]
            })
    elif len(bsObj.text.split(',')) > 1:
        for e in bsObj.text.split(','):
            v = { 'title': e.strip() }
            year = re.search('\((\d{1,4})\)', e.strip())
            if year:
                v['year'] = year.groups()[0]
            val.append(v)
    else:
        v = ''
        year = None
        for e in bsObj.contents:
            if isinstance(e, NavigableString):
                year = re.search('\((\d{1,4})\)', e.strip())
                if year:
                    year = year.groups()[0]
            elif e.name == 'br':
                v = { 'title': v.strip() }
                if year:
                    v['year'] = year
                val.append(v)
                v = ''
            else:
                v += e.text

    return val

[{'year': '1967', 'title': 'Member of the National Academy of Sciences', 'url': 'x/wiki/Member_of_the_National_Academy_of_Sciences'}, {'year': '1975', 'title': 'Turing Award', 'url': 'x/wiki/Turing_Award'}, {'year': '1978', 'title': 'Nobel Prize in Economics', 'url': 'x/wiki/Nobel_Prize_in_Economics'}, {'year': '1986', 'title': 'National Medal of Science', 'url': 'x/wiki/National_Medal_of_Science'}, {'year': '1987', 'title': 'Harold Pender Award', 'url': 'x/wiki/Harold_Pender_Award'}, {'year': '1988', 'title': 'von Neumann Theory Prize', 'url': 'x/wiki/John_von_Neumann_Theory_Prize'}, {'year': '1969', 'title': 'APA Award for Distinguished Scientific Contributions to Psychology', 'url': 'x/wiki/APA_Award_for_Distinguished_Scientific_Contributions_to_Psychology'}, {'year': '1994', 'title': 'ACM Fellow', 'url': 'x/wiki/ACM_Fellow'}, {'year': '1995', 'title': 'IJCAI Award for Research Excellence', 'url': 'x/wiki/IJCAI_Award_for_Research_Excellence'}]
[{'title': 'Distinguished Scholar of Or

In [161]:
def parse_vcard(vcard):
    rows = vcard.select('tr')
    base = 'https://en.wikipedia.org'

    extract_method = {
        'Citizenship': 'text',
        'Children': 'text',
        'Alma\xa0mater': 'a-list',
        'Known\xa0for': 'a-list',
        'Fields': 'a-list',
        'Institutions': 'a-list',
        'Doctoral advisor': 'a-list',
        'Other\xa0academic advisors': 'a-list',
        'Doctoral students': 'a-list',
        'Influences': 'a-list',
        'Influenced': 'a-list'
    }
    
    name = rows[0].select('div.fn')[0].text
    data = {'name': name }

    for tr in rows:
        if tr.th and tr.th.attrs.get('scope'):
            td = tr.td
            key = tr.th.text.strip()
            method = extract_method.get(key)
            val = ""
            if method == 'a-list':
                val = []
                for a in td.select('a'):
                    if a.text.find('[') != -1:
                        continue
                    val.append({ 'title': a.text, 'url': base+a.attrs['href'] })
            elif method == 'text':
                val = td.text
            elif key == 'Born':
                val = extract_birth(td)
            elif key == 'Died':
                val = extract_death(td)
            elif key == 'Spouse(s)':
                val = []
                for d in td.select('div'):
                    val.append({ 'name': d.text })
            elif key == 'Awards':
                val = extract_awards(td, base)
            else:
                val = td

            data[key.replace('\xa0', ' ')] = val

    return data

In [116]:
def get_people(wiki_url):
    bsObj = fetch(wiki_url)
    title = bsObj.select('h1.firstHeading')[0].text
    biography = [['People_Name', title], ['Wikipedia_url', wiki_url]]

    vcard = parse_vcard(bsObj.select('table.vcard')[0])
    vcard['wiki_url'] = wiki_url
    return vcard

In [164]:
class MyTemplate(Template):
    idpattern = r'[_a-zA-Z][_:.a-zA-Z0-9]*'

template = MyTemplate("""
# ${name}
维基百科地址：[${name}](${wiki_url})
## 时间
### 生卒年月
${Born.date} - ${Died.date}
### 求学经历
${Alma_mater}
## 空间
### 学术领域
${Fields}
### 获奖情况
${Awards}
### 所属机构
${Institutions}
## 变量
### 主要成就
${Known_for}
### 合作关系

### 师承关系
#### 老师
${Doctoral_advisor}
${Other_academic_advisors}
#### 学生
${Doctoral_students}
""")

def render_md(biography):
    data = {}

    for k in biography:
        e = biography[k]
        k = k.replace(' ', '_')
        
        if type(e) is list:
            v = ""
            for a in e:
                if a.get('title'):
                    if a.get('url'):
                        v = v + '* [%s](%s)\n' % (a['title'], a['url'])
                    else:
                        v = v + '* %s\n' % (a['title'])
            data[k] = v
        elif type(e) is dict:
            for l in e:
                data[k+'.'+l] = e[l]
        else:
            data[k] = e
    return template.safe_substitute(data)

In [162]:
simon = get_people('https://en.wikipedia.org/wiki/Herbert_A._Simon')

In [167]:
burt = get_people('https://en.wikipedia.org/wiki/Ronald_Stuart_Burt')

In [168]:
kaneman = get_people('https://en.wikipedia.org/wiki/Daniel_Kahneman')

In [169]:
mccarth = get_people('https://en.wikipedia.org/wiki/John_McCarthy_(computer_scientist)')

In [118]:
people = [simon, burt, kaneman, mccarth]
ignore = ['Citizenship', 'Nationality', 'Born', 'Died', 'Institutions', 'Doctoral students', 'Fields', 'Known for', 
          'wiki_url', 'Website',
          'Doctoral advisor', 'name', 'Awards', 'Thesis',

          'Alma mater', 'Education', 'Residence',
          'Other academic advisors', 'Influenced', 'Influences', 

          'Spouse(s)', 'Children']

for main in people:
    for k in main.keys():
        if k in ignore:
            continue
        for p in people:
            print(p['name'], k, p.get(k))

print(simon)

{'Citizenship': 'United States', 'Known for': [{'title': 'Bounded rationality', 'url': 'https://en.wikipedia.org/wiki/Bounded_rationality'}, {'title': 'Satisficing', 'url': 'https://en.wikipedia.org/wiki/Satisficing'}], 'Awards': [{'year': '1967', 'title': 'Member of the National Academy of Sciences', 'url': 'https://en.wikipedia.org/wiki/Member_of_the_National_Academy_of_Sciences'}, {'year': '1975', 'title': 'Turing Award', 'url': 'https://en.wikipedia.org/wiki/Turing_Award'}, {'year': '1978', 'title': 'Nobel Prize in Economics', 'url': 'https://en.wikipedia.org/wiki/Nobel_Prize_in_Economics'}, {'year': '1986', 'title': 'National Medal of Science', 'url': 'https://en.wikipedia.org/wiki/National_Medal_of_Science'}, {'year': '1987', 'title': 'Harold Pender Award', 'url': 'https://en.wikipedia.org/wiki/Harold_Pender_Award'}, {'year': '1988', 'title': 'von Neumann Theory Prize', 'url': 'https://en.wikipedia.org/wiki/John_von_Neumann_Theory_Prize'}, {'year': '1969', 'title': 'APA Award for

In [170]:
print(render_md(simon))


# Herbert Simon
维基百科地址：[Herbert Simon](https://en.wikipedia.org/wiki/Herbert_A._Simon)
## 时间
### 生卒年月
1916-06-15 - (2001-02-09)
### 求学经历
* [University of Chicago](https://en.wikipedia.org/wiki/University_of_Chicago)

## 空间
### 学术领域
* [Economics](https://en.wikipedia.org/wiki/Economics)
* [Artificial intelligence](https://en.wikipedia.org/wiki/Artificial_intelligence)
* [Computer science](https://en.wikipedia.org/wiki/Computer_science)
* [Political science](https://en.wikipedia.org/wiki/Political_science)

### 获奖情况
* [Member of the National Academy of Sciences](https://en.wikipedia.org/wiki/Member_of_the_National_Academy_of_Sciences)
* [Turing Award](https://en.wikipedia.org/wiki/Turing_Award)
* [Nobel Prize in Economics](https://en.wikipedia.org/wiki/Nobel_Prize_in_Economics)
* [National Medal of Science](https://en.wikipedia.org/wiki/National_Medal_of_Science)
* [Harold Pender Award](https://en.wikipedia.org/wiki/Harold_Pender_Award)
* [von Neumann Theory Prize](https://en.wikipedia.

In [171]:
print(render_md(burt))


# Ronald Stuart Burt
维基百科地址：[Ronald Stuart Burt](https://en.wikipedia.org/wiki/Ronald_Stuart_Burt)
## 时间
### 生卒年月
1949 - ${Died.date}
### 求学经历
* [Johns Hopkins University](https://en.wikipedia.org/wiki/Johns_Hopkins_University)
* [University at Albany, SUNY](https://en.wikipedia.org/wiki/University_at_Albany,_SUNY)
* [University of Chicago](https://en.wikipedia.org/wiki/University_of_Chicago)

## 空间
### 学术领域
* [Mathematical sociology](https://en.wikipedia.org/wiki/Mathematical_sociology)
* [social networks](https://en.wikipedia.org/wiki/Social_network)

### 获奖情况
* Distinguished Scholar of Organizations and Management Theory
* Academy of Management (2007)  Fellow
* American Academy of Arts and Sciences (1993)  Fellow
* Center for Advanced Study in the Behavioral Sciences (1984)

### 所属机构
* [University of Chicago](https://en.wikipedia.org/wiki/University_of_Chicago)
* [Columbia University](https://en.wikipedia.org/wiki/Columbia_University)
* [University of California, Berkeley](https://