In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import codecs

In [2]:
# Root of the site; every scraped href is relative to this.
home = 'http://www.chinavitae.com'
# Entry page of the institution tree, derived from `home` so the two cannot drift apart.
institution = home + '/institution/'
# A timeout keeps a stalled connection from hanging the kernel indefinitely;
# raise_for_status() stops here instead of letting later cells parse an error page.
start_html = requests.get(institution, timeout=30)
start_html.raise_for_status()

In [3]:
# Parse the institution index page with the lxml backend.
soup = BeautifulSoup(start_html.text, features='lxml')

In [18]:
""" 
    First layer: 14 categories of institution: 'http://www.chinavitae.com/institution/xxx'
    Find <a> and 'href' attributes 
    Output: a_tag_dic = {link:Category_name}
    A dictionary of 14 elements, each of them is a category of institution
"""
def find_a_tag(soup, base_url=None):
    """Collect every institution link found in a parsed page.

    Scans all <a> tags of `soup` and keeps those whose href starts with
    '/institution/' followed by at least one more character.

    Args:
        soup: parsed document exposing find_all('a'), e.g. a BeautifulSoup
            object. Each returned tag must support has_attr(), item access
            and a `.text` attribute.
        base_url: prefix prepended to each matching href to build the
            dictionary key. Defaults to the notebook-level `home`.

    Returns:
        dict mapping absolute URL -> anchor text. When several anchors share
        the same href, the text of the last one wins.
    """
    if base_url is None:
        base_url = home
    # Compile once per call rather than once per anchor.
    attr_search = re.compile('/institution/.+')
    a_tag_dic = {}
    for a in soup.find_all('a'):
        if not a.has_attr('href'):
            continue
        # match() anchors at the start, so absolute URLs and other sections are skipped.
        attr_str = attr_search.match(a['href'])
        if attr_str:
            a_tag_dic[base_url + attr_str.group()] = a.text
    return a_tag_dic

In [19]:
# {url: name} for every institution link on the index page.
categories = find_a_tag(soup=soup)

In [20]:
"""
    Second layer: 14 categories of institution and their direct children
    'http://www.chinavitae.com/institution/xxx' and 'http://www.chinavitae.com/institution/xxx/xxx'
    
    List of size 14, each element belongs to an institution category.
    Each element is a dictionary of multiple {url: institution_name}
"""
# Fetch every category page and collect the institution links it exposes.
# layer2 gets one {url: name} dict per entry in `categories`.
layer2 = []
for url in categories:  # only the URL (key) is needed; the category name is unused here
    # A timeout keeps one stalled connection from hanging the whole crawl.
    url_html = requests.get(url, timeout=30)
    url_soup = BeautifulSoup(url_html.text, 'lxml')
    layer2.append(find_a_tag(url_soup))

In [21]:
# Inspect the first element of layer2: the {url: name} dict scraped from the
# first category page that was fetched.
layer2[0]

{'http://www.chinavitae.com/institution/pc': u"People's Congress at National, Provincial, and Local Levels",
 'http://www.chinavitae.com/institution/pc/1610.': u"People's Congress (NPC)"}

In [8]:
"""
    Third layer: 14 categories of institution, their direct children, and sub-type of their direct children
    'http://www.chinavitae.com/institution/xxx'
    'http://www.chinavitae.com/institution/xxx/xxx'
    'http://www.chinavitae.com/institution/xxx/xxx.xxx'
    
    Fetch every URL and name into a list of lists (one inner list of {url: name} dicts per category)
"""
# For every link dict in layer2, fetch each URL it contains and scrape that page's
# institution links. layer3[i] is a list with one {url: name} dict per URL in layer2[i].
layer3 = []
for dic in layer2:
    list2 = []
    for url in dic:  # only the URL (key) is needed; the link text is unused here
        # A timeout keeps one stalled connection from hanging the whole crawl.
        url_html = requests.get(url, timeout=30)
        url_soup = BeautifulSoup(url_html.text, 'lxml')
        list2.append(find_a_tag(url_soup))
    layer3.append(list2)

In [9]:
"""
    Get category and id from url
"""
def get_category_and_id_from_url(url):
    """Split an institution URL into its category and id.

    The last path segment decides the result:
      * if it starts with a digit it is the id, and the segment before it
        is the category;
      * otherwise the segment serves as both category and id.

    Trailing slashes are ignored, and an empty or slash-less input no longer
    raises IndexError.

    Args:
        url: URL string, split on '/'.

    Returns:
        Two-element list [url_cat, url_id].
    """
    pieces = url.rstrip('/').split('/')
    last = pieces[-1]
    # [:1] instead of [0] so an empty segment does not raise IndexError.
    if last[:1].isdigit() and len(pieces) > 1:
        return [pieces[-2], last]
    return [last, last]

In [22]:
# Inspect the first category's crawl result. The nested list is bound to `layer3`;
# `list3` is never assigned in this notebook and raises NameError on a fresh kernel.
layer3[0]

[{'http://www.chinavitae.com/institution/pc': u"People's Congress at National, Provincial, and Local Levels",
  'http://www.chinavitae.com/institution/pc/1610.': u"People's Congress (NPC)"},
 {'http://www.chinavitae.com/institution/pc': u"People's Congress at National, Provincial, and Local Levels",
  'http://www.chinavitae.com/institution/pc/1610.': u"People's Congress (NPC)",
  'http://www.chinavitae.com/institution/pc/1610.001': u'NPC',
  'http://www.chinavitae.com/institution/pc/1610.101': u'Standing Committee of the NPC',
  'http://www.chinavitae.com/institution/pc/1610.101001': u'Standing Committee of the NPC\u2014General Office',
  'http://www.chinavitae.com/institution/pc/1610.101001009': u'Standing Committee of the NPC\u2014General Office\u2014Research Center',
  'http://www.chinavitae.com/institution/pc/1610.201': u'Ethnic Affairs Committee of the NPC',
  'http://www.chinavitae.com/institution/pc/1610.202': u'Internal and Judicial Affairs Committee of the NPC',
  'http://www.

In [94]:
"""
    Write id into files
"""
# Write one "category, id, name" line per scraped link to id-list.txt and
# report how many lines were written.
# The crawl result is bound to `layer3`; `list3` is never assigned in this notebook.
everything = layer3
i = 0
# The with-block closes the file even if a write fails part-way through.
with codecs.open('id-list.txt', 'w', 'utf-8') as myfile:
    for category in everything:
        for level in category:
            for url, name in level.items():
                tmp_cat, tmp_id = get_category_and_id_from_url(url)
                myfile.write(tmp_cat + ', ' + tmp_id + ', ' + name + '\n')
                i += 1
print(i)

9765
