In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import codecs
import time

In [2]:
# Site root; relative hrefs scraped from the pages are joined onto this.
home = 'http://www.chinavitae.com'
# Institution index page where the crawl starts. Built from `home` so the two
# URLs cannot drift apart; the value is unchanged.
institution = home + '/institution/'
# Download the index page.
start_html = requests.get(institution)

In [3]:
# Parse the downloaded index page with the lxml parser so its <a> tags can be searched.
soup = BeautifulSoup(start_html.text, 'lxml')

In [4]:
""" 
    First layer: 14 categories of institution: 'http://www.chinavitae.com/institution/xxx'
    Find <a> and 'href' attributes 
    Output: a_tag_dic = {link:Category_name}
    A dictionary of 14 elements, each of which is an institution category
"""
def find_a_tag(soup, base_url=None):
    """Collect the institution links found in a parsed page.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``find_all('a')``; the returned tags must
        support ``has_attr('href')``, ``tag['href']`` and ``tag.text``.
    base_url : str, optional
        Prefix prepended to every matching href. When omitted, the
        notebook-level ``home`` is used.

    Returns
    -------
    dict
        ``{base_url + href: link_text}`` for every ``<a>`` whose href starts
        with ``/institution/`` followed by at least one character. If a URL
        occurs more than once, the text of the last anchor wins.
    """
    if base_url is None:
        base_url = home
    # Compile once per call instead of once per anchor.
    attr_search = re.compile('/institution/.+')
    a_tag_dic = {}
    for a in soup.find_all('a'):
        if not a.has_attr('href'):
            continue
        # match() anchors at the start, so absolute or unrelated hrefs are skipped.
        attr_str = attr_search.match(a['href'])
        if attr_str:
            a_tag_dic[base_url + attr_str.group()] = a.text
    return a_tag_dic

In [5]:
# First layer: map every institution link on the index page to its link text.
layer1_categories = find_a_tag(soup)

In [6]:
"""
    Second layer: 14 categories of institution and their direct children
    'http://www.chinavitae.com/institution/xxx' and 'http://www.chinavitae.com/institution/xxx/xxx'
    
    List of size 14; each element corresponds to one institution category.
    Each element is a dictionary of multiple {url: institution_name}
"""
# Fetch every first-layer page and collect the institution links it contains.
# Only the URLs (the dict keys) are needed, so iterate the dict directly; this
# drops the unused value variable and works on both Python 2 and Python 3.
layer2_subCategories = []
for url in layer1_categories:
    url_html = requests.get(url)
    url_soup = BeautifulSoup(url_html.text, 'lxml')
    layer2_subCategories.append(find_a_tag(url_soup))

In [7]:
# Inspect the link map collected from the first page fetched above.
layer2_subCategories[0]

{'http://www.chinavitae.com/institution/pc': u"People's Congress at National, Provincial, and Local Levels",
 'http://www.chinavitae.com/institution/pc/1610.': u"People's Congress (NPC)"}

In [8]:
"""
    Third layer: 14 categories of institution, their direct children, and sub-type of their direct children
    'http://www.chinavitae.com/institution/xxx'
    'http://www.chinavitae.com/institution/xxx/xxx'
    'http://www.chinavitae.com/institution/xxx/xxx.xxx'
    
    Fetch every URL and name into a list of lists (one inner list of {url: name} dicts per category)
"""
start_time = time.time()  # Timer
layer3_counter = 0  # number of pages fetched in this cell
layer3_sub_subCategories = []
for dic in layer2_subCategories:
    # One list per category, holding one {url: name} map per fetched page.
    list2 = []
    # Only the URLs are needed; iterating the dict directly drops the unused
    # value variable and works on both Python 2 and Python 3.
    for url in dic:
        url_html = requests.get(url)
        url_soup = BeautifulSoup(url_html.text, 'lxml')
        list2.append(find_a_tag(url_soup))
        layer3_counter += 1
    layer3_sub_subCategories.append(list2)
print("--- %s seconds ---" % (time.time() - start_time))  # Print running time

--- 35.3718161583 seconds ---


In [9]:
# Report how many pages the third-layer crawl fetched, then peek at the first
# link map. The call form of print matches the timing cell and runs on both
# Python 2 and Python 3.
print(layer3_counter)
layer3_sub_subCategories[0][0]

107


{'http://www.chinavitae.com/institution/pc': u"People's Congress at National, Provincial, and Local Levels",
 'http://www.chinavitae.com/institution/pc/1610.': u"People's Congress (NPC)"}

In [10]:
# layer4_counter = 0
# layer4_sub_sub_subCategories = []
# for category_layer in layer3_sub_subCategories:
#     tmp = []
#     for sub_category_layer in category_layer:
#         for url, val in sub_category_layer.iteritems():
#             url_html = requests.get(url)
#             url_soup = BeautifulSoup(url_html.text, 'lxml')
#             tmp.append(find_a_tag(url_soup))
#             layer4_counter += 1
#         layer4_sub_sub_subCategories.append(tmp)

In [11]:
# print layer4_counter
# layer4_sub_sub_subCategories

In [12]:
"""
    Get category and id from url
"""
def get_category_and_id_from_url(url):
    """Split an institution URL into its category and id.

    The last path segment is the id. If it starts with a digit, the segment
    before it is the category; otherwise the segment serves as both category
    and id.

    Parameters
    ----------
    url : str
        URL or path; trailing slashes are ignored.

    Returns
    -------
    list
        ``[url_cat, url_id]``. For an empty input both entries are ``''``.
    """
    # Drop trailing slashes so '.../pc/' behaves like '.../pc' instead of
    # leaving an empty last segment.
    pieces = url.rstrip('/').split('/')
    last = pieces[-1]
    # Slicing (not indexing) keeps an empty segment from raising IndexError.
    if last[:1].isdigit():
        url_cat = pieces[-2] if len(pieces) > 1 else ''
        return [url_cat, last]
    return [last, last]

In [16]:
"""
    Write id into files
"""
# everything = layer3_sub_subCategories
# Write one "category, id, name" line for every collected link whose last URL
# segment is an id (category != id); links where the two are equal are skipped.
# NOTE(review): names are written verbatim, so a name containing ', ' makes the
# line ambiguous to split, and the same URL may occur in several link maps;
# confirm what downstream readers of id-list.txt expect.
i = 0  # number of lines written
# `with` guarantees the file is closed even if a write fails part-way.
with codecs.open('id-list.txt', 'w', 'utf-8') as myfile:
    for category in layer3_sub_subCategories:  # one list of link maps per category
        for level in category:  # one {url: name} map per fetched page
            for url, name in level.items():
                tmp_cat, tmp_id = get_category_and_id_from_url(url)
                if tmp_cat != tmp_id:
                    myfile.write(tmp_cat + ', ' + tmp_id + ', ' + name.replace(u'—', '-') + '\n')
                    i += 1
print(i)

9658
