In [1]:
from WenXuan import WenXuan
from unicodedata import numeric

# WenXuan -- What happens to the number of separate pieces (765 comparing to 528)

We noticed that the number of pieces in the `wenxuan.flat_bodies` is different from the number 761 listed in the https://en.wikipedia.org/wiki/Wen_Xuan .

## Loading the Wenshuan class

In [2]:
wenxuan = WenXuan('2018-06-08', 'MF')
wenxuan.load_htmls()

[Info] Stop at loading data/wenshuan_0582.html.
[Info] Total length of the data is 582.


In [3]:
wenxuan.extract_paths()         # extract the bookmarks
wenxuan.get_author_bag()        # get the bag of author names and comments
wenxuan.extract_meta()          # extract the meta data
wenxuan.passages2tuples()       # get the passsage into (text, comment) tuples
wenxuan.heads2tuples()          # get headers into (head, comment, ...) tuples
wenxuan.extract_commentators()  # append commentators to metadata
wenxuan.extract_sound_glosses() # append all sound glosses in comments into a list and remove them from the self.flat_passages



## Number of elements in the `paths`

In [4]:
len(wenxuan.paths)

582

It is much less than the number listed in the wiki.

## Counting the num pieces in the paths

One possible hypothesis is that, in Han-Ji, they merged several pieces of works in one page. Therefore, we should count the difference between the actual number in `wenxuan.paths` and number stated in the `bookmark`.

Note: `unicodedata.numeric` could convert Chinese integers into numeric numbers.

In [5]:
# build a string list for chinese integer
chinese_int = ['一','二','三','四','五','六','七','八','九']

# create a dict for str -> int
chinese_int_dict = {}

# I am playing dirty, create a list for decimal numbers 
chinese_decimal = ['十'] + [
    digit + decimal for digit in chinese_int for decimal in ['十']
] 
numerical_decimal = [10] + [
    digit * decimal for digit in range(1, 10) for decimal in [10]
]

# for digit
for digit in chinese_int:
    chinese_int_dict[digit] = int(numeric( digit ))

# for decimal
for decimal,num in zip(chinese_decimal, numerical_decimal):
    chinese_int_dict[decimal] = num
    for digit in chinese_int:
        chinese_int_dict[decimal + digit] = num + int(numeric( digit ))
    
chinese_int_dict

{'一': 1,
 '一十': 10,
 '一十一': 11,
 '一十七': 17,
 '一十三': 13,
 '一十九': 19,
 '一十二': 12,
 '一十五': 15,
 '一十八': 18,
 '一十六': 16,
 '一十四': 14,
 '七': 7,
 '七十': 70,
 '七十一': 71,
 '七十七': 77,
 '七十三': 73,
 '七十九': 79,
 '七十二': 72,
 '七十五': 75,
 '七十八': 78,
 '七十六': 76,
 '七十四': 74,
 '三': 3,
 '三十': 30,
 '三十一': 31,
 '三十七': 37,
 '三十三': 33,
 '三十九': 39,
 '三十二': 32,
 '三十五': 35,
 '三十八': 38,
 '三十六': 36,
 '三十四': 34,
 '九': 9,
 '九十': 90,
 '九十一': 91,
 '九十七': 97,
 '九十三': 93,
 '九十九': 99,
 '九十二': 92,
 '九十五': 95,
 '九十八': 98,
 '九十六': 96,
 '九十四': 94,
 '二': 2,
 '二十': 20,
 '二十一': 21,
 '二十七': 27,
 '二十三': 23,
 '二十九': 29,
 '二十二': 22,
 '二十五': 25,
 '二十八': 28,
 '二十六': 26,
 '二十四': 24,
 '五': 5,
 '五十': 50,
 '五十一': 51,
 '五十七': 57,
 '五十三': 53,
 '五十九': 59,
 '五十二': 52,
 '五十五': 55,
 '五十八': 58,
 '五十六': 56,
 '五十四': 54,
 '八': 8,
 '八十': 80,
 '八十一': 81,
 '八十七': 87,
 '八十三': 83,
 '八十九': 89,
 '八十二': 82,
 '八十五': 85,
 '八十八': 88,
 '八十六': 86,
 '八十四': 84,
 '六': 6,
 '六十': 60,
 '六十一': 61,
 '六十七': 67,
 '六十三': 63,
 '六十九': 69,
 '六十二': 62,
 '六十五': 65,
 '六十八': 68,


## Examining the occurrence of number in the bookmarks



In [6]:
import re

# count the actual number of pieces (number of pages) of works in WenXuan paths
paths_text = ''.join(wenxuan.paths)
paths_dict = {}

for element in set([element for path in wenxuan.paths 
                 for element in path.split('／') 
                 if '首' in element]):
        paths_dict[element] = paths_text.count(element)

In [9]:
# count the difference between number in pages and number in bookmark
difference = 0

for path, num_of_pieces in paths_dict.items():
    # capture the number of pieces in the bookmarks
    for match in re.finditer(r'([一二三四五六七八九十]+?)首', path):
        num_in_path = chinese_int_dict[match.group(1)]
        
    # compage num_of_pieces (in pages) and num_in_path (in bookmark)
    if num_of_pieces != num_in_path:
        print("[Warning] Number of pieces are not match with the number in bookmark.", path, 
              (num_of_pieces, num_in_path))
        difference += num_in_path - num_of_pieces
        
difference



184

In [10]:
len(wenxuan.paths) + difference

766

It turns out very similar to the number in the wiki, so we are ok.