In [1]:
import re
from collections import Counter

# C-like numeric array
import numpy as np

# dealing with table data
import pandas as pd

# from `SongShu.py`
from SongShu import SongShu

# SongShu -- Export Natural Geo-Names (洲、山 as an example)

Natural geographical names are names that refer to natural features, e.g., mountains and rivers.

In [2]:
# Load the SongShu text from the downloaded HTML pages.
# NOTE(review): the two constructor arguments look like a snapshot date and a
# source tag -- confirm their meaning against SongShu.py.
songshu = SongShu("2018-06-28", "MF")
songshu.load_htmls()

# Preprocess the loaded pages to get the metadata and bookmarks,
# and separate the passages on every page.
# The later cells read `songshu.flat_passages`; presumably it is populated by
# these extraction calls -- verify in SongShu.py.
songshu.extract_paths()
songshu.extract_meta()
songshu.extract_passages()

[Info] Stop at loading data/ShongShu_0851.html.
[Info] Total length of the data is 851.


## 洲、山 as an Example

We can use a regex to list all candidate phrases that end with 洲 or 山,  
e.g., '鬱洲', '向鬱洲', '走向鬱洲', ...  
and then use their frequencies to see which ones are likely to be natural geographical names.

In [3]:
# Let's build a generator that lists all n-gram phrases ending with 洲 or 山

def regexf(char, num):
    """Build a regex matching `num` non-punctuation characters followed by `char`.

    Parameters
    ----------
    char : str
        Text appended verbatim to the end of the pattern (it is NOT escaped,
        so regex syntax inside it stays active).
    num : int
        Exact number of non-punctuation characters required before `char`.

    Returns
    -------
    str
        The pattern source, e.g. ``regexf('洲', 2)`` gives
        ``[^、。，？！：；「」〔〕『』]{2}洲``.
    """
    # Any single character that is not one of the listed CJK punctuation marks.
    non_punct = r"[^、。，？！：；「」〔〕『』]"
    return "".join([non_punct, "{", str(num), "}", char])

def passageGen():
    """Yield every passage held in the module-level `songshu.flat_passages`.

    `flat_passages` is iterated as a collection of passage groups; each item
    of each group is yielded in order, flattening the two levels into one
    stream.
    """
    for group in songshu.flat_passages:
        yield from group
            
def phraseCharGen(char, limits=(1, 4)):
    """Yield every phrase that ends with `char`, for several prefix lengths.

    Parameters
    ----------
    char : str
        Suffix handed to `regexf` (e.g. '洲' or '山').
    limits : tuple of (int, int)
        ``(lower, upper)`` bounds on the number of characters in front of
        `char`. They are fed to ``range``, so `upper` is exclusive: the
        default ``(1, 4)`` yields phrases with 1, 2 and 3 leading characters.

    Yields
    ------
    str
        The full matched text (leading characters plus `char`) for each
        non-overlapping match in each passage from `passageGen`.
    """
    start, stop = limits
    for passage in passageGen():
        for width in range(start, stop):
            pattern = regexf(char, width)
            yield from (found.group(0) for found in re.finditer(pattern, passage))

In [4]:
# Count every phrase ending in 洲 with 1 to 3 leading characters
# (the upper limit is exclusive) and show the 15 most frequent ones.
Counter(list(phraseCharGen('洲', limits=(1, 4)))).most_common(15)

[('蔡洲', 14),
 ('鬱洲', 9),
 ('嶸洲', 6),
 ('崢嶸洲', 6),
 ('至蔡洲', 6),
 ('於崢嶸洲', 5),
 ('自蔡洲', 5),
 ('長洲', 5),
 ('落洲', 4),
 ('桑落洲', 4),
 ('五洲', 4),
 ('循至蔡洲', 4),
 ('溧洲', 3),
 ('賊自蔡洲', 3),
 ('山洲', 3)]

In [5]:
# Same count for phrases ending in 山, now with 1 to 4 leading characters,
# showing the 20 most frequent ones.
Counter(list(phraseCharGen('山', limits=(1, 5)))).most_common(20)

[('泰山', 58),
 ('梁山', 40),
 ('稽山', 28),
 ('會稽山', 28),
 ('中山', 23),
 ('為山', 19),
 ('陽山', 17),
 ('高山', 17),
 ('名山', 16),
 ('南山', 15),
 ('廬山', 14),
 ('鍾山', 13),
 ('於山', 12),
 ('史山', 12),
 ('刺史山', 11),
 ('州刺史山', 11),
 ('鄒山', 10),
 ('陰山', 10),
 ('以山', 9),
 ('景陽山', 8)]

It seems there are some patterns here... Hmmm 

- We should treat the longer phrase as the correct name (if two phrases have the same number of occurrences)
- We can use the correct phrases to search for the direction verbs (自、至、於) 😁

## Conditioning on Phrases

In [6]:
# I currently cannot figure out an imperative solution ...
# though the functional solution here introduces more iterations ...
def condition(counter):
    """Keep only the phrases that look like complete names.

    Parameters
    ----------
    counter : list of (str, int)
        ``(phrase, occurrences)`` pairs, e.g. the result of
        ``Counter.most_common``. It is iterated several times, so it must be
        a re-iterable sequence.

    Returns
    -------
    list of (str, int)
        The retained pairs, sorted by occurrences in descending order. Pairs
        with equal occurrences keep the relative order they had in `counter`,
        so the result is the same on every run.

    A pair is dropped when either rule applies:

    * another phrase with the SAME count strictly contains it (the shorter
      phrase is just a fragment of the longer name, e.g. 嶸洲 vs 崢嶸洲);
    * it contains a phrase with a HIGHER count (it is a known name with extra
      characters stuck in front, e.g. 至蔡洲 vs 蔡洲).
    """
    # De-duplicate while preserving first-seen order. The previous version
    # went through set(), whose iteration order depends on string hashing and
    # therefore made the order of tied entries change between runs.
    entries = list(dict.fromkeys((phrase, count) for phrase, count in counter))

    def is_fragment(phrase, count):
        # A different phrase with the same count contains this one.
        return any(
            count == other_count and phrase != other and phrase in other
            for other, other_count in entries
        )

    def wraps_more_frequent(phrase, count):
        # This phrase contains another phrase that occurs more often.
        return any(
            other_count > count and other in phrase
            for other, other_count in entries
        )

    kept = [
        (phrase, count)
        for phrase, count in entries
        if not is_fragment(phrase, count) and not wraps_more_frequent(phrase, count)
    ]
    # sorted() is stable (also with reverse=True), so ties stay in input order.
    return sorted(kept, key=lambda x: x[1], reverse=True)

def condition_verb(counter):
    """Collect the single characters that directly precede a retained name.

    Parameters
    ----------
    counter : list of (str, int)
        ``(phrase, occurrences)`` pairs; the same list that is passed to
        `condition`.

    Returns
    -------
    set of str
        Every one-character prefix (not one of the listed punctuation marks)
        found immediately in front of one of the names kept by
        ``condition(counter)``, searching inside each phrase of `counter`.
    """
    # Escape each name so that a phrase containing a regex metacharacter
    # (parentheses, brackets, '.', '?', ...) is matched literally instead of
    # corrupting the alternation.
    names = "|".join(re.escape(name) for name, _ in condition(counter))
    regex = r"([^、。，？！：；「」〔〕『』]{1})" + r"({})".format(names)
    return {match.group(1) for p, _ in counter for match in re.finditer(regex, p)}

def geonames_and_verbs(char, limits=(1, 4), top_n=15):
    """Extract candidate geo-names ending with `char` and their leading verbs.

    Parameters
    ----------
    char : str
        Suffix of the names to look for (e.g. '洲').
    limits : tuple of (int, int)
        Passed through to `phraseCharGen`.
    top_n : int
        Only the `top_n` most frequent phrases are considered.

    Returns
    -------
    tuple
        ``(names, verbs)``: the output of `condition` and of `condition_verb`
        applied to the same top-`top_n` frequency list.
    """
    phrase_counts = Counter(phraseCharGen(char, limits))
    top_phrases = phrase_counts.most_common(top_n)
    names = condition(top_phrases)
    verbs = condition_verb(top_phrases)
    return names, verbs
    
# Try it on 洲 with the same limits and top-15 cut-off used in the frequency listing.
geonames_and_verbs('洲', limits=(1, 4), top_n=15)

([('蔡洲', 14),
  ('鬱洲', 9),
  ('崢嶸洲', 6),
  ('長洲', 5),
  ('桑落洲', 4),
  ('五洲', 4),
  ('溧洲', 3),
  ('山洲', 3)],
 {'於', '自', '至'})

Seems OK ...  
So we get the geo-names and the verbs attached to them in a tuple.

In [7]:
# Same extraction for 山, considering the 20 most frequent phrases.
geonames_and_verbs('山', limits=(1, 4), top_n=20)

([('泰山', 58),
  ('梁山', 40),
  ('會稽山', 28),
  ('中山', 23),
  ('為山', 19),
  ('陽山', 17),
  ('高山', 17),
  ('名山', 16),
  ('南山', 15),
  ('廬山', 14),
  ('鍾山', 13),
  ('史山', 12),
  ('於山', 12),
  ('鄒山', 10),
  ('陰山', 10),
  ('以山', 9)],
 {'刺', '景'})

Some things need to be excluded manually: 於、為、以

Finding the direction verbs seems to fail if the geo-names are wrong.

It is also possible to apply this to non-natural geo-names: 州

In [8]:
# Apply the same extraction to a non-natural suffix, 州, over the top 20 phrases.
geonames_and_verbs('州', limits=(1, 4), top_n=20)

([('豫州', 448),
  ('徐州', 364),
  ('荊州', 335),
  ('揚州', 322),
  ('兗州', 312),
  ('二州', 274),
  ('雍州', 232),
  ('江州', 210),
  ('廣州', 168),
  ('青州', 167),
  ('益州', 150),
  ('湘州', 146),
  ('郢州', 126),
  ('去州', 124),
  ('司州', 89),
  ('梁州', 85),
  ('冀州', 82)],
 {'南'})

In [9]:
# And to another suffix, 郡, with the same settings.
geonames_and_verbs('郡', limits=(1, 4), top_n=20)

([('永初郡', 202),
  ('二郡', 154),
  ('南郡', 137),
  ('吳郡', 130),
  ('州郡', 64),
  ('陳郡', 49),
  ('諸郡', 48),
  ('梁郡', 47),
  ('陽郡', 45),
  ('三郡', 43),
  ('東郡', 42),
  ('沛郡', 35),
  ('平郡', 33),
  ('五郡', 33),
  ('蜀郡', 29),
  ('魏郡', 29),
  ('為郡', 29),
  ('安郡', 28),
  ('秦郡', 27)],
 set())

## Saving to csv

Let's only consider the top 15 phrases, with limits in (1, 4).

In [10]:
# Accumulates the (name, occurrences) pairs of every suffix, in suffix order.
geo_csv = []

# Run the identical extract-and-filter step once per suffix character
# instead of keeping one copy-pasted section per character.
for geo_char in ('洲', '山'):
    geo_names, _ = geonames_and_verbs(geo_char, limits=(1, 4), top_n=15)
    # Manually exclude candidates containing 於, 為 or 以.
    geo_names = [entry for entry in geo_names if re.search(r"[於為以]", entry[0]) is None]
    geo_csv += geo_names

df = pd.DataFrame(geo_csv, columns=['geonames', 'occurrences'])
# The DataFrame index is written as the first (unnamed) CSV column.
df.to_csv('songshu_natural_geonames([洲山]).csv')
df

Unnamed: 0,geonames,occurrences
0,蔡洲,14
1,鬱洲,9
2,崢嶸洲,6
3,長洲,5
4,桑落洲,4
5,五洲,4
6,溧洲,3
7,山洲,3
8,泰山,58
9,梁山,40
