In [1]:
import re
from collections import Counter

# C-like numeric array
import numpy as np

# dealing with table data
import pandas as pd

# from `SongShu.py`
from SongShu import SongShu

SongShu -- Export Natural Geo-Names (洲、山 as an example)

Natural geographical names are names that refer to natural features, e.g., mountains and rivers.

In [2]:
# Get the SongShu text: build the corpus wrapper and load its HTML pages.
# NOTE(review): the meaning of the two constructor arguments is defined in
# SongShu.py, which is not visible here -- confirm before changing them.
songshu = SongShu("2018-06-28", "MF")
songshu.load_htmls()

# Preprocess the loaded data to get the metadata and bookmarks,
# and separate the passages on every page.
# `songshu.flat_passages` is read by the passage generator further down;
# presumably it is populated by these extraction steps -- TODO confirm.
songshu.extract_paths()
songshu.extract_meta()
songshu.extract_passages()

[Info] Stop at loading data/ShongShu_0851.html.
[Info] Total length of the data is 851.


## 洲、山 as an Example

We can use a regex to list every candidate phrase ending in 洲 or 山,  
e.g., '鬱洲', '向鬱洲', '走向鬱洲', ...  
and then use their frequencies to judge which ones are likely to be natural geographical names.

In [3]:
# Let's build a generator that lists all n-gram phrases ending with 洲 or 山

def regexf(char, num):
    """Return regex source matching ``char`` preceded by exactly ``num`` allowed characters.

    An "allowed" character is any character that is not listed in the
    negated class below (presumably the delimiter marks that end a phrase).

    Args:
        char: text appended verbatim after the quantified class; it is not
            escaped, so regex metacharacters in it stay active.
        num: repetition count; converted with ``str()`` and placed in ``{...}``.

    Returns:
        The pattern as a plain string (not compiled).
    """
    allowed_char = r"[^„ÄÅ„ÄÇÔºåÔºüÔºÅÔºöÔºõ„Äå„Äç„Äî„Äï„Äé„Äè]"
    repetition = "{" + str(num) + "}"
    return "".join((allowed_char, repetition, char))

def passageGen():
    """Yield each passage from the notebook-level ``songshu`` object, one at a time.

    Walks ``songshu.flat_passages`` in order and flattens one level: every
    element is itself an iterable of passages, and its items are yielded
    in their original order.
    """
    for passage_group in songshu.flat_passages:
        yield from passage_group
            
def phraseCharGen(char, limits=(1, 4)):
    """Yield every phrase ending in ``char`` for each prefix length in ``limits``.

    Args:
        char: suffix handed to ``regexf`` (e.g. a single geographic character).
        limits: ``(lower, upper)`` prefix lengths, half-open like ``range()``;
            the default therefore produces prefixes of 1, 2 and 3 characters.

    Yields:
        The matched text (prefix plus ``char``). Results are grouped by
        passage, then by increasing prefix length, then by position in the
        passage. For a given length the matches are non-overlapping, as
        returned by ``finditer``.
    """
    lower, upper = limits
    # The patterns depend only on the arguments, so build and compile them
    # once up front rather than once per passage.
    patterns = [re.compile(regexf(char, size)) for size in range(lower, upper)]
    for passage in passageGen():
        for pattern in patterns:
            for match in pattern.finditer(passage):
                yield match.group(0)

In [4]:
# Tally every candidate phrase for this suffix character and show the 15 most frequent.
Counter(phraseCharGen('Ê¥≤', limits=(1, 4))).most_common(15)

[('Ëî°Ê¥≤', 14),
 ('È¨±Ê¥≤', 9),
 ('Â∂∏Ê¥≤', 6),
 ('Â¥¢Â∂∏Ê¥≤', 6),
 ('Ëá≥Ëî°Ê¥≤', 6),
 ('ÊñºÂ¥¢Â∂∏Ê¥≤', 5),
 ('Ëá™Ëî°Ê¥≤', 5),
 ('Èï∑Ê¥≤', 5),
 ('ËêΩÊ¥≤', 4),
 ('Ê°ëËêΩÊ¥≤', 4),
 ('‰∫îÊ¥≤', 4),
 ('Âæ™Ëá≥Ëî°Ê¥≤', 4),
 ('Ê∫ßÊ¥≤', 3),
 ('Ë≥äËá™Ëî°Ê¥≤', 3),
 ('Â±±Ê¥≤', 3)]

In [5]:
# Same tally for the second suffix character, with prefixes up to 4 characters and the top 20 shown.
Counter(phraseCharGen('Â±±', limits=(1, 5))).most_common(20)

[('Ê≥∞Â±±', 58),
 ('Ê¢ÅÂ±±', 40),
 ('Á®ΩÂ±±', 28),
 ('ÊúÉÁ®ΩÂ±±', 28),
 ('‰∏≠Â±±', 23),
 ('ÁÇ∫Â±±', 19),
 ('ÈôΩÂ±±', 17),
 ('È´òÂ±±', 17),
 ('ÂêçÂ±±', 16),
 ('ÂçóÂ±±', 15),
 ('Âª¨Â±±', 14),
 ('ÈçæÂ±±', 13),
 ('ÊñºÂ±±', 12),
 ('Âè≤Â±±', 12),
 ('Âà∫Âè≤Â±±', 11),
 ('Â∑ûÂà∫Âè≤Â±±', 11),
 ('ÈÑíÂ±±', 10),
 ('Èô∞Â±±', 10),
 ('‰ª•Â±±', 9),
 ('ÊôØÈôΩÂ±±', 8)]

It seems there are some patterns here... Hmmm 

- We should count the longer phrase as the correct name (if two phrases have the same number of occurrences)
- We can use the correct phrases to search for the direction verbs (自、至、於) 😁

## Conditioning on Phrases

In [6]:
# I currently cannot figure out an imperative solution ...
# though the functional solution here introduces more iterations ...
def condition(counter):
    """Filter ``(phrase, count)`` pairs down to the most plausible names.

    A pair is dropped when either rule applies:

    * another, different phrase with the *same* count contains it
      (the longer phrase is preferred when occurrences are equal);
    * it contains a phrase with a *strictly higher* count
      (it is a more frequent name plus extra leading characters).

    Args:
        counter: iterable of ``(phrase, count)`` pairs, e.g. the list
            returned by ``Counter.most_common``.

    Returns:
        The surviving pairs as a list of tuples, sorted by count in
        descending order. Pairs with equal counts keep their input order,
        so the result is reproducible from run to run (sorting a set made
        the tie order depend on string hashing).
    """
    # Materialise once as unique tuples, preserving first-seen order.
    entries = []
    seen = set()
    for phrase, count in counter:
        pair = (phrase, count)
        if pair not in seen:
            seen.add(pair)
            entries.append(pair)

    def inside_equal_count_phrase(phrase, count):
        return any(
            count == other_count and phrase != other and phrase in other
            for other, other_count in entries
        )

    def contains_more_frequent_phrase(phrase, count):
        return any(
            other_count > count and other in phrase
            for other, other_count in entries
        )

    kept = [
        (phrase, count)
        for phrase, count in entries
        if not inside_equal_count_phrase(phrase, count)
        and not contains_more_frequent_phrase(phrase, count)
    ]
    # sorted() is stable (also with reverse=True), so ties stay in input order.
    return sorted(kept, key=lambda pair: pair[1], reverse=True)

def condition_verb(counter):
    """Collect the single characters that directly precede a retained name.

    The names are the phrases kept by ``condition(counter)``. Every phrase in
    ``counter`` is scanned for "one allowed character + name", and the
    leading character of each hit is collected (the notebook treats these
    as candidate direction verbs).

    Args:
        counter: iterable of ``(phrase, count)`` pairs.

    Returns:
        A set of one-character strings; empty when no name is retained.
    """
    names = [name for name, _ in condition(counter)]
    if not names:
        # An empty alternation would match after every character and report
        # all of them as verbs, so bail out early.
        return set()
    # Names come straight from the text: escape them so regex metacharacters
    # are matched literally instead of mis-matching or raising re.error.
    alternatives = "|".join(re.escape(name) for name in names)
    regex = r"([^„ÄÅ„ÄÇÔºåÔºüÔºÅÔºöÔºõ„Äå„Äç„Äî„Äï„Äé„Äè]{1})" + r"({})".format(alternatives)
    pattern = re.compile(regex)
    return {match.group(1) for p, _ in counter for match in pattern.finditer(p)}

def geonames_and_verbs(char, limits=(1, 4), top_n=15):
    """Extract candidate geo-names ending in ``char`` and their preceding characters.

    Args:
        char: suffix character forwarded to ``phraseCharGen``.
        limits: ``(lower, upper)`` prefix-length pair forwarded to ``phraseCharGen``.
        top_n: how many of the most frequent phrases are kept before filtering.

    Returns:
        A 2-tuple ``(names, verbs)``: the result of ``condition`` and the
        result of ``condition_verb``, both computed on the same top-``top_n``
        ``(phrase, count)`` list.
    """
    phrase_tally = Counter(phraseCharGen(char, limits))
    top_phrases = phrase_tally.most_common(top_n)
    names = condition(top_phrases)
    verbs = condition_verb(top_phrases)
    return names, verbs
    
# Try it on the first suffix character: top 15 phrases, prefixes of 1 to 3 characters.
geonames_and_verbs('Ê¥≤', limits=(1, 4), top_n=15)

([('Ëî°Ê¥≤', 14),
  ('È¨±Ê¥≤', 9),
  ('Â¥¢Â∂∏Ê¥≤', 6),
  ('Èï∑Ê¥≤', 5),
  ('Ê°ëËêΩÊ¥≤', 4),
  ('‰∫îÊ¥≤', 4),
  ('Ê∫ßÊ¥≤', 3),
  ('Â±±Ê¥≤', 3)],
 {'Êñº', 'Ëá™', 'Ëá≥'})

Seems OK ...  
So we get the geo-names and the verbs attached to them as a tuple.

In [7]:
# Same extraction for the second suffix character, widened to the 20 most frequent phrases.
geonames_and_verbs('Â±±', limits=(1, 4), top_n=20)

([('Ê≥∞Â±±', 58),
  ('Ê¢ÅÂ±±', 40),
  ('ÊúÉÁ®ΩÂ±±', 28),
  ('‰∏≠Â±±', 23),
  ('ÁÇ∫Â±±', 19),
  ('ÈôΩÂ±±', 17),
  ('È´òÂ±±', 17),
  ('ÂêçÂ±±', 16),
  ('ÂçóÂ±±', 15),
  ('Âª¨Â±±', 14),
  ('ÈçæÂ±±', 13),
  ('Âè≤Â±±', 12),
  ('ÊñºÂ±±', 12),
  ('ÈÑíÂ±±', 10),
  ('Èô∞Â±±', 10),
  ('‰ª•Â±±', 9)],
 {'Âà∫', 'ÊôØ'})

Some entries need to be excluded manually: those containing 於、為、以

Finding direction verbs seems to fail when the extracted geo-names are wrong.

The same approach can also be applied to non-natural geo-names, e.g., 州:

In [8]:
# Repeat the extraction with a non-natural suffix character, again on the top 20 phrases.
geonames_and_verbs('Â∑û', limits=(1, 4), top_n=20)

([('Ë±´Â∑û', 448),
  ('ÂæêÂ∑û', 364),
  ('ËçäÂ∑û', 335),
  ('ÊèöÂ∑û', 322),
  ('ÂÖóÂ∑û', 312),
  ('‰∫åÂ∑û', 274),
  ('ÈõçÂ∑û', 232),
  ('Ê±üÂ∑û', 210),
  ('Âª£Â∑û', 168),
  ('ÈùíÂ∑û', 167),
  ('ÁõäÂ∑û', 150),
  ('ÊπòÂ∑û', 146),
  ('ÈÉ¢Â∑û', 126),
  ('ÂéªÂ∑û', 124),
  ('Âè∏Â∑û', 89),
  ('Ê¢ÅÂ∑û', 85),
  ('ÂÜÄÂ∑û', 82)],
 {'Âçó'})

In [9]:
# One more suffix character with the same settings (top 20 phrases, limits=(1, 4)).
geonames_and_verbs('ÈÉ°', limits=(1, 4), top_n=20)

([('Ê∞∏ÂàùÈÉ°', 202),
  ('‰∫åÈÉ°', 154),
  ('ÂçóÈÉ°', 137),
  ('Âê≥ÈÉ°', 130),
  ('Â∑ûÈÉ°', 64),
  ('Èô≥ÈÉ°', 49),
  ('Ë´∏ÈÉ°', 48),
  ('Ê¢ÅÈÉ°', 47),
  ('ÈôΩÈÉ°', 45),
  ('‰∏âÈÉ°', 43),
  ('Êù±ÈÉ°', 42),
  ('Ê≤õÈÉ°', 35),
  ('Âπ≥ÈÉ°', 33),
  ('‰∫îÈÉ°', 33),
  ('ËúÄÈÉ°', 29),
  ('È≠èÈÉ°', 29),
  ('ÁÇ∫ÈÉ°', 29),
  ('ÂÆâÈÉ°', 28),
  ('Áß¶ÈÉ°', 27)],
 set())

## Saving to csv

Let's only consider the top 15 phrases, with `limits=(1, 4)`.

In [10]:
# Collect the filtered names for both suffix characters, in this order.
geo_csv = []

for suffix in ('Ê¥≤', 'Â±±'):
    geo_names, _ = geonames_and_verbs(suffix, limits=(1, 4), top_n=15)
    # Drop entries whose phrase contains any of the manually excluded characters.
    geo_names = [entry for entry in geo_names if not re.search(r"[ÊñºÁÇ∫‰ª•]", entry[0])]
    geo_csv.extend(geo_names)

# One row per retained name; write the table to disk and display it.
df = pd.DataFrame(geo_csv, columns=['geonames', 'occurrences'])
df.to_csv('songshu_natural_geonames([Ê¥≤Â±±]).csv')
df

Unnamed: 0,geonames,occurrences
0,Ëî°Ê¥≤,14
1,È¨±Ê¥≤,9
2,Â¥¢Â∂∏Ê¥≤,6
3,Èï∑Ê¥≤,5
4,Ê°ëËêΩÊ¥≤,4
5,‰∫îÊ¥≤,4
6,Ê∫ßÊ¥≤,3
7,Â±±Ê¥≤,3
8,Ê≥∞Â±±,58
9,Ê¢ÅÂ±±,40
