In [1]:
import pandas as pd
import numpy as np
import pygtrie as trie
from collections import defaultdict
from scipy.stats import entropy

In [2]:
# Load the candidate table: a space-separated text file without a header row,
# whose three fields are named word, cnt and err.
df = pd.read_csv(
    "output_suffix.txt",
    sep=" ",
    header=None,
    names=["word", "cnt", "err"],
)

In [3]:
m = {}  # candidate word -> its frequency (the cnt column)
# remainder of a word after dropping its first character -> {first character: summed cnt}
m2 = defaultdict(lambda: defaultdict(int))
# Only the word and cnt columns are needed, so iterate over them directly
# instead of DataFrame.iterrows(), which builds a Series for every row.
for word, cnt in zip(df['word'], df['cnt']):
    m[word] = cnt
    if len(word) == 1:
        continue  # a single-character word has no leading character to split off
    m2[word[1:]][word[:1]] += cnt

### 计算聚合度，聚合度定义为当前串频次与其后缀串频次的比值

In [4]:
def poly(x):
    """Cohesion of a candidate: its count divided by the count of its suffix.

    The suffix is the word with its first character removed, looked up in the
    module-level frequency map ``m``.

    Args:
        x: a row-like object exposing ``x['word']`` and ``x['cnt']``.

    Returns:
        ``x['cnt'] / m[suffix]``, or 1 when the word is a single character or
        its suffix has no entry in ``m``.
    """
    word = x['word']
    suffix = word[1:]
    if len(word) != 1 and suffix in m:
        return x['cnt'] / m[suffix]
    # Single-character words (and words whose suffix was never counted) get cohesion 1.
    return 1

### 计算灵活度，灵活度定义为前缀字集合的信息熵

In [5]:
def flex(x):
    """Flexibility of a candidate: entropy of its leading-character distribution.

    Looks the word up in the module-level map ``m2`` (word -> {leading
    character: count}), normalises the counts to proportions and passes them to
    ``scipy.stats.entropy`` with its default base.

    Args:
        x: a row-like object exposing ``x['word']``.

    Returns:
        The entropy of the normalised counts, or 1 when the word has no entry
        in ``m2``.
    """
    word = x['word']
    if word in m2:
        counts = pd.Series(m2[word])
        proportions = counts / counts.sum()
        return entropy(proportions)
    return 1

In [6]:
# Score every row: first the cohesion column, then the flexibility column.
for column, scorer in (('poly', poly), ('flex', flex)):
    df[column] = df.apply(scorer, axis=1)

In [7]:
# Combined score = flexibility * cohesion. Multiply the two columns directly
# rather than calling a Python lambda once per row.
df['score'] = df['flex'] * df['poly']

In [8]:
# Keep rows whose score is above 1 and whose err value is 0, highest score first.
d1 = (
    df.loc[df['score'].gt(1) & df['err'].eq(0)]
    .sort_values(by='score', ascending=False)
)

In [9]:
# Preview the five top-scoring candidates.
d1.head(n=5)

Unnamed: 0,word,cnt,err,poly,flex,score
13,房地产开发有限公司,4346,0,0.992691,4.614287,4.58056
3,有限公司,84246,0,0.999941,3.932106,3.931873
17,科技有限公司,3990,0,0.999749,3.692177,3.691252
7,有限责任公司,5814,0,1.0,3.572008,3.572008
20,厂,3143,0,1.0,3.476868,3.476868


In [10]:
# Index the retained candidates in a character trie keyed by the reversed word,
# so that a word's suffixes show up as prefixes of its key. The whole row is
# stored as the value.
t = trie.CharTrie()
for _, row in d1.iterrows():
    reversed_word = ''.join(reversed(row['word']))
    t[reversed_word] = row

In [11]:
# parent word -> {child word: summed cnt}. A word's parent is the longest
# other retained candidate that is a suffix of it.
xm2 = defaultdict(lambda: defaultdict(int))
for _, row in d1.iterrows():
    word = row['word']
    rword = word[::-1]
    # Keys in the trie are reversed words, so prefixes of rword are reversed
    # suffixes of word; drop the word itself.
    shorter = [k for k, _v in t.prefixes(rword) if k != rword]
    if not shorter:
        continue
    # Prefixes of one string all have different lengths, so the longest is unique.
    parent = max(shorter, key=len)[::-1]
    xm2[parent][word] += row['cnt']

In [12]:
# Keep only the rows whose word is a key of xm2. The explicit .copy() makes
# xdf an independent frame, so adding a column to it later is not a write to a
# selection of d1 (the cause of SettingWithCopyWarning).
xdf = d1[d1['word'].isin(list(xm2))].copy()

In [13]:
def xflex(x):
    """Entropy of the count distribution stored for a word in ``xm2``.

    Looks the word up in the module-level map ``xm2`` (word -> {key: count}),
    normalises the counts to proportions and passes them to
    ``scipy.stats.entropy`` with its default base.

    Args:
        x: a row-like object exposing ``x['word']``.

    Returns:
        The entropy of the normalised counts, or 1 when the word has no entry
        in ``xm2``.
    """
    word = x['word']
    if word in xm2:
        counts = pd.Series(xm2[word])
        proportions = counts / counts.sum()
        return entropy(proportions)
    return 1

In [14]:
# Turn off pandas' chained-assignment check (its default is 'warn', which emits
# SettingWithCopyWarning) before adding a column to xdf.
pd.set_option('mode.chained_assignment', None)
xdf['xflex'] = xdf.apply(xflex, axis=1)

In [15]:
# Show the rows whose xflex value is above 1.
xdf.loc[xdf['xflex'].gt(1)]

Unnamed: 0,word,cnt,err,poly,flex,score,xflex
3,有限公司,84246,0,0.999941,3.932106,3.931873,3.579781
17,科技有限公司,3990,0,0.999749,3.692177,3.691252,1.266136
7,有限责任公司,5814,0,1.0,3.572008,3.572008,1.504434
14,工程有限公司,4081,0,1.0,2.640687,2.640687,1.417798
34,管理有限公司,2032,0,0.907548,2.178462,1.977059,1.348465
