In [1]:
import pandas as pd
import numpy as np
import pygtrie as trie
from collections import defaultdict
from scipy.stats import entropy

In [2]:
# Load the candidate table: space-separated, no header row, three columns.
df = pd.read_csv(
    "output_suffix.txt",
    sep=' ',
    header=None,
    names=["word", "cnt", "err"],
)

In [3]:
# m:  candidate word -> its frequency.
# m2: candidate word -> {single leading character -> summed frequency of the
#     longer candidate formed by putting that character in front of the word}.
m = {}
m2 = defaultdict(lambda: defaultdict(int))
for word, cnt in zip(df['word'], df['cnt']):
    m[word] = cnt
    if len(word) == 1:
        # A single-character word has no leading character to split off.
        continue
    m2[word[1:]][word[:1]] += cnt

### 计算聚合度，聚合度定义为当前串频次与其后缀串频次的比值

In [4]:
def poly(x):
    """Return the cohesion of a candidate row.

    Cohesion is the row's ``cnt`` divided by the frequency stored in the
    module-level ``m`` for the word with its first character removed.

    Args:
        x: mapping-like row exposing ``'word'`` and ``'cnt'``.

    Returns:
        ``cnt / m[word[1:]]`` when the word is not exactly one character long
        and its shortened form is a key of ``m``; otherwise ``1``.
    """
    word = x['word']
    shorter = word[1:]
    if len(word) != 1 and shorter in m:
        return x['cnt'] / m[shorter]
    # Single-character words (and words whose shortened form is unknown) get 1.
    return 1

### 计算灵活度，灵活度定义为前缀字集合的信息熵

In [5]:
def flex(x):
    """Return the flexibility of a candidate row.

    Flexibility is the entropy (``scipy.stats.entropy``) of the normalised
    counts stored for the word in the module-level ``m2``.

    Args:
        x: mapping-like row exposing ``'word'``.

    Returns:
        The entropy of ``m2[word]`` normalised to sum to one, or ``1`` when
        the word is not a key of ``m2``.
    """
    word = x['word']
    if word in m2:
        counts = pd.Series(m2[word])
        return entropy(counts / counts.sum())
    return 1

In [6]:
# Evaluate both per-row metrics (cohesion first, then flexibility) and store
# each one in a column named after its function.
for column, metric in (('poly', poly), ('flex', flex)):
    df[column] = df.apply(metric, axis=1)

In [7]:
# Overall score of a candidate: flexibility multiplied by cohesion.
df['score'] = df['flex'] * df['poly']

In [8]:
# Keep rows whose score exceeds 1 and whose err flag is 0, best score first.
is_candidate = df['score'].gt(1) & df['err'].eq(0)
d1 = df.loc[is_candidate].sort_values(by=['score'], ascending=False)

In [9]:
# Preview the first rows of the filtered, score-sorted candidates.
d1.head()

Unnamed: 0,word,cnt,err,poly,flex,score
13,贸易有限公司,33950,0,0.994755,4.483247,4.459734
8,科技有限公司,42085,0,0.999976,3.852677,3.852585
3,有限公司,466750,0,0.999989,3.840546,3.840505
4,店,124101,0,1.0,3.71692,3.71692
6,厂,62011,0,1.0,3.690329,3.690329


In [10]:
# Index every row of d1 under its reversed word, so that "is a suffix of"
# between words turns into "is a prefix of" between trie keys.
t = trie.CharTrie()
for _, candidate in d1.iterrows():
    reversed_word = candidate['word'][::-1]
    t[reversed_word] = candidate

In [11]:
# xm2: parent word -> {child word -> summed cnt}, where the parent of a word
# is the longest *other* entry of d1 that is a suffix of that word.
xm2 = defaultdict(lambda: defaultdict(int))
for _, row in d1.iterrows():
    word = row['word']
    reversed_word = word[::-1]
    # Trie keys are reversed words, so prefixes of the reversed word are the
    # indexed words that are suffixes of `word`; the word itself is excluded.
    proper_prefixes = [
        step_key
        for step_key, _step_value in t.prefixes(reversed_word)
        if step_key != reversed_word
    ]
    if proper_prefixes:
        parent = max(proper_prefixes, key=len)[::-1]
        xm2[parent][word] += row['cnt']

In [12]:
# Keep only the rows of d1 whose word is a key of xm2.
has_children = d1['word'].isin(xm2)
xdf = d1.loc[has_children]

In [13]:
def xflex(x):
    """Return the entropy of the counts stored for a row's word in ``xm2``.

    Args:
        x: mapping-like row exposing ``'word'``.

    Returns:
        ``scipy.stats.entropy`` of ``xm2[word]`` normalised to sum to one, or
        ``1`` when the word is not a key of the module-level ``xm2``.
    """
    word = x['word']
    if word in xm2:
        counts = pd.Series(xm2[word])
        return entropy(counts / counts.sum())
    return 1

In [14]:
# Add the xflex column by building a new frame with `assign` instead of
# writing into `xdf` in place. `xdf` is a filtered selection of `d1`, and the
# in-place write only ran quietly because SettingWithCopyWarning had been
# disabled for the whole kernel, which would also hide genuine
# chained-assignment mistakes in any later cell.
xdf = xdf.assign(xflex=xdf.apply(xflex, axis=1))

In [15]:
# Hand-picked seed set of company-type suffixes.
company_type = {
    "有限公司",
    "有限责任公司",
    "店",
    "厂",
    "经营部",
    "专业合作社",
}

In [16]:
# Trie of the seed company types, keyed by the reversed suffix and storing the
# original spelling as the value.
gtrie = trie.CharTrie()
for suffix in company_type:
    reversed_suffix = suffix[::-1]
    gtrie[reversed_suffix] = suffix

In [17]:
# Split every candidate of d1 against the seed company types:
#   scope: leading part of the word -> set of company types found after it
#   types: candidates that do not end with any seed company type
# Candidates that are exactly a seed company type are skipped.
scope = defaultdict(set)
types = set()
for _, row in d1.iterrows():
    word = row['word']
    # gtrie is keyed by reversed suffixes, so the longest prefix of the
    # reversed word is the longest seed company type the word ends with.
    key, _matched = gtrie.longest_prefix(word[::-1])
    if key is None:
        types.add(word)
        continue
    if len(key) == len(word):
        # The word is itself a company type; there is no leading part.
        continue
    # Remove exactly the matched suffix. str.rstrip() must not be used here:
    # its argument is a set of characters, so it would keep stripping any
    # trailing character that occurs in the suffix and could eat into the
    # leading part (or leave it empty).
    scope[word[:-len(key)]].add(key[::-1])

In [18]:
# Keep the scope keys that do not start with an opening parenthesis. Empty
# keys are skipped explicitly: indexing w[0] on an empty string raises
# IndexError, and an empty scope carries no information anyway.
scope_list = [w for w in scope if w and not w.startswith("(")]

In [19]:
# Display the collected scope keys.
scope_list

['贸易',
 '科技',
 '商贸',
 '种植',
 '设备',
 '工程',
 '养殖',
 '物流',
 '服务',
 '投资管理',
 '广告',
 '房地产开发',
 '加工',
 '科技发展',
 '销售',
 '文化传播',
 '运输',
 '服装',
 '咨询',
 '设计',
 '电子',
 '投资咨询',
 '技术',
 '发展',
 '电子科技',
 '开发',
 '制造',
 '生物科技',
 '信息咨询',
 '机电设备',
 '集团',
 '投资',
 '装饰工程',
 '机械',
 '咨询服务',
 '材料',
 '配件',
 '百货',
 '金属材料',
 '建筑工程',
 '网络科技',
 '制品',
 '顾问',
 '建材',
 '养殖农民',
 '农民',
 '管理',
 '安装工程',
 '维修',
 '实业',
 '信息技术',
 '化工',
 '电器',
 '策划',
 '电子商务',
 '建筑材料',
 '信息科技',
 '建设',
 '企业管理咨询',
 '连锁',
 '科技开发',
 '理发',
 '物业管理',
 '制衣',
 '租赁']

In [20]:
# All company types: the ones discovered in d1 plus the hand-picked seeds.
type_list = list(types | company_type)

In [21]:
# Display the combined list of company types.
type_list

['服务部',
 '种植场',
 '经销处',
 '网吧',
 '柜',
 '档',
 '工程公司',
 '维修部',
 '(吊销)',
 '销售部',
 '场',
 '服务中心',
 '专业合作社',
 '开发公司',
 '站',
 '部',
 '摊',
 '所',
 '(普通合伙)',
 '(微型企业)',
 '有限公司分公司',
 '中心',
 '行',
 '庄',
 '经营部',
 '家庭农场',
 '猪肉档',
 '处',
 '服务站',
 '加油站',
 '园',
 '营业厅',
 '厅',
 '营业部',
 '贸易部',
 '办事处',
 '厂',
 '屋',
 '室',
 '商行',
 '有限责任公司',
 '贸易商行',
 '服务公司',
 '门市',
 '门市部',
 '代表处',
 '有限公司',
 '工作室',
 '店',
 '馆',
 '养殖场',
 '院',
 '经销部']

In [22]:
# Look up the row of the full table whose word is '(吊销)'.
df.loc[df['word'].eq('(吊销)')]

Unnamed: 0,word,cnt,err,poly,flex,score
777,(吊销),552,0,0.786325,1.375065,1.081248
