In [1]:
import pandas as pd
import numpy as np
import pygtrie as trie
from collections import defaultdict
from scipy.stats import entropy

In [2]:
# Load the candidate table: a header-less, space-separated text file whose
# three fields are named word / cnt / err here.
df = pd.read_csv(
    "output_suffix.txt",
    sep=" ",
    header=None,
    names=["word", "cnt", "err"],
)

In [3]:
# m: candidate word -> its frequency (the "cnt" column).
m = {}
# m2: candidate word -> {character found directly in front of it -> summed cnt}.
m2 = defaultdict(lambda: defaultdict(int))
for word, cnt in zip(df['word'], df['cnt']):
    m[word] = cnt
    if len(word) != 1:
        # A multi-character entry contributes its first character to the
        # tally kept for its tail (the word minus that first character).
        # Single-character entries have no leading character to record.
        m2[word[1:]][word[:1]] += cnt

### 计算聚合度，聚合度定义为当前串频次与其后缀串频次的比值

In [4]:
def poly(x):
    """Cohesion of a candidate: its count divided by the count of its tail.

    `x` is a row-like mapping with 'word' and 'cnt'. The tail is the word
    without its first character, looked up in the module-level dict `m`.
    Returns 1 for single-character words and for words whose tail is not
    present in `m`.
    """
    word = x['word']
    tail = word[1:]
    if len(word) == 1 or tail not in m:
        return 1
    return x['cnt'] / m[tail]

### 计算灵活度，灵活度定义为前缀字集合的信息熵

In [5]:
def flex(x):
    """Flexibility of a candidate: entropy of its leading-character tally.

    Looks up x['word'] in the module-level mapping `m2`, normalises the
    recorded counts to a probability distribution and returns its entropy
    via scipy.stats.entropy. Returns 1 when the word has no entry in `m2`.
    """
    word = x['word']
    if word in m2:
        counts = pd.Series(m2[word])
        return entropy(counts / counts.sum())
    return 1

In [6]:
# Per-row cohesion and flexibility, then their product as the ranking score.
poly_values = df.apply(poly, axis=1)
df['poly'] = poly_values
flex_values = df.apply(flex, axis=1)
df['flex'] = flex_values
df['score'] = flex_values * poly_values

In [7]:
# Keep candidates whose score exceeds 1 and whose err/cnt ratio is below 0.5,
# ordered from highest to lowest score.
score_ok = df['score'] > 1
err_ratio_ok = (df['err'] / df['cnt']) < 0.5
d1 = df.loc[score_ok & err_ratio_ok].sort_values('score', ascending=False)

In [8]:
# Spot check: show the filtered row for one known candidate.
d1.loc[d1['word'].eq('股份有限公司')]

Unnamed: 0,word,cnt,err,poly,flex,score
264,股份有限公司,1685,1,0.999407,1.300101,1.29933


In [9]:
# Index every retained candidate by its reversed word, so that a prefix
# query on the trie corresponds to a suffix relation between the words.
t = trie.CharTrie()
for _, record in d1.iterrows():
    reversed_word = record['word'][::-1]
    t[reversed_word] = record

In [10]:
# xm2: parent word -> {candidate word -> cnt}, where the parent of a candidate
# is the longest other entry of `t` that is a proper suffix of the candidate.
xm2 = defaultdict(lambda: defaultdict(int))
for _, row in d1.iterrows():
    word = row['word']
    reversed_word = word[::-1]
    # Trie keys are reversed words, so prefixes of the reversed word are
    # suffixes of the word; the entry for the word itself is excluded.
    shorter_keys = [k for k, _value in t.prefixes(reversed_word) if k != reversed_word]
    if shorter_keys:
        parent = max(shorter_keys, key=len)[::-1]
        xm2[parent][word] += row['cnt']

In [11]:
# Candidates that act as a parent of at least one other candidate (keys of xm2).
# An explicit copy is taken because a column is added to this frame later;
# without it the frame is a filtered slice of d1 and that assignment is the
# ambiguous view-vs-copy case pandas warns about.
xdf = d1[d1['word'].isin(xm2)].copy()

In [12]:
def xflex(x):
    """Entropy of the counts recorded for x['word'] in the mapping `xm2`.

    `xm2[word]` maps candidate words to counts; the counts are normalised
    and passed to scipy.stats.entropy. Returns 1 when the word has no entry
    in `xm2`.
    """
    word = x['word']
    if word in xm2:
        counts = pd.Series(xm2[word])
        return entropy(counts / counts.sum())
    return 1

In [13]:
# Silence pandas' chained-assignment (SettingWithCopyWarning) check
# (the default setting is 'warn') before adding a column to xdf.
pd.set_option('mode.chained_assignment', None)
xflex_values = xdf.apply(xflex, axis=1)
xdf['xflex'] = xflex_values

In [14]:
# Keep only parents whose xflex value exceeds 0.1.
xdf1 = xdf.loc[xdf['xflex'].gt(0.1)]

In [15]:
# List the remaining parent words that do not end with '有限公司'.
[word for word in xdf1['word'] if not word.endswith('有限公司')]

['店',
 '厂',
 '有限责任公司',
 '经营部',
 '中心',
 '专业合作社',
 '场',
 '站',
 '部',
 '处',
 '商行',
 '室',
 '农民专业合作社']

In [16]:
# Hand-picked company-type suffixes used to split the candidates below.
company_type = {
    "有限公司",
    "有限责任公司",
    "店",
    "厂",
    "经营部",
    "专业合作社",
}

In [17]:
# Trie of the company-type suffixes keyed by their reversed spelling, with the
# original spelling stored as the value.
gtrie = trie.CharTrie()
for company_suffix in company_type:
    gtrie[company_suffix[::-1]] = company_suffix

In [18]:
# Split every retained candidate by the longest company-type suffix it ends with.
#   scope: text in front of the suffix -> set of company-type suffixes seen after it
#   types: candidates that end with none of the company-type suffixes
scope = defaultdict(set)
types = set()
for word in d1['word']:
    reversed_word = word[::-1]
    # gtrie is keyed by reversed suffixes, so the longest prefix of the
    # reversed word is the longest company-type suffix of the word.
    matched, _value = gtrie.longest_prefix(reversed_word)
    if matched is None:
        types.add(word)
        continue
    if matched == reversed_word:
        # The candidate is exactly a company type; nothing precedes it.
        continue
    suffix = matched[::-1]
    # Cut off exactly the matched suffix. str.rstrip(suffix) would treat the
    # suffix as a set of characters and could strip additional trailing
    # characters of the remaining text (or strip it to an empty string).
    scope[word[:-len(suffix)]].add(suffix)

In [19]:
# Scope words, excluding keys that start with "(". Empty keys are skipped as
# well, so an empty string in `scope` cannot raise IndexError here.
scope_list = [w for w in scope if w and not w.startswith("(")]

In [20]:
# Display the keys of `scope` that do not start with "(".
scope_list

['贸易',
 '科技',
 '商贸',
 '种植',
 '设备',
 '工程',
 '养殖',
 '物流',
 '投资管理',
 '服务',
 '广告',
 '加工',
 '房地产开发',
 '科技发展',
 '文化传播',
 '销售',
 '电子',
 '运输',
 '电子科技',
 '服装',
 '咨询',
 '设计',
 '投资咨询',
 '技术',
 '发展',
 '开发',
 '制造',
 '生物科技',
 '信息咨询',
 '机电设备',
 '集团',
 '投资',
 '装饰工程',
 '机械',
 '咨询服务',
 '材料',
 '配件',
 '百货',
 '农民',
 '安装工程',
 '金属材料',
 '网络科技',
 '制品',
 '顾问',
 '建材',
 '养殖农民',
 '管理',
 '化工',
 '实业',
 '维修',
 '建筑工程',
 '信息技术',
 '化妆品',
 '种植农民',
 '五金',
 '电器',
 '股份',
 '物业服务',
 '策划',
 '科技(上海)',
 '建筑材料',
 '信息科技',
 '建设',
 '教育咨询',
 '士多',
 '企业管理咨询',
 '连锁',
 '服饰',
 '科技开发',
 '理发',
 '物业管理',
 '百货商',
 '设备制造',
 '制衣',
 '文化传媒',
 '设计制作',
 '加盟',
 '税务师事务所',
 '食用菌种植',
 '会议服务',
 '国际贸易(上海)',
 '办公家具',
 '自来水',
 '不锈钢',
 '建设投资',
 '不锈钢制品',
 '机械设备',
 '五金塑料',
 '租赁',
 '装饰工程设计',
 '食用菌',
 '网络工程',
 '种植养殖',
 '基础工程',
 '玻璃制品',
 '办公设备',
 '再生资源',
 '润滑油',
 '机械配件',
 '人力资源',
 '投资管理咨询',
 '广告装饰',
 '装饰材料',
 '农机服务',
 '日用百货']

In [21]:
# All type-like strings: the unmatched candidates plus the hand-picked company types.
type_list = list(types | company_type)

In [22]:
# Text that follows a leading "有限公司" in the type strings. The prefix is
# removed by slicing off its exact length; str.lstrip("有限公司") would treat
# the argument as a set of characters and could also strip leading characters
# of the remainder.
ext_list = [
    x[len("有限公司"):]
    for x in type_list
    if x.startswith("有限公司") and x != "有限公司"
]

In [23]:
# Also collect the type strings whose first character is "(".
ext_list.extend(entry for entry in type_list if entry[0] == '(')

In [24]:
# Display the collected extension strings.
ext_list

['大连分公司',
 '成都分公司',
 '杭州分公司',
 '沈阳分公司',
 '分公司',
 '第一分公司',
 '(微型企业)',
 '(普通合伙)',
 '(吊销)']

In [25]:
# Spot check: show the unfiltered row for the '(吊销)' candidate.
df.loc[df['word'].eq('(吊销)')]

Unnamed: 0,word,cnt,err,poly,flex,score
777,(吊销),552,0,0.786325,1.375065,1.081248


In [26]:
# Spot check: show the unfiltered row for '股份有限公司'.
df.loc[df['word'].eq('股份有限公司')]

Unnamed: 0,word,cnt,err,poly,flex,score
264,股份有限公司,1685,1,0.999407,1.300101,1.29933


In [27]:
# Spot check: show the row for '股份有限公司' in the filtered frame.
d1.loc[d1['word'].eq('股份有限公司')]

Unnamed: 0,word,cnt,err,poly,flex,score
264,股份有限公司,1685,1,0.999407,1.300101,1.29933
