In [1]:
# %load_ext autoreload
# %autoreload 2
import pickle
from collections import Counter
import pandas as pd
import csv
import json

# Directory prefixes, relative to the notebook's working directory. Both end
# with '/' because paths are built by plain string concatenation.
# rpath: raw input data (company_total.json is read from this directory).
rpath = '../data/raw/'
# wpath: where comp_dict.pkl / comps.csv are read from and where the CSV / TXT
# outputs of this notebook are written.
wpath = '../data/tyc/'

### company 信息抽取
* 股东信息: company-human, company-company
* 对外投资: company-company
* 竞品信息: company-company
* 供应商: company-company
* 历史股东: company-human, company-company

产出：
* comp_info.csv：公司数值和分类信息
* comp_description.txt：公司分好词的描述文本
* comp_gudong_comp.csv：股东信息，['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'ratio', 'date']
* comp_dwtz_comp.csv：对外投资，在上述列的基础上多一列 'status'（经营状态）
* comp_jingpin_comp.csv：竞品信息，'ratio' 列换成 'industry'（产品标签）
* comp_gongying_comp.csv：供应商，没有 'ratio' 列
* comp_lsgudong_comp.csv：历史股东，列与股东信息相同

In [2]:
# All crawled companies. Judging by how it is used in later cells, comp_dict
# maps a company id (e.g. 'company/51012764') to the company name.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load caches produced by this project.
# The `with` block closes the file handle as soon as the pickle is read.
with open(wpath + 'comp_dict.pkl', 'rb') as pkl_file:
    comp_dict = pickle.load(pkl_file)

In [3]:
# Parse the raw company dump (one JSON object per line) into:
#   comp_info        -- cid -> {'cname', 'address', 'industry': [tag, ...]}
#   comp_description -- cid -> text of the record's '简介' field
#   comp_* lists     -- company-to-company edge tuples, one list per relation
comp_info = {}
comp_description = {}
comp_gudong = []     # from '股东信息': (src_cid, dst_cid, 持股比例, 日期)
comp_touzi = []      # from '对外投资': (src_cid, dst_cid, 投资比例, 经营状态, 日期)
comp_jingpin = []    # from '竞品信息': (src_cid, dst_cid, 产品标签, 日期)
comp_gongying = []   # from '供应商':   (src_cid, dst_cid, 日期)
comp_lsgudong = []   # from '历史股东': (src_cid, dst_cid, 持股比例, 日期)

# JSON section name -> (list receiving its edges, fields copied after the two ids).
# The dict order is the order in which the sections are processed per record.
relation_specs = {
    '股东信息': (comp_gudong, ('持股比例', '日期')),
    '对外投资': (comp_touzi, ('投资比例', '经营状态', '日期')),
    '竞品信息': (comp_jingpin, ('产品标签', '日期')),
    '供应商': (comp_gongying, ('日期',)),
    '历史股东': (comp_lsgudong, ('持股比例', '日期')),
}

# `with` closes the dump once it has been read; the path is built from rpath so
# the raw-data directory is configured in a single place.
with open(rpath + 'company_total.json', encoding='utf-8') as raw_file:
    for raw_line in raw_file:
        if not raw_line.strip():
            continue  # a blank line is not a record; json.loads would reject it
        record = json.loads(raw_line)

        comp_id = record['id']

        comp_description[comp_id] = record['简介']

        comp_info[comp_id] = {
            'cname': record['名称'],
            'address': record['地址'],
            # keep only business entries that actually carry a '标签' (tag)
            'industry': [biz['标签'] for biz in record.get('企业业务', []) if '标签' in biz],
        }

        for section, (edges, fields) in relation_specs.items():
            for d in record.get(section, []):
                # Keep an edge only when its target has a CompanyID that is a
                # key of comp_dict (i.e. the target company is known).
                if 'CompanyID' not in d or d['CompanyID'] not in comp_dict:
                    continue
                # Missing fields still raise KeyError, as before, so schema
                # problems in the dump are not silently hidden.
                edges.append((comp_id, d['CompanyID'], *(d[field] for field in fields)))

In [4]:
# Bundle the five edge lists under readable labels so they can be reported on
# (and iterated over) as a group.
comp_graph = {
    '股东信息 comp_gudong': comp_gudong,
    '历史股东 comp_lsgudong': comp_lsgudong,
    '供应商 comp_gongying': comp_gongying,
    '竞品信息 comp_jingpin': comp_jingpin,
    '对外投资 comp_touzi': comp_touzi,
}

# Edge count per relation type.
for label, edges in comp_graph.items():
    print('#', label, len(edges))

# Company-level record counts, then the total number of edges.
print('# 公司基本信息 comp_info', len(comp_info))
print('# 公司描述 comp_description', len(comp_description))
print('# total', sum(map(len, comp_graph.values())))

# 股东信息 comp_gudong 38978
# 历史股东 comp_lsgudong 15120
# 供应商 comp_gongying 25354
# 竞品信息 comp_jingpin 116955
# 对外投资 comp_touzi 27628
# 公司基本信息 comp_info 92524
# 公司描述 comp_description 92524
# total 224035


给全部公司节点编号，顺序为：投资关系公司，其余公司按节点度数从大到小。

In [5]:
# Table of companies whose ids (column `cid`) are placed first when the nodes
# are numbered in the cells below.
comps = pd.read_csv(wpath + 'comps.csv')

In [6]:
# Collect both endpoints of every edge across all relation types.
src, dst = [], []
for edges in comp_graph.values():
    # The first two fields of every edge tuple are (src_cid, dst_cid); any
    # further fields are ignored here. Iterating the edges directly, instead
    # of unpacking zip(*edges), also works when a relation list is empty
    # (the zip form raises ValueError in that case).
    for src_cid, dst_cid, *_ in edges:
        src.append(src_cid)
        dst.append(dst_cid)

# Degree of a node = number of edge endpoints it occupies over all relations.
# nodes_count is [(cid, degree), ...] sorted by degree, highest first; equal
# degrees keep first-encountered order, the same as a stable descending sort.
nodes = src + dst
nodes_count = Counter(nodes).most_common()

In [7]:
# Sanity check: (distinct nodes, source endpoints, target endpoints, all endpoints).
len(nodes_count), len(src), len(dst), len(nodes)

(67388, 224035, 224035, 448070)

In [8]:
# Number every company: ids from comps.csv come first (in file order), followed
# by the remaining graph nodes in the descending-degree order of nodes_count.
seed_cids = list(comps.cid)
comp_set = set(seed_cids)
filtered_nodes = [cid for cid, _degree in nodes_count if cid not in comp_set]

comp_list = seed_cids + filtered_nodes
# cid -> position in comp_list; this position is the node index written to the
# exported edge files.
comp_ind = dict(zip(comp_list, range(len(comp_list))))

In [9]:
# Total number of numbered companies (comps.csv ids plus the extra graph nodes).
print('# nodes', len(comp_list))

# nodes 75140


保存所有公司 id（cid）和名称的对应；文件不单独存标号列，行的顺序即公司标号：

In [10]:
# Persist the cid -> cname lookup for every numbered company. Rows follow
# comp_list order, so a row's position equals that company's node index.
df = pd.DataFrame({
    'cid': comp_list,
    'cname': list(map(comp_dict.__getitem__, comp_list)),
})
df.to_csv(wpath + 'comps_total.csv', index=False)

保存公司基本信息：

In [11]:
# Spot-check the parsed basic info of one company (hard-coded sample id).
comp_info['company/51012764']

{'cname': '青之白（北京）投资管理有限公司',
 'address': '北京市东城区板桥南巷7号北楼(9号楼)一层西侧甲1号',
 'industry': ['企业服务', '社交社区']}

In [12]:
# Build the per-company info table in node-index order, one pass over comp_list.
cname_col, address_col, industry_col = [], [], []
for cid in comp_list:
    info = comp_info[cid]
    cname_col.append(comp_dict[cid])
    address_col.append(info['address'])
    # several industry tags are flattened into a single ':'-separated cell
    industry_col.append(':'.join(info['industry']))

comp_info_df = pd.DataFrame({
    'cid': comp_list,
    'cname': cname_col,
    'address': address_col,
    'industry': industry_col,
})

comp_info_df.to_csv(wpath + 'comp_info.csv', index=False)

保存分好词的公司描述文本：

In [13]:
import jieba

# NOTE(review): enable_paddle() only makes jieba's Paddle mode available; the
# jieba.cut() call below does not pass use_paddle=True, so segmentation runs in
# jieba's default mode. Confirm whether Paddle mode was intended.
jieba.enable_paddle()

# Stop-word list, one entry per line. The context manager closes the file
# instead of leaving the handle open for the life of the kernel.
with open('../data/hit_stopwords.txt', encoding='utf-8') as stop_file:
    stopwords = {entry.strip() for entry in stop_file}

# Placeholder texts treated as "no description".
EMPTY_DESCRIPTIONS = ('暂无信息', '收起')

with open(wpath + 'comp_description.txt', 'w', encoding='utf-8') as f:
    for cid in comp_list:
        desc = comp_description[cid].strip()
        if desc in EMPTY_DESCRIPTIONS:
            # Still emit an (empty) line so that line i of the file always
            # belongs to comp_list[i].
            print(file=f)
            continue
        # Flatten newlines/tabs so each description stays on one output line.
        desc = desc.replace('\n', ' ').replace('\t', ' ')
        # Drop stop words and whitespace-only tokens. Whitespace tokens are
        # never in the stripped stop-word set, so without the strip() check
        # they would be written out as "words" and yield runs of spaces.
        segs = [tok for tok in jieba.cut(desc) if tok.strip() and tok not in stopwords]
        print(' '.join(segs), file=f)

Paddle enabled successfully......
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/45/mgt8rsp92j72ld6w0xzzf5wm0000gn/T/jieba.cache
Loading model cost 1.058 seconds.
Prefix dict has been built successfully.


保存每种关系图:

股东信息：

In [14]:
# Peek at one shareholder edge: (src_cid, dst_cid, ratio, date).
comp_gudong[1]

('company/34111476', 'company/644426904', '6.72%', '2016-01-15')

In [15]:
# Export the shareholder edges. Columns:
# src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,ratio,date
# src is the company whose record listed the relation, dst the related company.
# newline='' is required by the csv module for files handed to csv.writer;
# without it the writer's '\r\n' row terminator is translated a second time on
# Windows, which shows up as a blank line after every row.
with open(wpath + 'comp_gudong_comp.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'ratio', 'date'])
    writer.writerows(
        (comp_ind[src_cid], src_cid, comp_dict[src_cid],
         comp_ind[dst_cid], dst_cid, comp_dict[dst_cid], ratio, date)
        for src_cid, dst_cid, ratio, date in comp_gudong
    )

历史股东：

In [16]:
# Peek at one former-shareholder edge: (src_cid, dst_cid, ratio, date).
comp_lsgudong[10]

('company/151363476', 'company/167853964', '-', '2018-05-07')

In [17]:
# Export the former-shareholder edges. Columns:
# src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,ratio,date
# src is the company whose record listed the relation, dst the related company.
# newline='' is required by the csv module for files handed to csv.writer;
# without it the writer's '\r\n' row terminator is translated a second time on
# Windows, which shows up as a blank line after every row.
with open(wpath + 'comp_lsgudong_comp.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'ratio', 'date'])
    writer.writerows(
        (comp_ind[src_cid], src_cid, comp_dict[src_cid],
         comp_ind[dst_cid], dst_cid, comp_dict[dst_cid], ratio, date)
        for src_cid, dst_cid, ratio, date in comp_lsgudong
    )

对外投资：

In [18]:
# Peek at one outbound-investment edge: (src_cid, dst_cid, ratio, status, date).
comp_touzi[1]

('company/98664648', 'company/160072541', '100%', '存续', '2003-09-22')

In [19]:
# Export the outbound-investment edges. Columns:
# src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,ratio,status,date
# src is the company whose record listed the relation, dst the related company.
# newline='' is required by the csv module for files handed to csv.writer;
# without it the writer's '\r\n' row terminator is translated a second time on
# Windows, which shows up as a blank line after every row.
with open(wpath + 'comp_dwtz_comp.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'ratio', 'status', 'date'])
    writer.writerows(
        (comp_ind[src_cid], src_cid, comp_dict[src_cid],
         comp_ind[dst_cid], dst_cid, comp_dict[dst_cid], ratio, status, date)
        for src_cid, dst_cid, ratio, status, date in comp_touzi
    )

竞品信息：

In [20]:
# Peek at one competitor edge: (src_cid, dst_cid, product tag, date).
comp_jingpin[1]

('company/34111476', 'company/112622245', '文娱传媒', '2014-11-25')

In [21]:
# Export the competitor edges. Columns:
# src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,industry,date
# src is the company whose record listed the relation, dst the related company;
# `industry` holds the edge's product tag.
# newline='' is required by the csv module for files handed to csv.writer;
# without it the writer's '\r\n' row terminator is translated a second time on
# Windows, which shows up as a blank line after every row.
with open(wpath + 'comp_jingpin_comp.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'industry', 'date'])
    writer.writerows(
        (comp_ind[src_cid], src_cid, comp_dict[src_cid],
         comp_ind[dst_cid], dst_cid, comp_dict[dst_cid], industry, date)
        for src_cid, dst_cid, industry, date in comp_jingpin
    )

供应商：

In [22]:
# Peek at one supplier edge: (src_cid, dst_cid, date).
comp_gongying[1]

('company/3346731739', 'company/2347149784', '2019-04-29')

In [23]:
# Export the supplier edges. Columns:
# src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,date
# src is the company whose record listed the relation, dst the related company.
# newline='' is required by the csv module for files handed to csv.writer;
# without it the writer's '\r\n' row terminator is translated a second time on
# Windows, which shows up as a blank line after every row.
with open(wpath + 'comp_gongying_comp.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['src_ind', 'src_cid', 'src_cname', 'dst_ind', 'dst_cid', 'dst_cname', 'date'])
    writer.writerows(
        (comp_ind[src_cid], src_cid, comp_dict[src_cid],
         comp_ind[dst_cid], dst_cid, comp_dict[dst_cid], date)
        for src_cid, dst_cid, date in comp_gongying
    )