# 金融 证券 图谱搭建

In [15]:
import pandas as pd
import hashlib

In [16]:
def get_md5(string):
    """
    Return the hexadecimal MD5 digest of ``string``.

    The text is encoded as UTF-8 before hashing, so the same string always
    yields the same 32-character lowercase hex value.
    """
    return hashlib.md5(string.encode("utf-8")).hexdigest()

In [17]:
def _row_hash_key(frame, keys):
    """Return one MD5 digest per row of ``frame``, built from the ``keys`` columns joined with '-'."""
    key_columns = [frame[k] for k in keys]
    return [get_md5('-'.join('{}'.format(v) for v in row)) for row in zip(*key_columns)]


def filter_update_data(filename,path_origin,path_new,labels,keys,names,types = None,path_out = None):
    """
    Select the node / relationship rows that need updating.

    Reads the header-less CSV ``filename`` from both ``path_origin`` and
    ``path_new``, left-joins the new rows onto the original ones by ``keys``
    and keeps only the rows that are new or whose values changed.  In the
    result, every non-key value that is identical in both files is replaced
    by the marker string "NotChange".  The result is written to the output
    directory under the same ``filename`` and also returned.

    Parameters
    ----------
    filename : str
        CSV file name; appended directly to the directory arguments, so the
        directories must end with a path separator.
    path_origin : str
        Directory of the original export.
    path_new : str
        Directory of the new export.
    labels : str
        Column that is always taken from the new file and never diffed.
    keys : list of str
        Columns that identify a record; rows with a null key are dropped.
    names : list of str
        Column names assigned to the CSV columns, in file order.
    types : str, optional
        When 'rel_manager', the key columns are hashed into a single
        ``hash_key`` column and the join is done on that column instead.
    path_out : str, optional
        Output directory.  Defaults to the module-level ``out_path``.

    Returns
    -------
    pandas.DataFrame
        The new or changed rows, restricted to the ``names`` columns.
    """
    data_origin = pd.read_csv(path_origin + filename,header=None,names=names,dtype=str)
    data_new = pd.read_csv(path_new + filename,header=None,names=names,dtype=str)

    # Join the new export onto the original one.
    if types == 'rel_manager':
        # Hash the composite key into one column to speed up the join.
        data_origin['hash_key'] = _row_hash_key(data_origin, keys)
        data_new['hash_key'] = _row_hash_key(data_new, keys)
        data = pd.merge(data_new,data_origin,on = ['hash_key'],how = 'left')
        # Joining on hash_key suffixes every column; restore the key names
        # from the new-file side.
        data = data.rename(columns={k + '_x': k for k in keys})
    else:
        data = pd.merge(data_new,data_origin,on = keys,how = 'left')

    # Drop rows with a missing key in one pass; the copy keeps the column
    # assignments below from writing into a view of the merged frame.
    data = data[data[keys].notnull().all(axis=1)].copy()

    value_columns = [c for c in names if c not in keys]

    for c in value_columns:
        if c == labels:
            continue
        new_values = data[c + '_x']
        # Keep the new value only where it differs from the original one;
        # unchanged cells become null.
        data[c] = new_values.where(new_values != data[c + '_y'])
    data[labels] = data[labels + '_x']

    # Number of non-null non-key cells per row.  The label column is always
    # copied, so a count of exactly 1 means nothing but the label is set,
    # i.e. the row is unchanged.
    data['tags'] = data[value_columns].notnull().sum(axis=1)

    te = data.loc[data['tags'] != 1 ,names].drop_duplicates().fillna("NotChange")

    # Save the result.
    target_dir = out_path if path_out is None else path_out
    te.to_csv(target_dir+filename, index =False)
    print(te.shape)
    print('-'*20)
    print(data.tags.value_counts())
    return te

## 数据处理

In [18]:
# Directory where filter_update_data writes the changed rows of each file.
out_path = './data_update/gap/'

# Directory of the original export, read as the baseline of the comparison.
path_origin = './data/output/'

# Directory of the new export, compared against the baseline.
path_new = './data_update/output/'

### 公司
合并处理基金管理人、基金托管人、上市公司。

In [22]:
# Company node file to compare.
filename = 'node_companies.csv'

In [23]:
# 'label' is copied from the new export unchanged; 'fullname' is the join key.
labels = 'label'
keys = ['fullname']

In [24]:
# Column names assigned to the header-less CSV, in file order.
names = ['fullname', 'name', 'symbol', 'market', 'exchange', 'list_status', 'list_date',
         'delist_date', 'setup_date', 'label']

In [25]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(49, 10)
--------------------
1    18955
8       37
2        7
3        5
Name: tags, dtype: int64


### 节点：行业

In [26]:
# Industry node file to compare.
filename = 'node_industries.csv'

In [27]:
# 'label' is copied from the new export unchanged; 'industry' is the join key.
labels = 'label'
keys = ['industry']

In [28]:
# Column names assigned to the header-less CSV, in file order.
names = ['industry', 'label']

In [29]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 2)
--------------------
1    110
Name: tags, dtype: int64


### 关系：股票-->行业

In [30]:
# Stock-to-industry relationship file to compare.
filename = 'rel_share_in_industry.csv'

In [31]:
# 'type' is copied from the new export unchanged; both endpoints form the join key.
labels = 'type'
keys = ['fullname', 'industry']

In [32]:
# Column names assigned to the header-less CSV, in file order.
names = ['fullname', 'industry', 'type']

In [33]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 3)
--------------------
1    3766
Name: tags, dtype: int64


### 节点：省份

In [34]:
# Province node file to compare.
filename = 'node_province.csv'

In [35]:
# 'label' is copied from the new export unchanged; 'province' is the join key.
labels = 'label'
keys = ['province']

In [36]:
# Column names assigned to the header-less CSV, in file order.
names = ['province', 'label']

In [37]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 2)
--------------------
1    32
Name: tags, dtype: int64


### 节点：城市

In [38]:
# City node file to compare.
filename = 'node_city.csv'

In [39]:
# 'label' is copied from the new export unchanged; 'city' is the join key.
labels = 'label'
keys = ['city']

In [40]:
# Column names assigned to the header-less CSV, in file order.
names = ['city', 'label']

In [41]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 2)
--------------------
1    341
Name: tags, dtype: int64


### 关系：公司-->城市

In [42]:
# Company-to-city relationship file to compare.
filename = 'rel_company_in_city.csv'

In [43]:
# 'type' is copied from the new export unchanged; both endpoints form the join key.
labels = 'type'
keys = ['fullname', 'city']

In [44]:
# Column names assigned to the header-less CSV, in file order.
names = ['fullname', 'city', 'type']

In [45]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 3)
--------------------
1    18964
Name: tags, dtype: int64


### 关系：城市-->省份

In [46]:
# City-to-province relationship file to compare.
filename = 'rel_city_in_province.csv'

In [47]:
# Both endpoints form the join key.
keys = ['city', 'province']

In [48]:
# 'type' is copied from the new export unchanged; names are the CSV columns in file order.
labels = 'type'
names = ['city', 'province', 'type']

In [49]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 3)
--------------------
1    341
Name: tags, dtype: int64


### 节点：人（上市公司董事高管）

In [50]:
# Manager (person) node file to compare.
filename = 'node_managers.csv'

In [51]:
# 'label' is copied from the new export unchanged; 'hash_cust' is the join key.
labels = 'label'
keys = ['hash_cust']

In [52]:
# Column names assigned to the header-less CSV, in file order.
names = ['hash_cust', 'name', 'gender', 'edu', 'national', 'birthday', 'label']

In [53]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 7)
--------------------
1    163613
Name: tags, dtype: int64


### 关系：公司（上市公司）--> 人(董事高管)

对于公司与高管之间的关系，不同于上述其他节点之间的关系，会存在多条边的情况。因为一个管理者在一个任期中，会出现以下两种情况：
- 任期结束--> 更新结束时间
- 职位调整 --> 更新结束时间 新建一条边。

鉴于这种复杂性，这部分数据的更新需要单独处理。

即通过公司（fullname）、人（hash_cust）、职位（title）、上任时间（begin_date）四个变量去定位一条边。



![](pictures/4b9ce4c5-c9ca-4e10-ba98-6deced93d145.png)

In [85]:
# Listed-company-to-manager relationship file to compare.
filename = 'rel_listed_company_has_manager.csv'

In [86]:
# 'lev' is copied from the new export unchanged; an edge is identified by
# the four columns company, person, title and begin date.
labels = 'lev'
keys = ['fullname','hash_cust','title','begin_date']

In [87]:
# Column names assigned to the header-less CSV, in file order.
names = ['fullname', 'hash_cust', 'begin_date', 'end_date', 'title', 'lev']

In [88]:
# types='rel_manager' makes filter_update_data join on a hash of the key columns.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names, types='rel_manager')

(1251, 6)
--------------------
1    429847
2      1251
Name: tags, dtype: int64


In [89]:
# Preview the first rows of the saved manager-relationship changes.
te.head()

Unnamed: 0,fullname,hash_cust,begin_date,end_date,title,lev
5240,首航高科能源技术股份有限公司,f756ce1cd2959c31aefbf00613ce741b,20101214,20131107,薪酬与考核委员会委员,其他
5241,首航高科能源技术股份有限公司,f756ce1cd2959c31aefbf00613ce741b,20101214,20131107,薪酬与考核委员会主任,其他
5242,首航高科能源技术股份有限公司,f756ce1cd2959c31aefbf00613ce741b,20101214,20131107,独立董事,董事会成员
5243,首航高科能源技术股份有限公司,f756ce1cd2959c31aefbf00613ce741b,20101214,20131107,提名委员会委员,其他
5244,首航高科能源技术股份有限公司,f756ce1cd2959c31aefbf00613ce741b,20101214,20131107,战略委员会委员,其他


### 节点：基金

In [58]:
# Fund node file to compare.
filename = 'node_funds.csv'

In [59]:
# 'label' is copied from the new export unchanged; 'fund' is the join key.
labels = 'label'
keys = ['fund']

In [60]:
# Column names assigned to the header-less CSV, in file order.
names = ['fund', 'name', 'fund_type', 'invest_type', 'type', 'benchmark', 'market', 
         'found_date', 'delist_date', 'status', 'label']

In [61]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(378, 11)
--------------------
1     10658
10      186
3       139
2        37
4        11
6         5
Name: tags, dtype: int64


### 关系: 基金--->托管人 

In [91]:
# Fund-to-custodian relationship file to compare.
filename = 'rel_fund_has_custodian.csv'

In [92]:
# 'type' is copied from the new export unchanged; both endpoints form the join key.
labels = 'type'
keys = ['fund', 'fullname']

In [93]:
# Column names assigned to the header-less CSV, in file order.
names = ['fund', 'fullname', 'type']

In [94]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 3)
--------------------
1    11100
Name: tags, dtype: int64


In [95]:
# Show the (rows, columns) size of the saved custodian-relationship changes.
te.shape

(0, 3)

### 关系: 公募基金 --> 管理人

In [96]:
# Fund-to-management-company relationship file to compare.
filename = 'rel_fund_has_management.csv'

In [97]:
# 'type' is copied from the new export unchanged; both endpoints form the join key.
labels = 'type'
keys = ['fund', 'fullname']

In [98]:
# Column names assigned to the header-less CSV, in file order.
names = ['fund', 'fullname', 'type']

In [99]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(0, 3)
--------------------
1    11112
Name: tags, dtype: int64


In [100]:
# Show the (rows, columns) size of the saved management-relationship changes.
te.shape

(0, 3)

### 关系: 公募基金持仓数据 

In [62]:
# Fund portfolio (fund-to-listed-company holding) relationship file to compare.
filename = 'rel_fund_listed_company_portfolio.csv'

In [63]:
# 'type' is copied from the new export unchanged; fund and company form the join key.
labels = 'type'
keys = ['fund', 'fullname']

In [64]:
# Column names assigned to the header-less CSV, in file order.
names = ['fund', 'fullname', 'ann_date', 'end_date', 'mkv', 'amount', 'stk_mkv_ratio',
         'stk_float_ratio', 'type']

In [69]:
# Compare the new export with the original one and save the new/changed rows.
te = filter_update_data(filename,path_origin,path_new,labels,keys,names)

(54559, 9)
--------------------
1    1404484
7      29049
6      18456
5       6777
4        208
2         69
Name: tags, dtype: int64


In [70]:
# Show the (rows, columns) size of the saved portfolio changes.
te.shape

(54559, 9)

In [72]:
# Preview the last rows of the saved portfolio changes.
te.tail()

Unnamed: 0,fund,fullname,ann_date,end_date,mkv,amount,stk_mkv_ratio,stk_float_ratio,type
1461735,1,山东天鹅棉业机械股份有限公司,20160827,20160630,39006.62,874.0,0.0,0.0,IN_PORTFOLIO
1461930,1,南极电商股份有限公司,20200121,20191231,149832015.87,NotChange,4.31,NotChange,IN_PORTFOLIO
1462074,1,保利联合化工控股集团股份有限公司,20130328,20121231,8927779.35,363657.0,0.15,0.21,IN_PORTFOLIO
1462075,1,保利发展控股集团股份有限公司,20200121,20191231,303983999.02,18787639.0,8.74,0.16,IN_PORTFOLIO
1462242,1,上海钢联电子商务股份有限公司,20200121,20191231,193610279.2,2488564.0,5.57,1.63,IN_PORTFOLIO


下面两条记录将用于更新前后的对比：

|fund|fullname|ann_date|end_date|mkv|amount|stk_mkv_ratio|stk_float_ratio|type|
|--|--|--|--|--|--|--|--|--|
|000001|南极电商股份有限公司|20200121|20191231|149832015.87|NotChange|4.31|NotChange|IN_PORTFOLIO|
|000001|保利联合化工控股集团股份有限公司|20130328|20121231|8927779.35|363657.0|0.15|0.21|IN_PORTFOLIO|