In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
path = '../data/tyc/'
data = pd.read_csv(path + 'comp_touzi_comp.csv')

In [3]:
df = data.set_index(data['date'])
df = df.sort_index()

df

Unnamed: 0_level_0,src_ind,src_cid,src_cname,dst_ind,dst_cid,dst_cname,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1984-02-10,2127,company/2668627629,深圳市向日葵基金合伙企业（有限合伙）,46726,company/732735218,深圳赤湾石油基地股份有限公司,1984-02-10
1985-08-20,1894,company/24212775,天津津融投资服务集团有限公司,27177,company/264022019,天津国际联合轮胎橡胶股份有限公司,1985-08-20
1987-03-31,2127,company/2668627629,深圳市向日葵基金合伙企业（有限合伙）,47645,company/79878189,招商银行股份有限公司,1987-03-31
1989-06-01,360,company/146170414,中冶建工集团有限公司,46934,company/7507939,中冶宝钢技术服务有限公司,1989-06-01
1990-03-19,3516,company/367314478,东证锦信投资管理有限公司,46284,company/6941042,虎彩印艺股份有限公司,1990-03-19
...,...,...,...,...,...,...,...
2020-11-30,95,company/113671400,北京文资数码投资管理有限公司,28153,company/28702414,北京伟德杰生物科技有限公司,2020-11-30
2020-11-30,245,company/13027951,和才（天津）股权投资基金管理有限公司,11683,company/227664287,天津达因建材有限公司,2020-11-30
2020-11-30,3109,company/3152144800,新余昆诺投资管理有限公司,37002,company/3224097797,北京十荟科技有限公司,2020-11-30
2020-11-30,4766,company/3389874109,上海张江浩珩创新股权投资管理有限公司,14097,company/2314841725,上海海栎创微电子有限公司,2020-11-30


In [4]:
print(f'user.max: {data.src_ind.max()}, item.max: {data.dst_ind.max()}')

src_set, dst_set = set(data.src_ind), set(data.dst_ind)
print(f'#user: {len(src_set)}, #item: {len(dst_set)}, user & item: {len(src_set&dst_set)}, user | item: {len(src_set|dst_set)}')

user.max: 4901, item.max: 49592
#user: 4902, #item: 45158, user & item: 467, user | item: 49593


In [5]:
# print #records after date
def print_stat(dt):
    comps = set(df[dt:]['src_ind'])
    print(f'{dt}~: {len(comps)} comps, {len(df[dt:])} records')

In [6]:
print_stat('2020-01-01')
print_stat('2019-11-30')
print_stat('2019-06-30')
print_stat('2019-01-01')
print_stat('2018-01-01')

2020-01-01~: 1560 comps, 4980 records
2019-11-30~: 1722 comps, 5753 records
2019-06-30~: 2268 comps, 8864 records
2019-01-01~: 2886 comps, 13697 records
2018-01-01~: 3835 comps, 27075 records


In [7]:
# remove names
df_ = df[['src_ind', 'dst_ind', 'date']]
df_.insert(3, 'label', 1)

train_df = df_[:'2018-12-31']
valid_df = df_['2019-01-01':'2019-12-31']
test_df = df_['2020-01-01':]

train_df

Unnamed: 0_level_0,src_ind,dst_ind,date,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984-02-10,2127,46726,1984-02-10,1
1985-08-20,1894,27177,1985-08-20,1
1987-03-31,2127,47645,1987-03-31,1
1989-06-01,360,46934,1989-06-01,1
1990-03-19,3516,46284,1990-03-19,1
...,...,...,...,...
2018-12-31,983,11459,2018-12-31,1
2018-12-31,1236,4435,2018-12-31,1
2018-12-31,2993,39274,2018-12-31,1
2018-12-31,1680,42963,2018-12-31,1


In [8]:
def negative_sampling(df, dst_set, k=99):
    neg_src = np.repeat(df['src_ind'], k)
    dst = list(dst_set)
    neg_dst = [random.choice(dst) for _ in range(len(neg_src))]
    test_neg_df = pd.DataFrame({'src_ind': neg_src, 'dst_ind': neg_dst, 'label': 0})
    return test_neg_df

In [9]:
def negative_sampling_by_user(df, dst_set, k=99):
    neg_src = np.repeat(list(set(df['src_ind'])), k)
    dst = list(dst_set)
    neg_dst = [random.choice(dst) for _ in range(len(neg_src))]
    test_neg_df = pd.DataFrame({'src_ind': neg_src, 'dst_ind': neg_dst, 'label': 0})
    return test_neg_df

In [10]:
random.seed(523)
test_neg_df = negative_sampling(test_df, set(data.dst_ind))
test_neg_user_df = negative_sampling_by_user(test_df, set(data.dst_ind))

len(test_df), len(test_neg_df), len(test_neg_user_df)

(4980, 493020, 154440)

In [11]:
def intersect(df1: pd.DataFrame, df2):
    df1 = df1.set_index(['src_ind', 'dst_ind'])
    df2 = df2.set_index(['src_ind', 'dst_ind'])
    return df1.join(df2, how='inner', lsuffix='_pos', rsuffix='_neg')

In [12]:
intersect(test_df, test_neg_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,label_pos,label_neg
src_ind,dst_ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,37516,2020-05-13,1,0
37,18565,2020-07-07,1,0
76,46099,2020-03-03,1,0
131,34600,2020-01-13,1,0
191,38623,2020-03-20,1,0
...,...,...,...,...
4396,22544,2020-03-11,1,0
4398,35668,2020-08-18,1,0
4434,15226,2020-09-27,1,0
4669,36767,2020-03-16,1,0


In [13]:
intersect(test_df, test_neg_user_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,label_pos,label_neg
src_ind,dst_ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
525,37046,2020-03-17,1,0
1325,31706,2020-04-24,1,0
3215,19818,2020-06-02,1,0
3301,44560,2020-05-28,1,0
3466,26803,2020-03-19,1,0
3850,19933,2020-09-24,1,0
3859,33914,2020-08-04,1,0


In [14]:
train_df.to_csv(path + 'train.csv', index=False)
valid_df.to_csv(path + 'valid.csv', index=False)
test_df.to_csv(path + 'test.csv', index=False)
test_neg_df.to_csv(path + 'test_neg.csv', index=False)
test_neg_user_df.to_csv(path + 'test_neg_user.csv', index=False)