In [1]:
import pandas as pd
import pickle
from itertools import combinations

# Load data

In [2]:
df = pd.read_csv('../data/preprocessed_data.csv')
df.head()

Unnamed: 0,paper_id,affiliation_id,topic_id,country_id
0,0,0,-1,9
1,0,1,-1,9
2,0,2,-1,19
3,1,3,-1,16
4,2,4,-1,16


In [3]:
with open('../data/papers_with_pid.pkl', 'rb') as f:
    df_papers = pickle.load(f)
df_papers.head()

Unnamed: 0,paper_id,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,...,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),Web of Science Record
0,0,J,"Pickering, B; Lombardi, F; Pfenninger, S",,,,"Pickering, Bryn; Lombardi, Francesco; Pfenning...",,,Diversity of options to eliminate fossil fuels...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,2U9AT,35784823.0,"Green Published, hybrid",,,2023-10-31,WOS:000823446500009,0
1,1,J,"Fang, SM; Li, JD; Xu, Y; Shen, C; Guo, WL",,,,"Fang, Sunmiao; Li, Jidong; Xu, Ying; Shen, Chu...",,,Evaporating potential,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,ZZ3XE,,"hybrid, Green Submitted",,,2023-10-31,WOS:000773205200014,0
2,2,J,"Chen, ZY; Song, W; Yu, KB; Ge, JF; Zhang, JS; ...",,,,"Chen, Zhenyu; Song, Wei; Yu, Kuibao; Ge, Jinfe...",,,Small-molecular donor guest achieves rigid 18....,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,UR6JW,,hybrid,,,2023-10-31,WOS:000696854300013,0
3,3,J,"Zai, HC; Su, J; Zhu, C; Chen, YH; Ma, Y; Zhang...",,,,"Zai, Huachao; Su, Jie; Zhu, Cheng; Chen, Yihua...",,,Sandwiched electrode buffer for efficient and ...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,UC5CP,,hybrid,,,2023-10-31,WOS:000686543600018,0
4,4,J,"Zhang, CG; He, LX; Zhou, LL; Yang, O; Yuan, W;...",,,,"Zhang, Chuguo; He, Lixia; Zhou, Linglin; Yang,...",,,Active resonance triboelectric nanogenerator f...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,SU3GI,,hybrid,Y,N,2023-10-31,WOS:000663028600023,0


In [4]:
df_papers.columns

Index(['paper_id', 'Publication Type', 'Authors', 'Book Authors',
       'Book Editors', 'Book Group Authors', 'Author Full Names',
       'Book Author Full Names', 'Group Authors', 'Article Title',
       'Source Title', 'Book Series Title', 'Book Series Subtitle', 'Language',
       'Document Type', 'Conference Title', 'Conference Date',
       'Conference Location', 'Conference Sponsor', 'Conference Host',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses',
       'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Vo

In [5]:
df_papers['Publication Month'] = [d.split()[0] for d in df_papers['Publication Date']]

df_former = df_papers.loc[df_papers['Publication Year'].isin((2018, 2019))]
df_former = pd.concat([df_former, df_papers.loc[df_papers['Publication Year']==2020].loc[df_papers['Publication Month'].isin(['JAN','FEB','MAR','APR','MAY','JUN'])]])

df_latter = df_papers.loc[df_papers['Publication Year'].isin((2021, 2022))]
df_latter = pd.concat([df_latter, df_papers.loc[df_papers['Publication Year']==2020].loc[df_papers['Publication Month'].isin(['JUL','AUG','SEP','OCT','NOV','DEC'])]])

# Network construction for Period 1

In [6]:
df_f = df.loc[df.paper_id.isin(df_former.paper_id)]

In [7]:
df_f

Unnamed: 0,paper_id,affiliation_id,topic_id,country_id
32,8,24,0,3
33,8,24,0,3
34,8,22,0,3
35,8,25,0,3
36,8,24,0,3
...,...,...,...,...
281712,94198,277,30,102
281713,94198,146,30,3
281714,94198,147,30,3
281715,94198,3664,30,102


In [8]:
nodes_f = []
nodes_f += [f't{i}' for i in df_f.topic_id.unique()]
nodes_f += [f'p{i}' for i in df_f.paper_id.unique()]
nodes_f += [f'f{i}' for i in df_f.affiliation_id.unique()]
nodes_f += [f'c{i}' for i in df_f.country_id.unique()]

node2idx_f = {node: idx for idx, node in enumerate(nodes_f)}
idx2node_f = {idx: node for node, idx in node2idx_f.items()}

In [9]:
edges = ['topic_paper', 'topic_affiliation', 'topic_country', 
         'paper_affiliation', 'paper_country', 
         'affiliation_affiliation', 'affiliation_country',
         'country_country']
edge2idx = {edge: idx for idx, edge in enumerate(edges)}
idx2edge = {idx: edge for edge, idx in edge2idx.items()}

In [10]:
df_paper_aff = df_f[['paper_id', 'affiliation_id']].drop_duplicates()
collab = [list(df_paper_aff.loc[df_paper_aff.paper_id==p_id].affiliation_id) for p_id in df_paper_aff.paper_id.unique()]

collab_dict = {}
for i, c in enumerate(collab):
    if len(c)>=2:
        for comb in list(combinations(c, 2)):
            if comb in collab_dict.keys():
                collab_dict[comb] += 1
            else:
                collab_dict[comb] = 1

In [11]:
df_paper_cntr = df_f[['paper_id', 'country_id']].drop_duplicates()
collab = [list(df_paper_cntr.loc[df_paper_cntr.paper_id==p_id].country_id) for p_id in df_paper_cntr.paper_id.unique()]

cntr_collab_dict = {}
for i, c in enumerate(collab):
    if len(c)>=2:
        for comb in list(combinations(c, 2)):
            if comb in cntr_collab_dict.keys():
                cntr_collab_dict[comb] += 1
            else:
                cntr_collab_dict[comb] = 1

In [12]:
network = []
for i, (tid, pid) in enumerate(df_f[['topic_id', 'paper_id']].drop_duplicates().values):
    network.append([node2idx_f[f't{tid}'], edge2idx['topic_paper'], node2idx_f[f'p{pid}']])
    
for i, (tid, fid) in enumerate(df_f[['topic_id', 'paper_id', 'affiliation_id']].drop_duplicates()[['topic_id', 'affiliation_id']].values):
    network.append([node2idx_f[f't{tid}'], edge2idx['topic_affiliation'], node2idx_f[f'f{fid}']])

for i, (tid, cid) in enumerate(df_f[['topic_id', 'paper_id', 'affiliation_id', 'country_id']].drop_duplicates()[['topic_id', 'country_id']].values):
    network.append([node2idx_f[f't{tid}'], edge2idx['topic_country'], node2idx_f[f'c{cid}']])
    
for i, (pid, fid) in enumerate(df_f[['paper_id', 'affiliation_id']].drop_duplicates().values):
    network.append([node2idx_f[f'p{pid}'], edge2idx['paper_affiliation'], node2idx_f[f'f{fid}']])
    
for i, (pid, cid) in enumerate(df_f[['paper_id', 'country_id']].drop_duplicates().values):
    network.append([node2idx_f[f'p{pid}'], edge2idx['paper_country'], node2idx_f[f'c{cid}']])

for k, v in collab_dict.items():
    for _ in range(v):
        network.append([node2idx_f[f'f{k[0]}'], edge2idx['affiliation_affiliation'], node2idx_f[f'f{k[1]}']])

for i, (fid, cid) in enumerate(df_f[['affiliation_id', 'country_id']].drop_duplicates().values):
    network.append([node2idx_f[f'f{fid}'], edge2idx['affiliation_country'], node2idx_f[f'c{cid}']])
    
for k, v in cntr_collab_dict.items():
    for _ in range(v):
        network.append([node2idx_f[f'c{k[0]}'], edge2idx['country_country'], node2idx_f[f'c{k[1]}']])

In [13]:
# Data for TransE
pd.DataFrame(network).to_csv('../data/network_with_edge_p1.txt', index=False, header=False, sep='\t')

In [14]:
# Data for DeepWalk and Node2Vec
network_wo_links = [[n[0], n[2]] for n in network]
pd.DataFrame(network_wo_links).to_csv('../data/network_without_edge_p1.txt', index=False, header=False, sep='\t')

# Network construction for Period 2
* The topics for documents in Period 2 should be inferred.
  * ../data/bertopic_topics_for_p2.pkl: inferred topics and probabilities for each documents in Preiod 2. the file contains two dictionaries: {paper_id: topic_id}, {paper_id: probability}

In [16]:
df_l = df.loc[df.paper_id.isin(df_latter.paper_id)]

pid2topic, pid2prob = pd.read_pickle('../data/bertopic_topics_for_p2.pkl')
df_l['topic_id'] = [pid2topic[pid] for pid in df_l.paper_id]

df_l = df_l.loc[df_l.topic_id!=-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_l['topic_id'] = [pid2topic[pid] for pid in df_l.paper_id]


In [17]:
df_l

Unnamed: 0,paper_id,affiliation_id,topic_id,country_id
0,0,0,1,9
1,0,1,1,9
2,0,2,1,19
3,1,3,57,16
4,2,4,9,16
...,...,...,...,...
281625,94183,545,5,14
281626,94183,4445,5,1
281627,94183,6038,5,14
281628,94183,62,5,77


In [18]:
nodes_l = []
nodes_l += [f't{i}' for i in df_l.topic_id.unique()]
nodes_l += [f'p{i}' for i in df_l.paper_id.unique()]
nodes_l += [f'f{i}' for i in df_l.affiliation_id.unique()]
nodes_l += [f'c{i}' for i in df_l.country_id.unique()]

node2idx_l = {node: idx for idx, node in enumerate(nodes_l)}
idx2node_l = {idx: node for node, idx in node2idx_l.items()}

In [19]:
edges = ['topic_paper', 'topic_affiliation', 'topic_country', 
         'paper_affiliation', 'paper_country', 
         'affiliation_affiliation', 'affiliation_country',
         'country_country']
edge2idx = {edge: idx for idx, edge in enumerate(edges)}
idx2edge = {idx: edge for edge, idx in edge2idx.items()}

In [20]:
df_paper_aff = df_l[['paper_id', 'affiliation_id']].drop_duplicates()
collab = [list(df_paper_aff.loc[df_paper_aff.paper_id==p_id].affiliation_id) for p_id in df_paper_aff.paper_id.unique()]

collab_dict = {}
for i, c in enumerate(collab):
    if len(c)>=2:
        for comb in list(combinations(c, 2)):
            if comb in collab_dict.keys():
                collab_dict[comb] += 1
            else:
                collab_dict[comb] = 1

In [21]:
df_paper_cntr = df_l[['paper_id', 'country_id']].drop_duplicates()
collab = [list(df_paper_cntr.loc[df_paper_cntr.paper_id==p_id].country_id) for p_id in df_paper_cntr.paper_id.unique()]

cntr_collab_dict = {}
for i, c in enumerate(collab):
    if len(c)>=2:
        for comb in list(combinations(c, 2)):
            if comb in cntr_collab_dict.keys():
                cntr_collab_dict[comb] += 1
            else:
                cntr_collab_dict[comb] = 1

In [22]:
network = []
for i, (tid, pid) in enumerate(df_l[['topic_id', 'paper_id']].drop_duplicates().values):
    network.append([node2idx_l[f't{tid}'], edge2idx['topic_paper'], node2idx_l[f'p{pid}']])
    
for i, (tid, fid) in enumerate(df_l[['topic_id', 'paper_id', 'affiliation_id']].drop_duplicates()[['topic_id', 'affiliation_id']].values):
    network.append([node2idx_l[f't{tid}'], edge2idx['topic_affiliation'], node2idx_l[f'f{fid}']])

for i, (tid, cid) in enumerate(df_l[['topic_id', 'paper_id', 'affiliation_id', 'country_id']].drop_duplicates()[['topic_id', 'country_id']].values):
    network.append([node2idx_l[f't{tid}'], edge2idx['topic_country'], node2idx_l[f'c{cid}']])
    
for i, (pid, fid) in enumerate(df_l[['paper_id', 'affiliation_id']].drop_duplicates().values):
    network.append([node2idx_l[f'p{pid}'], edge2idx['paper_affiliation'], node2idx_l[f'f{fid}']])
    
for i, (pid, cid) in enumerate(df_l[['paper_id', 'country_id']].drop_duplicates().values):
    network.append([node2idx_l[f'p{pid}'], edge2idx['paper_country'], node2idx_l[f'c{cid}']])

for k, v in collab_dict.items():
    for _ in range(v):
        network.append([node2idx_l[f'f{k[0]}'], edge2idx['affiliation_affiliation'], node2idx_l[f'f{k[1]}']])

for i, (fid, cid) in enumerate(df_l[['affiliation_id', 'country_id']].drop_duplicates().values):
    network.append([node2idx_l[f'f{fid}'], edge2idx['affiliation_country'], node2idx_l[f'c{cid}']])
    
for k, v in cntr_collab_dict.items():
    for _ in range(v):
        network.append([node2idx_l[f'c{k[0]}'], edge2idx['country_country'], node2idx_l[f'c{k[1]}']])

In [23]:
# Data for TransE
pd.DataFrame(network).to_csv('../data/network_with_edge_p2.txt', index=False, header=False, sep='\t')

In [24]:
# Data for DeepWalk and Node2Vec
network_wo_links = [[n[0], n[2]] for n in network]
pd.DataFrame(network_wo_links).to_csv('../data/network_without_edge_p2.txt', index=False, header=False, sep='\t')

# Network embedding
* For the next step, network embedding should be conducted. Followings are network embedding models and Python libraries used in the study.
  * DeepWalk (https://github.com/fengduqianhe/GraphEmbedding-master)
  * Node2Vec (https://github.com/eliorc/node2vec)
  * TransE (https://github.com/pykeen/pykeen)