In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from scipy.spatial.distance import cosine
from datetime import datetime

# Web of Science data
* Data should be collected from Web of Science Core Collection.
* The full data cannot be provided due to permission issues. Please download the data from Web of Science to match the structure of the sample data.

In [2]:
# Load the paper records from the Excel workbook (the Web of Science export
# described in the notes above) into a DataFrame.
df = pd.read_excel('../data/papers.xlsx')

In [3]:
# Turn the row index into the first column and label that column `paper_id`,
# so every paper carries an explicit integer identifier.
df = df.reset_index()
df = df.set_axis(['paper_id', *df.columns[1:]], axis=1)

In [4]:
# Preview the first rows to check that `paper_id` is now the leading column.
df.head()

Unnamed: 0,paper_id,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,...,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),Web of Science Record
0,0,J,"Pickering, B; Lombardi, F; Pfenninger, S",,,,"Pickering, Bryn; Lombardi, Francesco; Pfenning...",,,Diversity of options to eliminate fossil fuels...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,2U9AT,35784823.0,"Green Published, hybrid",,,2023-10-31,WOS:000823446500009,0
1,1,J,"Fang, SM; Li, JD; Xu, Y; Shen, C; Guo, WL",,,,"Fang, Sunmiao; Li, Jidong; Xu, Ying; Shen, Chu...",,,Evaporating potential,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,ZZ3XE,,"hybrid, Green Submitted",,,2023-10-31,WOS:000773205200014,0
2,2,J,"Chen, ZY; Song, W; Yu, KB; Ge, JF; Zhang, JS; ...",,,,"Chen, Zhenyu; Song, Wei; Yu, Kuibao; Ge, Jinfe...",,,Small-molecular donor guest achieves rigid 18....,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,UR6JW,,hybrid,,,2023-10-31,WOS:000696854300013,0
3,3,J,"Zai, HC; Su, J; Zhu, C; Chen, YH; Ma, Y; Zhang...",,,,"Zai, Huachao; Su, Jie; Zhu, Cheng; Chen, Yihua...",,,Sandwiched electrode buffer for efficient and ...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,UC5CP,,hybrid,,,2023-10-31,WOS:000686543600018,0
4,4,J,"Zhang, CG; He, LX; Zhou, LL; Yang, O; Yuan, W;...",,,,"Zhang, Chuguo; He, Lixia; Zhou, Linglin; Yang,...",,,Active resonance triboelectric nanogenerator f...,...,Science Citation Index Expanded (SCI-EXPANDED),Chemistry; Energy & Fuels; Materials Science,SU3GI,,hybrid,Y,N,2023-10-31,WOS:000663028600023,0


In [5]:
# List every column label available in the loaded records.
df.columns

Index(['paper_id', 'Publication Type', 'Authors', 'Book Authors',
       'Book Editors', 'Book Group Authors', 'Author Full Names',
       'Book Author Full Names', 'Group Authors', 'Article Title',
       'Source Title', 'Book Series Title', 'Book Series Subtitle', 'Language',
       'Document Type', 'Conference Title', 'Conference Date',
       'Conference Location', 'Conference Sponsor', 'Conference Host',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses',
       'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date',
       'Publication Year', 'Vo

In [6]:
# Persist the frame (including the new `paper_id` column) for later steps.
with open('../data/papers_with_pid.pkl', 'wb') as pkl_file:
    pickle.dump(df, pkl_file)

# Organization information

In [7]:
# Show each paper's id next to its raw `Affiliations` string.
df[['paper_id', 'Affiliations']]

Unnamed: 0,paper_id,Affiliations
0,0,Swiss Federal Institutes of Technology Domain;...
1,1,Nanjing University of Aeronautics & Astronautics
2,2,Chinese Academy of Sciences; Ningbo Institute ...
3,3,Beijing Institute of Technology; Peking Univer...
4,4,Chinese Academy of Sciences; Beijing Institute...
...,...,...
94194,94194,University System of Ohio; University of Toled...
94195,94195,UDICE-French Research Universities; Universite...
94196,94196,Ulsan National Institute of Science & Technolo...
94197,94197,Chinese Academy of Sciences; Dalian Institute ...


In [8]:
# Flatten the '; '-separated affiliation strings into one lower-cased list,
# skipping cells that are not plain strings (e.g. missing values).
affiliations = [
    org_name.lower()
    for cell in df.Affiliations.values
    if type(cell) is str
    for org_name in cell.split('; ')
]

# Count occurrences and rank affiliations by frequency. After sorting,
# reset_index() exposes each affiliation's first-seen position as the leading
# column, which is then labelled `affiliation_id`.
aff_cnt = Counter(affiliations)
df_affiliation = (
    pd.DataFrame(list(aff_cnt.items()), columns=['affiliation', 'count'])
    .sort_values(by='count', ascending=False)
    .reset_index()
)
df_affiliation = df_affiliation.set_axis(
    ['affiliation_id', *df_affiliation.columns[1:]], axis=1
)

In [9]:
# Preview the most frequent affiliations together with their ids and counts.
df_affiliation.head()

Unnamed: 0,affiliation_id,affiliation,count
0,4,chinese academy of sciences,10918
1,22,united states department of energy (doe),3763
2,140,tsinghua university,2879
3,6,"university of chinese academy of sciences, cas",2454
4,172,xi'an jiaotong university,1894


In [10]:
# Two-way lookup between affiliation names and their ids.
id_name_pairs = df_affiliation[['affiliation_id', 'affiliation']].values
aff2idx = {aff_name: aff_id for aff_id, aff_name in id_name_pairs}
idx2aff = {aff_id: aff_name for aff_id, aff_name in id_name_pairs}

In [11]:
# Save both affiliation lookups, one pickle file per direction.
for pkl_path, lookup in (
    ('../data/affiliation2aid.pkl', aff2idx),
    ('../data/aid2affiliation.pkl', idx2aff),
):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(lookup, pkl_file)

# Paper-organization links

In [12]:
# Keep papers that have both an abstract and affiliations.
# NOTE(review): the value 1016 in `Abstract` is excluded as well; it looks like
# a placeholder/artifact in the export -- confirm what it stands for.
has_abstract = df.Abstract.notnull()
has_affiliations = df.Affiliations.notnull()
not_placeholder = df.Abstract != 1016
df_target = df.loc[has_abstract & has_affiliations & not_placeholder]

# One [paper_id, affiliation_id] row per affiliation listed on each paper.
paper_affiliation = [
    [paper, aff2idx[org_name.lower()]]
    for paper, aff_cell in df_target[['paper_id', 'Affiliations']].values
    for org_name in aff_cell.split('; ')
]
df_pap_aff = pd.DataFrame(paper_affiliation, columns=['paper_id', 'affiliation_id'])

# Documents for BERTopic

In [13]:
# Show id, title and abstract for the papers that appear in the
# paper-affiliation links (df_pap_aff).
df.loc[df.paper_id.isin(df_pap_aff.paper_id)][['paper_id', 'Article Title', 'Abstract']]

Unnamed: 0,paper_id,Article Title,Abstract
0,0,Diversity of options to eliminate fossil fuels...,Disagreements persist on how to design a self-...
1,1,Evaporating potential,The conversion of ambient heat into electricit...
2,2,Small-molecular donor guest achieves rigid 18....,Incorporation of crystalline small molecule in...
3,3,Sandwiched electrode buffer for efficient and ...,With the rapid progress of perovskite solar ce...
4,4,Active resonance triboelectric nanogenerator f...,"Water-wave energy, as one of the important ren..."
...,...,...,...
94194,94194,Efficient two-terminal all-perovskite tandem s...,Multi-junction all-perovskite tandem solar cel...
94195,94195,A robust large-pore zirconium carboxylate meta...,The discovery of more-efficient and stable wat...
94196,94196,Multicomponent electrocatalyst with ultralow P...,Platinum is the most effective electrocatalyst...
94197,94197,Charge separation via asymmetric illumination ...,Solar-driven photocatalytic reactions provide ...


In [14]:
# The first whitespace-separated token of `Publication Date` is used as the
# month label (compared against 'JAN' ... 'DEC' below).
df['Publication Month'] = [pub_date.split()[0] for pub_date in df['Publication Date']]

published_2020 = df['Publication Year'] == 2020
in_first_half = df['Publication Month'].isin(['JAN','FEB','MAR','APR','MAY','JUN'])
in_second_half = df['Publication Month'].isin(['JUL','AUG','SEP','OCT','NOV','DEC'])

# Former period: 2018, 2019 and January-June 2020.
df_former = pd.concat([
    df.loc[df['Publication Year'].isin((2018, 2019))],
    df.loc[published_2020 & in_first_half],
])

# Latter period: 2021, 2022 and July-December 2020.
df_latter = pd.concat([
    df.loc[df['Publication Year'].isin((2021, 2022))],
    df.loc[published_2020 & in_second_half],
])


In [15]:
# Former-period papers that have at least one affiliation link, in frame order.
former_docs = df_former.loc[
    df_former.paper_id.isin(df_pap_aff.paper_id),
    ['paper_id', 'Article Title', 'Abstract'],
].values

# Document text is "<title>; <abstract>", keyed by a running document id.
index2document = {
    did: title + '; ' + abstract
    for did, (_, title, abstract) in enumerate(former_docs)
}
# Translation tables between document ids and paper ids.
did2pid = {did: row[0] for did, row in enumerate(former_docs)}
pid2did = {pid: did for did, pid in did2pid.items()}

# Topic-paper-organization link
* Before running the following code, run BERTopic with EnergyBERT.
  * BERTopic (https://maartengr.github.io/BERTopic/index.html)
  * EnergyBERT (https://huggingface.co/UNSW-MasterAI/EnergyBERT)
* The per-document topic information and the document embeddings are required in this step.
  * ../data/bertopic_get_document_info.pkl: information related to identified topics for documents (BERTopic.get_document_info)
  * ../data/bertopic_document_embeddings.pkl: document embeddings by EnergyBERT

In [16]:
# Assign every document to a topic: compute one centre per topic, then move
# outlier documents (Topic == -1) to the topic whose centre is most similar.
# `bert_model` is not referenced inside the loop; the single-element list is
# kept so the cell layout (and the alternatives in the comment) stays the same.
for bert_model in ['energybert']:  # ['matbert', 'energybert', 'matscibert', 'scibert']

    # get_document_info
    # The frame's index is treated as the document id and translated back to
    # paper_id through did2pid; reset_index() then exposes it as `doc_id`.
    # NOTE: pickle files can execute code on load -- only read files you produced.
    get_document_info = pd.read_pickle('../data/bertopic_get_document_info.pkl')
    get_document_info['paper_id'] = [did2pid[idx] for idx in get_document_info.index]
    get_document_info = get_document_info.reset_index()
    get_document_info.columns = ['doc_id'] + list(get_document_info.columns[1:])

    # read document embedding file (indexed by doc_id along the first axis)
    embeddings = pd.read_pickle('../data/bertopic_document_embeddings.pkl')

    # calculate center of each cluster:
    # probability-weighted mean of the embeddings of the topic's documents
    cluster_centers = []
    for topic_num in range(max(get_document_info.Topic)+1):
        tmp_document_info = get_document_info.loc[get_document_info.Topic==topic_num]

        tmp_embedding = np.zeros(embeddings.shape[1])
        for doc_id, probability in zip(tmp_document_info.doc_id, tmp_document_info.Probability):
            # Accumulate the weighted embeddings; overwriting here would leave
            # only the last document of the topic in the centre.
            tmp_embedding = tmp_embedding + embeddings[doc_id]*probability
        cluster_centers.append(tmp_embedding/tmp_document_info.Probability.sum())

    # assign outliers to the closest cluster
    outlier_doc_ids = get_document_info.loc[get_document_info.Topic==-1].doc_id.values
    similar_indices = {}
    for doc_id in outlier_doc_ids:
        source_vector = embeddings[doc_id]
        similarities = np.array([1 - cosine(source_vector, target_vector) for target_vector in cluster_centers])
        # A centre with zero total probability is NaN; np.argmax would pick a
        # NaN entry, so rank undefined similarities last instead.
        similarities[np.isnan(similarities)] = -np.inf

        most_similar_index = np.argmax(similarities)
        similar_indices[doc_id] = most_similar_index

    # save cluster information for each outlier
    # `document_info` is the same object as `get_document_info` (not a copy),
    # so the Topic column is updated in place.
    document_info = get_document_info
    for key, value in similar_indices.items():
        mask = document_info.doc_id==key
        if mask.any():
            document_info.loc[mask, 'Topic'] = value

In [17]:
# paper_id -> topic id, taken from the per-document topic table.
pid2tid = {}
for topic, paper in document_info[['Topic', 'paper_id']].values:
    pid2tid[paper] = topic

# Papers without a topic entry get the sentinel -1.
df_pap_aff['topic_id'] = [pid2tid.get(paper, -1) for paper in df_pap_aff.paper_id]

# Disambiguate national information of organizations
* GRID data should be downloaded first (https://www.grid.ac/)

In [18]:
# Load the GRID organisation table and add a lower-cased `name` column so it
# can be compared with the lower-cased affiliation names.
grid = pd.read_csv('../data/grid.csv')
lowercase_names = [org_name.lower() for org_name in grid['Name']]
grid['name'] = lowercase_names
grid

Unnamed: 0,ID,Name,City,State,Country,name
0,grid.1001.0,Australian National University,Canberra,Australian Capital Territory,Australia,australian national university
1,grid.1002.3,Monash University,Melbourne,Victoria,Australia,monash university
2,grid.1003.2,University of Queensland,Brisbane,Queensland,Australia,university of queensland
3,grid.1004.5,Macquarie University,Sydney,New South Wales,Australia,macquarie university
4,grid.1005.4,UNSW Sydney,Sydney,New South Wales,Australia,unsw sydney
...,...,...,...,...,...,...
102387,grid.512910.e,GGD Amsterdam,Amsterdam,,Netherlands,ggd amsterdam
102388,grid.512911.f,International Center of Tropical Agriculture,Hanoi,,Vietnam,international center of tropical agriculture
102389,grid.512912.c,International Institute of Tropical Agriculture,Kano,,Nigeria,international institute of tropical agriculture
102390,grid.512913.d,IPS Central,Asunción,,Paraguay,ips central


In [19]:
# Attach the organisation type to every GRID row. The left join keeps all
# GRID rows; an ID listed with several types yields one row per type.
grid_type = pd.read_csv('../data/grid_types.csv').set_axis(['ID', 'type'], axis=1)
grid = pd.merge(grid, grid_type, how='left', on='ID')
grid

Unnamed: 0,ID,Name,City,State,Country,name,type
0,grid.1001.0,Australian National University,Canberra,Australian Capital Territory,Australia,australian national university,Education
1,grid.1002.3,Monash University,Melbourne,Victoria,Australia,monash university,Education
2,grid.1003.2,University of Queensland,Brisbane,Queensland,Australia,university of queensland,Education
3,grid.1004.5,Macquarie University,Sydney,New South Wales,Australia,macquarie university,Education
4,grid.1005.4,UNSW Sydney,Sydney,New South Wales,Australia,unsw sydney,Education
...,...,...,...,...,...,...,...
102388,grid.512910.e,GGD Amsterdam,Amsterdam,,Netherlands,ggd amsterdam,Healthcare
102389,grid.512911.f,International Center of Tropical Agriculture,Hanoi,,Vietnam,international center of tropical agriculture,Nonprofit
102390,grid.512912.c,International Institute of Tropical Agriculture,Kano,,Nigeria,international institute of tropical agriculture,Nonprofit
102391,grid.512913.d,IPS Central,Asunción,,Paraguay,ips central,Healthcare


In [20]:
# Placeholder id column; every row starts at -1.
grid['fid'] = -1

# Lower-cased GRID name -> country (a later row wins when names repeat).
grid_name2cntr = dict(grid[['name', 'Country']].values)

# Number the distinct countries in order of first appearance, both directions.
countries = grid.Country.unique()
cntr2idx = dict(zip(countries, range(len(countries))))
idx2cntr = {idx: country for country, idx in cntr2idx.items()}

In [21]:
def get_jaccard_sim(str1, str2, n=2):
    """Return the Jaccard similarity of two strings' character n-gram sets.

    Each string is reduced to the set of its overlapping runs of ``n``
    consecutive characters; the result is |intersection| / |union| of the
    two sets.

    Args:
        str1: First string.
        str2: Second string.
        n: Length of the character n-grams. Defaults to 2 (bigrams).

    Returns:
        A float in [0, 1], or 0 when neither string yields an n-gram
        (both shorter than ``n``).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')
    # zip over the n shifted copies stops at the shortest one, producing every
    # run of n consecutive characters as a tuple.
    grams1 = set(zip(*(str1[i:] for i in range(n))))
    grams2 = set(zip(*(str2[i:] for i in range(n))))
    shared = len(grams1 & grams2)
    total = len(grams1) + len(grams2) - shared
    return shared / total if total != 0 else 0
    
# Link every affiliation to a GRID record: exact match on the lower-cased name
# first, otherwise the GRID name with the highest bigram Jaccard similarity.
aff2type = {}  # affiliation_id -> Series of `type` values of the matched GRID row(s)
aff2grid = {}  # affiliation_id -> first matched GRID row (array), None if no match
aid2cid = {}   # affiliation_id -> country index from cntr2idx, -1 if no match
print(f'start: {datetime.now()}')
for i, (aff, fid) in enumerate(aff2idx.items()):
    matched = grid.loc[grid.name==aff]
    if len(matched)==0:
        # max() keeps the first of several equally similar names, which is the
        # same candidate a stable descending sort would put first.
        best_name, best_sim = max(
            ((n, get_jaccard_sim(aff, n)) for n in grid.name),
            key=lambda pair: pair[1],
            default=(None, 0),
        )
        if best_sim > 0:
            matched = grid.loc[grid.name==best_name]
    # Empty Series when nothing matched.
    aff2type[fid] = matched['type']
    if len(matched)!=0:
        aff2grid[fid] = matched.values[0]
        aid2cid[fid] = cntr2idx[matched.Country.values[0]]
    else:
        # No GRID name shares a single bigram with this affiliation: record an
        # explicit "unknown" instead of indexing into an empty selection.
        aff2grid[fid] = None
        aid2cid[fid] = -1
    if (i+1)%100==0:
        print(f'{i+1}/{len(aff2idx)}: {datetime.now()}')
print(f'end: {datetime.now()}')

start: 2024-09-02 08:55:03.461199
100/6104: 2024-09-02 08:55:28.908342
200/6104: 2024-09-02 08:55:51.645413
300/6104: 2024-09-02 08:56:17.745718
400/6104: 2024-09-02 08:56:45.713140
500/6104: 2024-09-02 08:57:11.663521
600/6104: 2024-09-02 08:57:37.125875
700/6104: 2024-09-02 08:58:00.819666
800/6104: 2024-09-02 08:58:28.150528
900/6104: 2024-09-02 08:58:54.377307
1000/6104: 2024-09-02 08:59:25.042709
1100/6104: 2024-09-02 08:59:55.438045
1200/6104: 2024-09-02 09:00:26.532298
1300/6104: 2024-09-02 09:00:57.594434
1400/6104: 2024-09-02 09:01:25.900416
1500/6104: 2024-09-02 09:02:01.517478
1600/6104: 2024-09-02 09:02:32.060095
1700/6104: 2024-09-02 09:02:59.999378
1800/6104: 2024-09-02 09:03:27.290965
1900/6104: 2024-09-02 09:03:57.922447
2000/6104: 2024-09-02 09:04:32.076400
2100/6104: 2024-09-02 09:05:06.410779
2200/6104: 2024-09-02 09:05:43.158963
2300/6104: 2024-09-02 09:06:12.863114
2400/6104: 2024-09-02 09:06:46.423506
2500/6104: 2024-09-02 09:07:19.210976
2600/6104: 2024-09-02 09:

In [22]:
# Store both affiliation-to-GRID lookups together as a two-element list.
grid_lookups = [aff2type, aff2grid]
with open('../data/aff2grid.pkl', 'wb') as pkl_file:
    pickle.dump(grid_lookups, pkl_file)

In [24]:
# Look up the country index of every linked affiliation.
country_ids = [aid2cid[aff_id] for aff_id in df_pap_aff['affiliation_id']]
df_pap_aff['country_id'] = country_ids
df_pap_aff.head()

Unnamed: 0,paper_id,affiliation_id,topic_id,country_id
0,0,0,-1,9
1,0,1,-1,9
2,0,2,-1,19
3,1,3,-1,16
4,2,4,-1,16


In [25]:
# Write the paper-affiliation table (with the topic_id and country_id columns
# added above) to CSV, without the row index.
df_pap_aff.to_csv('../data/preprocessed_data.csv', index=False)