In [1]:
from tqdm import tqdm
import pickle
import json
from collections import Counter, defaultdict
import pandas as pd
import csv

import sys
sys.path.append('v16')

from constants import *
from constants_id import *
from indexing import Indexer, IndexType, BasicInvertedIndex
from document_preprocessor import RegexTokenizer

In [2]:
with open(DOCID_TO_ID_PATH, 'rb') as f:
    docid_to_id = pickle.load(f)

In [2]:
document_preprocessor = RegexTokenizer('\\w+')
stopwords = set()
with open(STOPWORD_PATH, "r") as f:
    for word in f:
        stopwords.add(word.strip())
title_index = Indexer.create_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     document_preprocessor, stopwords, docid_to_id, minimum_word_frequency=1, text_key="title")
title_index.save(PAPER_TITLE_INDEX)

abstract_index = Indexer.create_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     document_preprocessor, stopwords, docid_to_id, minimum_word_frequency=50, text_key="abstract")
abstract_index.save(PAPER_ABSTRACT_INDEX)

6404472it [11:16, 9468.25it/s] 


In [4]:
docid_list = []
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
        doc = json.loads(line)
        if doc['abstract'] == '' or doc['n_citation'] <=20:
            continue
        
        docid_list.append(doc['id'])

with open(DOCID_LIST_PATH, 'wb') as f:
    pickle.dump(docid_list, f, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 6404472/6404472 [01:46<00:00, 60138.20it/s]


In [2]:
with open(DOCID_LIST_PATH, 'rb') as f:
    docid_list = pickle.load(f)
    
docid_to_id = {}
for i, docid in enumerate(tqdm(docid_list, total=TOTAL_PAPER_COUNT)):
    docid_to_id[docid] = i
    
with open(DOCID_TO_ID_PATH, 'wb') as f:
    pickle.dump(docid_to_id, f, protocol=pickle.HIGHEST_PROTOCOL)

 88%|████████▊ | 5609482/6404472 [00:03<00:00, 1636722.50it/s]


In [11]:
doc_all_dict = dict()
doc_all_dict['docid'] = []
doc_all_dict['title'] = []
doc_all_dict['abstract'] = []
doc_all_dict['year'] = []
doc_all_dict['author'] = []
doc_all_dict['authorid'] = []
doc_all_dict['org'] = []

with open(DOCID_TO_ID_PATH, 'rb') as f:
    docid_to_id = pickle.load(f)

with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
        doc = json.loads(line)
        if doc['abstract'] == '':
            continue
        
        doc_all_dict['docid'].append(docid_to_id[doc['id']])
        doc_all_dict['title'].append(doc['title'])
        doc_all_dict['abstract'].append(doc['abstract'])
        doc_all_dict['year'].append(doc['year'])
        if len(doc['authors']) == 0:
            doc_all_dict['authorid'].append('')
            doc_all_dict['author'].append('')
            doc_all_dict['org'].append('')
        else:
            for i, a in enumerate(doc['authors']):
                if i == 0:
                    doc_all_dict['authorid'].append(doc['authors'][i]['id'])
                    doc_all_dict['author'].append(doc['authors'][i]['name'])
                    doc_all_dict['org'].append(doc['authors'][i]['org'])
                else:
                    doc_all_dict['docid'].append(docid_to_id[doc['id']])
                    doc_all_dict['title'].append(doc['title'])
                    doc_all_dict['abstract'].append(doc['abstract'])
                    doc_all_dict['year'].append(doc['year'])
                    
                    doc_all_dict['authorid'].append(doc['authors'][i]['id'])
                    doc_all_dict['author'].append(doc['authors'][i]['name'])
                    doc_all_dict['org'].append(doc['authors'][i]['org'])

doc_all_df = pd.DataFrame(doc_all_dict)
doc_all_df.to_csv(f"{SCRACTCH_PATH}/doc_DBLP.csv")

100%|██████████| 6404472/6404472 [02:15<00:00, 47118.61it/s]


In [9]:
doc

{'id': '5390877920f70186a0d2cb7c',
 'title': 'A note on denial-of-service in operating systems',
 'abstract': 'A simple and general definition of denial-of-service in operating systems is presented. It is argued that no current protection mechanism nor model resolves this problem in any demonstrable way. The notion of interuser dependency is introduced and identified as the common cause for all problem instances. Decomposition of operating systems into hierarchies of services is assumed for the discovery of denial-of-service instances.',
 'keywords': ['interuser dependency',
  'general definition',
  'denial-of-service instance',
  'common cause',
  'problem instance',
  'current protection mechanism',
  'information retrieval',
  'operating systems',
  'denial of service',
  'formal verification',
  'control systems',
  'probability density function',
  'authorization',
  'hardware',
  'operating system',
  'data mining'],
 'year': 1984,
 'authors': [{'id': '53f42ed4dabfaedd74d498b4',

In [2]:
with open(DOCID_TO_ID_PATH, 'rb') as f:
    docid_to_id = pickle.load(f)

In [8]:
# print("Collect doc categories")
# docid_to_categories = {}
# docid_list = []
# with open(PAPER_DATA_PATH, 'r') as f:
#     for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
#         doc = json.loads(line)
#         if doc['abstract'] == '':
#             continue
            
#         docid_list.append(doc['id'])
#         cat_list = []
#         if 'keywords' in doc:
#             for j in doc['keywords']:
#                 cat_list.append(j)
#         docid_to_categories[doc['id']] = cat_list

# category_counts = Counter()
# for cats in tqdm(docid_to_categories.values(), total=len(docid_to_categories)):
#     for c in cats:
#         category_counts[c] += 1
# recognized_categories = set(
#     [cat for cat, count in category_counts.items() if count >= CATEGORIES_COUNT_CUTOFF])
    
# print("Create doc_category_info")
# doc_category_info = {}
# for docid, cats in tqdm(docid_to_categories.items(), total=len(docid_to_categories)):
#     valid_cats = [c for c in cats if c in recognized_categories]
#     doc_category_info[docid] = valid_cats
# print("--- Done ---")

# with open(DOC_CATEGORY_INFO_PATH, 'wb') as f:
#     pickle.dump(doc_category_info, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(RECOG_CATEGORY_PATH, 'wb') as f:
#     pickle.dump(recognized_categories, f, protocol=pickle.HIGHEST_PROTOCOL)
    

####################################################################################
    
with open(DOC_CATEGORY_INFO_PATH, 'rb') as f:
    doc_category_info = pickle.load(f)
id_category_info = dict()
for docid in tqdm(doc_category_info, total=TOTAL_PAPER_COUNT):
    id_category_info[docid_to_id[docid]] = doc_category_info[docid]
with open(ID_CATEGORY_INFO_PATH, 'wb') as f:
    pickle.dump(id_category_info, f, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# print("Collect year release")
# docid_to_yr = {}
# with open(PAPER_DATA_PATH, 'r') as f:
#     for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
#         doc = json.loads(line)
#         if doc['abstract'] == '':
#             continue
            
#         docid_to_yr[doc['id']] = doc['year']
# with open(DOCID_TO_YEAR_RELEASE_PATH, 'wb') as f:
#     pickle.dump(docid_to_yr, f, protocol=pickle.HIGHEST_PROTOCOL)
    
    
####################################################################################
    
with open(DOCID_TO_YEAR_RELEASE_PATH, 'rb') as f:
    docid_to_yr = pickle.load(f)
id_to_yr = dict()
for docid in tqdm(doc_category_info, total=TOTAL_PAPER_COUNT):
    id_to_yr[docid_to_id[docid]] = docid_to_yr[docid]
with open(ID_TO_YEAR_RELEASE_PATH, 'wb') as f:
    pickle.dump(id_to_yr, f, protocol=pickle.HIGHEST_PROTOCOL)

 88%|████████▊ | 5609482/6404472 [00:04<00:00, 1127097.28it/s]


In [14]:
# print("Collect citation")
# docid_to_citation = {}
# with open(PAPER_DATA_PATH, 'r') as f:
#     for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
#         doc = json.loads(line)
#         if doc['abstract'] == '':
#             continue
#         if 'n_citation' in doc:
#             docid_to_citation[doc['id']] = doc['n_citation']
# with open(DOCID_TO_CITATION_PATH, 'wb') as f:
#     pickle.dump(docid_to_citation, f, protocol=pickle.HIGHEST_PROTOCOL)
    
    
####################################################################################
    
with open(DOCID_TO_CITATION_PATH, 'rb') as f:
    docid_to_citation = pickle.load(f)
id_to_citation = dict()
for docid in tqdm(docid_to_citation, total=TOTAL_PAPER_COUNT):
    id_to_citation[docid_to_id[docid]] = docid_to_citation[docid]
with open(ID_TO_CITATION_PATH, 'wb') as f:
    pickle.dump(id_to_citation, f, protocol=pickle.HIGHEST_PROTOCOL)

 88%|████████▊ | 5609482/6404472 [00:04<00:00, 1328341.34it/s]


In [22]:
# print("Collect authors data")

# """
# author_collection = {
#     author_id : {
#         'name' : author_name,
#         'org' : author_org,
#         'docid' : {
#             doc_id : author_order
#         }
#     }
# }
# """

# docid_to_authorid = {}
# authorid_to_author_name = {}
# author_collection = {}
# with open(PAPER_DATA_PATH, 'r') as f:
#     for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
#         doc = json.loads(line)
#         if doc['abstract'] == '':
#             continue
        
#         if 'authors' not in doc:
#             continue
            
#         docid_to_authorid[doc['id']] = [author['id'] for author in doc['authors']]
        
#         for i, author in enumerate(doc['authors']):

#             author_collection[author['id']] = {}
#             author_collection[author['id']]['name'] = author['name']
            
#             if 'org' not in author_collection[author['id']]:
#                 author_collection[author['id']]['org'] = set()
#             if 'org' in author:
#                 author_collection[author['id']]['org'].add(author['org'])
            
#             if 'docid' not in author_collection[author['id']]:
#                 author_collection[author['id']]['docid'] = {}
#             author_collection[author['id']]['docid'][doc['id']] =  i
            
            
#             if author['id'] not in authorid_to_author_name:
#                 authorid_to_author_name[author['id']] = author['name']

# with open(DOCID_TO_AUTHORID_PATH, 'wb') as f:
#     pickle.dump(docid_to_authorid, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(AUTHOR_COLLECTION_PATH, 'wb') as f:
#     pickle.dump(author_collection, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(AUTHORID_TO_AUTHOR_NAME_PATH, 'wb') as f:
#     pickle.dump(authorid_to_author_name, f, protocol=pickle.HIGHEST_PROTOCOL)
    
    
####################################################################################
    
with open(DOCID_TO_AUTHORID_PATH, 'rb') as f:
    docid_to_authorid = pickle.load(f)
id_to_authorid = dict()
for docid in tqdm(docid_to_authorid, total=TOTAL_PAPER_COUNT):
    id_to_authorid[docid_to_id[docid]] = docid_to_authorid[docid]
with open(ID_TO_AUTHORID_PATH, 'wb') as f:
    pickle.dump(id_to_authorid, f, protocol=pickle.HIGHEST_PROTOCOL)

Collect authors data


100%|██████████| 6404472/6404472 [08:03<00:00, 13254.32it/s] 


In [None]:
print("Collect network stats info")
with open(PAPER_NETWORK_METRICS_PATH, 'rb') as f:
    network_stat_dict = pickle.load(f)
    

with open(DOCID_LIST_PATH, 'rb') as f:
    docid_list = pickle.load(f)
docid_set = set(docid_list)

docid_to_network_features = defaultdict(dict)
id_to_network_features = defaultdict(dict)
for i, docid in enumerate(tqdm(network_stat_dict['node_names'])):
    if docid not in docid_set:
        continue
        
    for score in ['pgr_scores', 'hub_scores', 'auth_scores', 'pca_scores']:
    # for score in ['pgr_scores', 'hub_scores', 'auth_scores', 'paris_hierarchy', 'pca_scores']:
    # for score in ['pgr_scores', 'hub_scores', 'auth_scores', 'paris_hierarchy', 'louvain_membership', 'pca_scores']:
        # if score == 'louvain_membership':
        #     docid_to_network_features[docid][score] = network_stat_dict['louvain_membership'][i].toarray().tolist()[0]
        # if score in ['paris_hierarchy', 'pca_scores']:
        #     docid_to_network_features[docid][score] = network_stat_dict[score][i].tolist()
        if score in ['pca_scores']:
            docid_to_network_features[docid][score] = network_stat_dict[score][i].tolist()
            id_to_network_features[docid_to_id[docid]][score] = network_stat_dict[score][i].tolist()
        else:
            docid_to_network_features[docid][score] = network_stat_dict[score][i]
            id_to_network_features[docid_to_id[docid]][score] = network_stat_dict[score][i]
            
with open(DOCID_TO_NETWORK_FEATURES_PATH, 'wb') as f:
    pickle.dump(docid_to_network_features, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(ID_TO_NETWORK_FEATURES_PATH, 'wb') as f:
    pickle.dump(id_to_network_features, f, protocol=pickle.HIGHEST_PROTOCOL)
    
    
####################################################################################
    
# with open(DOCID_TO_NETWORK_FEATURES_PATH, 'rb') as f:
#     docid_to_network_features = pickle.load(f)
# id_to_network_features = dict()
# for docid in tqdm(docid_to_network_features, total=TOTAL_PAPER_COUNT):
#     id_to_network_features[docid_to_id[docid]] = docid_to_network_features[docid]
# with open(ID_TO_NETWORK_FEATURES_PATH, 'wb') as f:
#     pickle.dump(id_to_network_features, f, protocol=pickle.HIGHEST_PROTOCOL)

# Author

In [10]:
# author_metric_df = pd.read_csv("dataset/author_metrics.csv")
# authorid_list = set(author_metric_df.dropna()[author_metric_df['total_citations']>100][author_metric_df['h_index']>10]['author_id'].to_list())

# with open(AUTHORID_LIST_PATH, 'wb') as f:
#     pickle.dump(authorid_list, f, protocol=pickle.HIGHEST_PROTOCOL)

  authorid_list = set(author_metric_df.dropna()[author_metric_df['total_citations']>100][author_metric_df['h_index']>10]['author_id'].to_list())


In [2]:
with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = list(pickle.load(f))
with open(DOCID_LIST_PATH, 'rb') as f:
    docid_list = pickle.load(f)

In [3]:
len(docid_list)

1720036

In [2]:
with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = list(pickle.load(f))

authorid_to_id = {}
for i, authorid in enumerate(tqdm(authorid_list)):
    authorid_to_id[authorid] = i
    
with open(AUTHORID_TO_ID_PATH, 'wb') as f:
    pickle.dump(authorid_to_id, f, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 137773/137773 [00:00<00:00, 1988191.33it/s]


In [4]:
AUTHOR_ON_TITLE_INDEX

'/gpfs/accounts/stats_dept_root/stats_dept1/nawatsw/si699/author_on_title_index'

In [2]:
with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = pickle.load(f)
    
with open(AUTHORID_TO_ID_PATH, 'rb') as f:
    authorid_to_id = pickle.load(f)
    
document_preprocessor = RegexTokenizer('\\w+')
stopwords = set()
with open(STOPWORD_PATH, "r") as f:
    for word in f:
        stopwords.add(word.strip())
        
author_index = Indexer.create_author_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     document_preprocessor, stopwords, authorid_to_id, authorid_list, minimum_word_frequency=50, text_key="abstract")
author_index.save(AUTHOR_INDEX)

author_on_title_index = Indexer.create_author_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                                                    document_preprocessor, stopwords, authorid_to_id, authorid_list, minimum_word_frequency=1, text_key="title")
author_on_title_index.save(AUTHOR_ON_TITLE_INDEX)

6404472it [05:48, 18376.61it/s]
100%|██████████| 137725/137725 [36:46<00:00, 62.41it/s]  
6404472it [02:22, 45039.25it/s]
100%|██████████| 137725/137725 [03:02<00:00, 753.78it/s] 


In [3]:
with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = pickle.load(f)
    
with open(AUTHORID_TO_ID_PATH, 'rb') as f:
    authorid_to_id = pickle.load(f)
    
author_pr_df = pd.read_csv("dataset/author_pagerank.csv")
author_pr_df = author_pr_df[author_pr_df['author_id'].isin(authorid_list)]
author_metric_df = pd.read_csv("dataset/author_metrics.csv")
author_metric_df = author_metric_df[author_metric_df['author_id'].isin(authorid_list)]
author_ft_df = pd.merge(author_pr_df, author_metric_df, on='author_id') 
           
author_ft_df['author_id'] = author_ft_df['author_id'].apply(lambda x: authorid_to_id[x])

authorid_to_features = {}
ft_cols = ['pagerank', 'h_index', 'total_citations']
for i in range(author_ft_df.shape[0]):
    author_id = author_ft_df.loc[i, 'author_id']
    authorid_to_features[author_id] = {}
    for col in ft_cols:
        authorid_to_features[author_id][col] = author_ft_df.loc[i, col]

with open(AUTHORID_TO_FEATURES_PATH, 'wb') as f:
    pickle.dump(authorid_to_features, f, protocol=pickle.HIGHEST_PROTOCOL)
                        
# with open(AUTHORID_TO_FEATURES_PATH, 'rb') as f:
#     authorid_to_features = pickle.load(f)

In [11]:
CSV_FILE = f'{SCRACTCH_PATH}/paper_author_org/author_level1.csv'
data = []
author_citations = defaultdict(int)
author_dict = defaultdict(set)

with open(PAPER_DATA_PATH, 'r') as file:
    # Iterate through each line in the file
    for line in tqdm(file):
        # Parse JSON object from the line
        doc = json.loads(line.strip())
        # Extract total citations for the document
        n_citation = doc.get('n_citation', 0)
        for author in doc.get('authors', []):
            authorid = author.get('id', 'Unknown')
            author_name = author.get('name', 'Unknown')
            org = author.get('org', 'Unknown')
            # Accumulate citations for the author
            author_citations[authorid] += n_citation
            author_dict[authorid].add((author_name, org))
            
csv_columns = ['authorid', 'author', 'n_citations', 'org']
with open(CSV_FILE, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for authorid, citations in author_citations.items():
        for (author_name, org) in author_dict[authorid]:
            if authorid == 'Unknown' or author_name == 'Unknown' or len(authorid) == 0:
                continue
            writer.writerow({'authorid': authorid, 'author': author_name, 'n_citations': citations, 'org': org})

In [12]:
all_author_df1 =  pd.read_csv(f'{SCRACTCH_PATH}/paper_author_org/author_level1.csv')
all_author_df1

Unnamed: 0,authorid,author,n_citations,org
0,53f42ed4dabfaedd74d498b4,Virgil D. Gligor,7378,"Carnegie Mellon Univ, CyLab, Pittsburgh, PA 15..."
1,53f42ed4dabfaedd74d498b4,Virgil D. Gligor,7378,carnegie mellon university
2,53f42ed4dabfaedd74d498b4,Virgil D. Gligor,7378,"Carnegie Mellon University, Pittsburgh, PA, USA"
3,53f42ed4dabfaedd74d498b4,Virgil Gligor,7378,"Dept. of Electrical and Computer Engineering, ..."
4,53f42ed4dabfaedd74d498b4,Virgil D. Gligor,7378,"Carnegie Mellon University, Pittsbrgh, PA, APO AA"
...,...,...,...,...
12620012,6557189e145e064b7afd4e42,Aharon Gomez,0,
12620013,53f44f3edabfaedd74e11b18,Tobias J. Erb,0,
12620014,63afb72684ab04bd7fbc1637,Lorane Le Franc,0,
12620015,53f4744edabfaee43ed2d1f9,Bruno Petton,0,


In [13]:
author_50k_df = pd.read_csv("dataset/google_api_authors_50K.csv")
author_50k_df = author_50k_df.rename(columns={'name':'author'})
author_50k_df

Unnamed: 0,author,link,serpapi_link,author_id,affiliations,email,cited_by,interests,thumbnail,id,query
0,Christopher D Manning,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=1zmD...,1zmDOdwAAAAJ,"Professor of Computer Science and Linguistics,...",Verified email at stanford.edu,239366,"[{'title': 'Natural Language Processing', 'ser...",https://scholar.googleusercontent.com/citation...,0,Natural Language Processing
1,Riccardo Di Sipio,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=FX5A...,FX5AyXoAAAAJ,Ceridian HCM Inc.,,199603,"[{'title': 'machine learning', 'serpapi_link':...",https://scholar.googleusercontent.com/citation...,1,Natural Language Processing
2,Richard Socher,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=FaOc...,FaOcyfMAAAAJ,you.com,Verified email at stanford.edu,170852,"[{'title': 'natural language processing', 'ser...",https://scholar.googleusercontent.com/citation...,2,Natural Language Processing
3,Sara Borroni,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=PGEe...,PGEep1MAAAAJ,Data Scientist at Pangea Formazione Srl,Verified email at pangeaformazione.it,157802,"[{'title': 'Artificial Intelligence', 'serpapi...",https://scholar.googleusercontent.com/citation...,3,Natural Language Processing
4,Tomas Mikolov,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=oBu8...,oBu8kMMAAAAJ,"Senior Researcher, CIIRC CTU",Verified email at cvut.cz,156666,"[{'title': 'Artificial Intelligence', 'serpapi...",https://scholar.googleusercontent.com/citation...,4,Natural Language Processing
...,...,...,...,...,...,...,...,...,...,...,...
4895,Nan Jiang,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=cFT1...,cFT1sL8AAAAJ,"Computer Science Department, Purdue University",Verified email at purdue.edu,393,"[{'title': 'Natural Language Processing', 'ser...",https://scholar.googleusercontent.com/citation...,95,Large Language Model
4896,Yizhao Gao,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=-CIH...,-CIHwykAAAAJ,"Ph.D. student, Renmin University of China",Verified email at ruc.edu.cn,393,"[{'title': 'Multi-Modal Pre-Training', 'serpap...",https://scholar.google.com/citations/images/av...,96,Large Language Model
4897,Junzhi Cao,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=1rSX...,1rSXKxkAAAAJ,New York University,Verified email at nyu.edu,388,"[{'title': 'Venture Capital', 'serpapi_link': ...",https://scholar.googleusercontent.com/citation...,97,Large Language Model
4898,Yaoru Pan,https://scholar.google.com/citations?hl=en&use...,https://serpapi.com/search.json?author_id=RxsZ...,RxsZZugAAAAJ,Postdoc. University of Helsinki,Verified email at helsinki.fi,388,[{'title': 'Artificial intelligence in ecologi...,https://scholar.googleusercontent.com/citation...,98,Large Language Model


In [15]:
def jaccard_sim(x, y):
    if x!=x or y!=y:
        return 0
    num = len(set(str(x)).intersection(set(str(y))))
    den = len(set(str(x)).union(set(str(y))))
    return num/den

def select_min_edit_distance(group, col1, col2):
    # Calculate the edit distance for each row in the group
    group[f'{col1}_simmilarity'] = group.apply(lambda x: jaccard_sim(x[col1],x[col2]), axis=1)
    # Return the row with the minimum edit distance
    return group.loc[group[f'{col1}_simmilarity'].idxmax()]

with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = list(pickle.load(f))
    
with open(AUTHORID_TO_ID_PATH, 'rb') as f:
    authorid_to_id = pickle.load(f)

In [17]:
all_author_df = all_author_df1.copy()
all_author_df = all_author_df[all_author_df['authorid'].isin(authorid_list)]
all_author_df = pd.merge(all_author_df, author_50k_df, on=["author"], how='left')
all_author_df = all_author_df.groupby(['author', 'authorid']).apply(select_min_edit_distance, "org", "affiliations").reset_index(drop=True)
all_author_df['authorid'] = all_author_df['authorid'].apply(lambda x: authorid_to_id[x])
print(all_author_df.shape)
all_author_df

(519291, 15)


Unnamed: 0,authorid,author,n_citations,org,link,serpapi_link,author_id,affiliations,email,cited_by,interests,thumbnail,id,query,org_simmilarity
0,3093,\n Abolfazl\n Falahati,976,Department of Electrical Engineering (DCCS Lab...,,,,,,,,,,,0.0
1,22786,\n Guy E. Blelloch,8344,"Computer Science Department, Carnegie Mellon U...",,,,,,,,,,,0.0
2,67430,\nDino Pedreschi,14655,"KDD Laboratory, Department of Computer Science...",,,,,,,,,,,0.0
3,104272,\nEnrico Tronci,1818,"Dipartimento di Informatica, Università di Rom...",,,,,,,,,,,0.0
4,40018,\nLisheng Jiang,667,"Business School, Sichuan University, China",,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519286,50103,魏武,526,"Department of Automation, Tsinghua University,...",,,,,,,,,,,0.0
519287,100409,黄刘生,8143,,,,,,,,,,,,0.0
519288,116721,黄新力,457,,,,,,,,,,,,0.0
519289,38927,黄联芬,1862,,,,,,,,,,,,0.0


In [20]:
all_author_df.groupby('authorid').first()

Unnamed: 0_level_0,author,n_citations,org,link,serpapi_link,author_id,affiliations,email,cited_by,interests,thumbnail,id,query,org_simmilarity
authorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Nijwm Wary,249,"University of Toronto, Department of E&ECE, Ca...",,,,,,,,,,,0.0
1,Chein-I Chang,1161,School of Physics and Optoelectronic Engineeri...,,,,,,,,,,,0.0
2,Y u-Te Wu,1080,"Integrated Brain Research Laboratory, Departme...",,,,,,,,,,,0.0
3,María Jesús Rodríguez-Triana,1364,"Tallinn Univ, Tallinn, Estonia",,,,,,,,,,,0.0
4,Sun I Kim,2075,"Department of Biomedical Engineering, Hanyang ...",,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137768,Francisco Vazquez Gallego,882,Centre Tecnològic de Telecomunicacions de Cata...,,,,,,,,,,,0.0
137769,Jiaochan Hu,398,"Dalian Maritime Univ, Coll Environm Sci & Engn...",,,,,,,,,,,0.0
137770,P. Somervuo,1191,"Neural Networks Res. Centre, Helsinki Univ. of...",,,,,,,,,,,0.0
137771,YuFan Cheng,450,National Key Laboratory of Science and Technol...,,,,,,,,,,,0.0


In [26]:
# all_author_df = all_author_df.groupby('authorid').first().reset_index()
all_author_df[['author', 'authorid', 'n_citations', 'org', 'serpapi_link']].to_csv(f'{SCRACTCH_PATH}/paper_author_org/author_level_edited.csv')
all_author_df[['author', 'authorid', 'n_citations', 'org', 'serpapi_link']]

Unnamed: 0,author,authorid,n_citations,org,serpapi_link
0,Nijwm Wary,0,249,"University of Toronto, Department of E&ECE, Ca...",
1,Chein-I Chang,1,1161,School of Physics and Optoelectronic Engineeri...,
2,Y u-Te Wu,2,1080,"Integrated Brain Research Laboratory, Departme...",
3,María Jesús Rodríguez-Triana,3,1364,"Tallinn Univ, Tallinn, Estonia",
4,Sun I Kim,4,2075,"Department of Biomedical Engineering, Hanyang ...",
...,...,...,...,...,...
137768,Francisco Vazquez Gallego,137768,882,Centre Tecnològic de Telecomunicacions de Cata...,
137769,Jiaochan Hu,137769,398,"Dalian Maritime Univ, Coll Environm Sci & Engn...",
137770,P. Somervuo,137770,1191,"Neural Networks Res. Centre, Helsinki Univ. of...",
137771,YuFan Cheng,137771,450,National Key Laboratory of Science and Technol...,


## Org

In [9]:
doc_all_df = pd.read_csv(f"{SCRACTCH_PATH}/doc_DBLP.csv")

  doc_all_df = pd.read_csv(f"{SCRACTCH_PATH}/doc_DBLP.csv")


In [10]:
doc_all_df

Unnamed: 0.1,Unnamed: 0,docid,title,abstract,year,author,authorid,org
0,0,0.0,A note on denial-of-service in operating systems,A simple and general definition of denial-of-s...,1984.0,Virgil D. Gligor,53f42ed4dabfaedd74d498b4,"Department of Electrical Engineering, Universi..."
1,1,1.0,Top-Down Construction of 3-D Mechanical Object...,First Page of the Article,1984.0,Hiroshi Yoshiura,56055d5945cedb33965f78ad,"Hitachi Research Laboratory, Hitachi Ltd."
2,2,1.0,Top-Down Construction of 3-D Mechanical Object...,First Page of the Article,1984.0,Kikuo Fujimura,5608ec4a45cedb3396db2920,"Univ. of Tokyo, Tokyo, Japan"
3,3,1.0,Top-Down Construction of 3-D Mechanical Object...,First Page of the Article,1984.0,Tosiyasu L. Kunii,548a62d3dabfae8a11fb49e2,"Univ. of Tokyo, Tokyo, Japan"
4,4,2.0,Deriving A Compiler From An Operational Semant...,This paper addresses the issue of compiler cor...,1985.0,S Mazaher,,"UNIV CALIF LOS ANGELES,DEPT COMP SCI,LOS ANGEL..."
...,...,...,...,...,...,...,...,...
19085590,19085589,5609480.0,Learning through play: design and creation of ...,Digital didactic games are promising tools to ...,2023.0,Bruno Rodrigues Vieira,,"PPGEd, Universidade Federal dos Vales do Jequi..."
19085591,19085590,5609480.0,Learning through play: design and creation of ...,Digital didactic games are promising tools to ...,2023.0,Caroline Queiroz Santos,562b08db45cedb3398969350,"PPGEd e Departamento de Computação, Universida..."
19085592,19085591,5609481.0,Enhancing Learners' Performance in Contest Thr...,The fairness of vocational contest scoring is ...,2024.0,Zhilin Luo,,
19085593,19085592,5609481.0,Enhancing Learners' Performance in Contest Thr...,The fairness of vocational contest scoring is ...,2024.0,Xuefeng Shao,,


In [None]:
with open(AUTHORID_LIST_PATH, 'rb') as f:
    authorid_list = pickle.load(f)
    
with open(AUTHORID_TO_ID_PATH, 'rb') as f:
    authorid_to_id = pickle.load(f)
    
document_preprocessor = RegexTokenizer('\\w+')
stopwords = set()
with open(STOPWORD_PATH, "r") as f:
    for word in f:
        stopwords.add(word.strip())
        
author_index = Indexer.create_author_index(IndexType.InvertedIndex, PAPER_DATA_PATH,
                     document_preprocessor, stopwords, authorid_to_id, authorid_list, minimum_word_frequency=50, text_key="title")
author_index.save(AUTHOR_INDEX)