* code by Sihyun You (2021.12.28.)
* edit by Jehyun Lee (2021.12.30.)
* revised for mrnIF by Jehyun Lee (2022.01.08.)

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pybliometrics.scopus import ScopusSearch
from bs4 import BeautifulSoup
import requests, json
from docx import Document
from copy import deepcopy

In [2]:
# Scopus API Keys
from my_apikeys import APIKeys

In [3]:
# subscriber (Institute: True. Home: False)
try:
    s_sample = ScopusSearch(f"DOI (10.1038/s41598-021-83315-9)").results[0]
    subscriber=True
except:
    subscriber=False

print(f"subscriber={subscriber}")    

subscriber=True


In [4]:
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", None)
dfs_JCR_SCIE = {}
YEAR_START, YEAR_REMARK, YEAR_THIS = 2016, 2020, 2021

In [5]:
for y in range(YEAR_START, YEAR_THIS):
    print(f"{y}년도 시트를 로딩중입니다.")
    dfs_JCR_SCIE.update({str(y):pd.read_excel("./data/JCR_SCIE_(2016-2020)_merged.xlsx", sheet_name=f"JCR {y}")})

2016년도 시트를 로딩중입니다.
2017년도 시트를 로딩중입니다.
2018년도 시트를 로딩중입니다.
2019년도 시트를 로딩중입니다.
2020년도 시트를 로딩중입니다.


In [30]:
def regularize_date_publication(_str):
    _str = _str.replace(",", "")
    token_date = _str.split(' ')
    if len(token_date) == 1: # year only
        year = _str
        s = _str
    elif (re.match('[0-9]', token_date[0])):
        day, month, year = token_date[0], token_date[1][:3].upper(), token_date[2]
        s = ' '.join([month, day, year])
    elif (re.match('[A-Za-z]', token_date[0]) and re.match('[0-9]', token_date[1]) and int(token_date[1])<32):    
        month, day, year = token_date[0][:3].upper(), token_date[1], token_date[2]
        s = ' '.join([month, day, year])
    else:
        month, year = token_date[0][:3].upper(), token_date[1]
        s = ' '.join([month, year])
        
    return (int(year), s)

def get_pub_index(_pub_name, _df):
    list_title = _df['TITLE'].map(regularize)
    pub_index = np.where(regularize(_pub_name) == list_title)[0]
    if len(pub_index) > 0:
        return pub_index
    return np.array([])

def get_pub_index_eissn(_pub_eissn, _df):
    list_eissn = _df['EISSN'].str.replace("-","").values
    pub_index = np.where(_pub_eissn == list_eissn)[0]
    if len(pub_index) > 0:
        return pub_index
    return np.array([])

def regularize(_str):
    return re.sub('[^A-Za-z0-9]+', '', re.sub('&', 'and', _str)).lower()       

def regularize_space(_str):
    return re.sub('[^A-Za-z0-9]+', ' ', re.sub('&', 'and', _str)).lower()       

In [55]:
# find the most frequent element
def most_frequent(List):
    return max(set(List), key = List.count)

In [6]:
df_applicants = pd.read_excel("./data/HR_input.xlsx", header=1, dtype={"UT":str})

# display example
df_applicants.head(3)

Unnamed: 0,전체순번,수험번호내 순번,수험번호,이름,영문명,논문제목,게재일자,지원자 입력 DOI,수정 DOI,SCIE구분,역할,게재지명,출판사,ISSN,논문구분\n(SCIE),Publication Date,#citation,Publication Year journal impact factor,2020\njournal\nimpact\nfactor,2020 journal impact factor percentile,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue,Notes
0,1409,2,0088-000276,김재형,"Kim, Jae Hyung",A General Strategy to Atomically Dispersed Pre...,2020.01.30,/10.1021/acsnano.9b08494,10.1021/acsnano.9b08494,국외SCIE,주저자,"ACS Nano 2020, 14, 1990-2001.",ACS,-,,,,,,,,,,,,,,
1,683,3,0088-000129,유정원,"Yoo, Jeongwon",Investigation of intrinsic toroidal rotation s...,2017.07.12,/10.1063/1.4991397,10.1063/1.4991397,국외SCIE,주저자,"PHYSICS OF PLASMAS 24, 072510 (2017)",AIP Publishing,-,,,,,,,,,,,,,,
2,684,4,0088-000129,유정원,"Yoo, Jeongwon",Experimental evidence of intrinsic ohmic rotat...,2018.04.25,/10.1063/1.5026905,10.1063/1.5026905,국외SCIE,제2저자,"Phys. Plasmas 25, 044502 (2018);",AIP Publishing,-,,,,,,,,,,,,,,


In [26]:
# 이름으로 정렬
df_applicants = df_applicants.sort_values(["이름", "영문명"])
gdf_applicants = df_applicants[["수험번호", "이름", "영문명", "논문제목", "수정 DOI"]].groupby(["수험번호", "이름", "영문명"]).agg(list).reset_index()
gdf_applicants.head(3)

Unnamed: 0,수험번호,이름,영문명,논문제목,수정 DOI
0,0088-000002,고윤지,"Ko, Younji",[Room-Temperature Metallic Fusion-Induced Laye...,"[10.1002/adfm.201806584, 10.1002/adfm.20210253..."
1,0088-000003,백승준,"Baik, Seungjoon",[A concept design of supercritical CO2 cooled ...,"[10.1002/er.3633, 10.1016/j.apenergy.2017.08.0..."
2,0088-000005,송준호,"Song, Junho",[Time~Frequency Mask Estimation Based on Deep ...,"[10.1109/TSG.2021.3066547, 10.5626/KTCP.2019.2..."


In [60]:
# author data 추출
list_names= gdf_applicants["영문명"].values
list_titles= gdf_applicants["논문제목"].values
list_dois= gdf_applicants["수정 DOI"].values

n_applicants = gdf_applicants.shape[0]

In [62]:
df_personal = pd.DataFrame({"수험번호": gdf_applicants["수험번호"].values,
                            "이름": gdf_applicants["이름"].values,
                            "영문명": list_names,
                            "Scopus Author ID": [np.nan] * n_applicants,
                            "h-index": [np.nan] * n_applicants,
                            "총 출판물수": [np.nan] * n_applicants,
                            "총 피인용수": [np.nan] * n_applicants,
                            "Notes": [np.nan] * n_applicants,
                           })

for i, (name, titles, dois) in enumerate(zip(list_names, list_titles, list_dois)):

    # find article for the author
    list_author_ids = []
    for title, doi in zip(titles, dois):
        s = ScopusSearch(f"DOI ({doi})", download=True, subscriber=subscriber).results
        if s == None:
            s = ScopusSearch(f"TITLE ({title})", download=True, subscriber=subscriber).results
            if s != None:
                info_scopus = s[0]
            else:
                continue
        else:
            info_scopus = s[0]
        
        ### Author data
        authors_raw = np.array([regularize(n) for n in info_scopus.author_names.split(";")])
        author_ids = info_scopus.author_ids.split(";")
    
        try:
            idx_author = np.where(regularize(name) == authors_raw)[0][0]
        except IndexError: # swap family and last name
            try:
                idx_author = np.where(regularize("".join(name.split(",")[::-1])) == authors_raw)[0][0]
            except IndexError : # possibly disturbed by middle names
                idx_author=None
                author_name_words = regularize_space(name).split(" ")
                name_words = [regularize_space(n).split(" ") for n in info_scopus.author_names.split(";")]
                for j, name_word in enumerate(name_words):
                    name_check = list(set(author_name_words) - set(name_word))
                    if len(name_check) == 0:
                        idx_author = j

        if idx_author != None:
            list_author_ids.append(author_ids[idx_author])

    if len(list_author_ids) == 0:
        h_index = 0
        document_count_total = 0
        citedby_count_total = 0
        
    author_id_final = most_frequent(list_author_ids)
    
    # personal records
    author_r = requests.get(f"http://api.elsevier.com/content/author?author_id={author_id_final}&view=metrics",
                            headers={'Accept':'application/json', 'X-ELS-APIKey': APIKeys[-2]})
    author_data = eval(json.dumps(author_r.json(), sort_keys=True, indent=4, separators=(',', ': ')))
    h_index = author_data['author-retrieval-response'][0]["h-index"]
    citedby_count_total = author_data['author-retrieval-response'][0]["coredata"]['cited-by-count']
    document_count_total = author_data['author-retrieval-response'][0]["coredata"]['document-count']
    
    # write to file
    df_personal["h-index"][i] = h_index
    df_personal['총 출판물수'][i] = document_count_total
    df_personal['총 피인용수'][i] = citedby_count_total
    
df_personal.to_excel("applicants_personal.xlsx", index=False)

ValueError: max() arg is an empty sequence

In [63]:
i

9

In [64]:
name

'Kwon, Daeyong'

In [65]:
list_author_ids

[]

In [66]:
gdf_applicants.loc[9]

수험번호                                            0088-000014
이름                                                      권대용
영문명                                           Kwon, Daeyong
논문제목      [외부 메모리 기반 대용량 점군 데이터의 가시화 및 편집, 선박 및 플랜트 구조물 ...
수정 DOI         [10.7315/CDE.2020.267, 10.7315/CDE.2020.406]
Name: 9, dtype: object