* code by Sihyun You (2021.12.28.)
* edit by Jehyun Lee (2021.12.30.)
* revised for mrnIF by Jehyun Lee (2022.01.08.)

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pybliometrics.scopus import ScopusSearch
from bs4 import BeautifulSoup
import requests, json
from docx import Document
from copy import deepcopy

In [2]:
# Scopus API Keys
from my_apikeys import APIKeys

In [3]:
# Decide the `subscriber` flag passed to every later ScopusSearch call
# (Institute: True. Home: False).
# A single known DOI is queried as a probe; any ordinary failure of the probe
# (an exception raised by the search itself or by indexing its results) is
# treated as "not a subscriber".
try:
    s_sample = ScopusSearch("DOI (10.1038/s41598-021-83315-9)").results[0]
    subscriber = True
except Exception:
    # `Exception` rather than a bare `except:` so that Ctrl-C / kernel
    # interrupts are not silently interpreted as "non-subscriber".
    subscriber = False

print(f"subscriber={subscriber}")

subscriber=True


In [4]:
def regularize_date_publication(_str):
    """Normalise a publication-date string.

    Commas are dropped and the string is split on whitespace; the result is
    interpreted according to the number and kind of tokens:

    * ``"2021"``              -> ``(2021, "2021")``
    * ``"15 January 2021"``   -> ``(2021, "JAN 15 2021")``
    * ``"January 15, 2021"``  -> ``(2021, "JAN 15 2021")``
    * ``"January 2021"``      -> ``(2021, "JAN 2021")``

    Month names are reduced to their first three letters, upper-cased.

    Parameters
    ----------
    _str : str
        Date text to normalise.

    Returns
    -------
    tuple[int, str]
        The publication year and the normalised date string.

    Raises
    ------
    ValueError
        If the string contains no tokens, or the token taken as the year
        (or as the day candidate) is not an integer.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so "January  15, 2021" or " 2021" do not
    # produce empty tokens that would break the branches below.
    token_date = _str.replace(",", "").split()
    if not token_date:
        raise ValueError(f"empty publication date: {_str!r}")

    if len(token_date) == 1: # year only
        year = token_date[0]
        s = year
    elif (re.match('[0-9]', token_date[0])):
        # day month year
        day, month, year = token_date[0], token_date[1][:3].upper(), token_date[2]
        s = ' '.join([month, day, year])
    elif (re.match('[A-Za-z]', token_date[0]) and re.match('[0-9]', token_date[1]) and int(token_date[1])<32):
        # month day year (a number below 32 is taken as a day, not a year)
        month, day, year = token_date[0][:3].upper(), token_date[1], token_date[2]
        s = ' '.join([month, day, year])
    else:
        # month year
        month, year = token_date[0][:3].upper(), token_date[1]
        s = ' '.join([month, year])

    return (int(year), s)

def get_pub_index(_pub_name, _df):
    """Find the rows of `_df` whose 'TITLE' matches `_pub_name`.

    Both sides are passed through `regularize` before comparison.

    Returns the positions of the matching rows as given by `np.where`,
    or an empty `np.array([])` when nothing matches.
    """
    normalized_titles = _df['TITLE'].map(regularize)
    wanted = regularize(_pub_name)
    matches = np.where(wanted == normalized_titles)[0]
    return matches if len(matches) > 0 else np.array([])

def get_pub_index_eissn(_pub_eissn, _df):
    """Find the rows of `_df` whose 'EISSN' equals `_pub_eissn`.

    Hyphens are removed from the 'EISSN' column before comparing;
    `_pub_eissn` itself is compared as given.

    Returns the positions of the matching rows as given by `np.where`,
    or an empty `np.array([])` when nothing matches.
    """
    eissn_no_hyphen = _df['EISSN'].str.replace("-","").values
    hits = np.where(_pub_eissn == eissn_no_hyphen)[0]
    if len(hits) == 0:
        return np.array([])
    return hits

def regularize(_str):
    """Return a comparison key for `_str`.

    '&' becomes 'and', every run of characters other than ASCII letters and
    digits is removed, and the result is lower-cased.
    """
    with_and = re.sub('&', 'and', _str)
    alnum_only = re.sub('[^A-Za-z0-9]+', '', with_and)
    return alnum_only.lower()

def regularize_space(_str):
    """Return a word-separated comparison key for `_str`.

    '&' becomes 'and', every run of characters other than ASCII letters and
    digits is replaced by a single space, and the result is lower-cased.
    Leading/trailing separators therefore become leading/trailing spaces.
    """
    with_and = re.sub('&', 'and', _str)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', with_and)
    return spaced.lower()

In [5]:
# pandas options: silence the chained-assignment warning and show every column.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# JCR sheets keyed by year (as a string); filled by the loading loop.
dfs_JCR_SCIE = dict()

# First JCR year loaded, the year used for the "2020 impact factor" columns,
# and the current year (exclusive upper bound of the loaded years).
YEAR_START = 2016
YEAR_REMARK = 2020
YEAR_THIS = 2021

In [6]:
# Load one JCR sheet per year into dfs_JCR_SCIE, keyed by the year as a string.
# The workbook is opened once through pd.ExcelFile and each sheet is parsed
# from that handle, instead of re-opening the file for every sheet.
with pd.ExcelFile("./data/JCR_SCIE_(2016-2020)_merged.xlsx") as jcr_workbook:
    for y in range(YEAR_START, YEAR_THIS):
        print(f"{y}년도 시트를 로딩중입니다.")
        dfs_JCR_SCIE[str(y)] = pd.read_excel(jcr_workbook, sheet_name=f"JCR {y}")

2016년도 시트를 로딩중입니다.
2017년도 시트를 로딩중입니다.
2018년도 시트를 로딩중입니다.
2019년도 시트를 로딩중입니다.
2020년도 시트를 로딩중입니다.


In [7]:
# Does the sheet have an EISSN column? If so, rename it (whatever its
# capitalisation) to exactly "EISSN".
for k in dfs_JCR_SCIE.keys():
    sheet = dfs_JCR_SCIE[k]
    upper_names = [name.upper() for name in sheet.columns]
    if "EISSN" in upper_names:
        first_hit = upper_names.index("EISSN")
        dfs_JCR_SCIE[k] = sheet.rename(columns={sheet.columns[first_hit]: "EISSN"})

# EISSN lookup table taken from the 2019 and 2020 sheets, one row per ISSN
# (the first occurrence is kept, so 2019 rows precede 2020 rows).
join_keys = ["Title20", "ISO_ABBREV", "TITLE", "ISSN"]
EISSN_2019 = dfs_JCR_SCIE["2019"][join_keys + ["EISSN"]]
EISSN_2020 = dfs_JCR_SCIE["2020"][join_keys + ["EISSN"]]
EISSN = pd.concat([EISSN_2019, EISSN_2020]).drop_duplicates(subset="ISSN")

# Sheets that have no EISSN column of their own receive it by a left join
# on the journal-identifying columns.
for k in dfs_JCR_SCIE.keys():
    sheet = dfs_JCR_SCIE[k]
    if "EISSN" in [name.upper() for name in sheet.columns]:
        continue
    dfs_JCR_SCIE[k] = sheet.merge(EISSN, how="left", on=join_keys)

In [8]:
# Display the keys of dfs_JCR_SCIE (one entry per loaded JCR year, as strings).
dfs_JCR_SCIE.keys()

dict_keys(['2016', '2017', '2018', '2019', '2020'])

In [15]:
# Load the applicants' publication sheet.
df_applicants = pd.read_excel(
    "./data/HR_input.xlsx",
    header=1,           # take the column names from row index 1 of the sheet
    dtype={"UT": str},  # read a "UT" column as text (TODO confirm it exists)
)

# display example
df_applicants.head(3)

Unnamed: 0,전체순번,수험번호내 순번,수험번호,이름,영문명,논문제목,게재일자,지원자 입력 DOI,수정 DOI,SCIE구분,역할,게재지명,출판사,ISSN,논문구분\n(SCIE),Publication Date,#citation,Publication Year journal impact factor,2020\njournal\nimpact\nfactor,2020 journal impact factor percentile,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue,Notes
0,1409,2,0088-000276,김재형,"Kim, Jae Hyung",A General Strategy to Atomically Dispersed Pre...,2020.01.30,/10.1021/acsnano.9b08494,10.1021/acsnano.9b08494,국외SCIE,주저자,"ACS Nano 2020, 14, 1990-2001.",ACS,-,,,,,,,,,,,,,,
1,683,3,0088-000129,유정원,"Yoo, Jeongwon",Investigation of intrinsic toroidal rotation s...,2017.07.12,/10.1063/1.4991397,10.1063/1.4991397,국외SCIE,주저자,"PHYSICS OF PLASMAS 24, 072510 (2017)",AIP Publishing,-,,,,,,,,,,,,,,
2,684,4,0088-000129,유정원,"Yoo, Jeongwon",Experimental evidence of intrinsic ohmic rotat...,2018.04.25,/10.1063/1.5026905,10.1063/1.5026905,국외SCIE,제2저자,"Phys. Plasmas 25, 044502 (2018);",AIP Publishing,-,,,,,,,,,,,,,,


In [16]:
# Pull out the per-article columns used by the lookup loop:
# paper title, applicant name (Korean / English) and the corrected DOI.
list_title, list_name_kr, list_name_en, list_doi = (
    df_applicants[column].values
    for column in ("논문제목", "이름", "영문명", "수정 DOI")
)

In [17]:
# Show how many DOIs and titles were extracted (the loop zips the two lists).
print(len(list_doi), len(list_title))

1528 1528


In [18]:
# Display the column labels of the applicants sheet; the fill loop writes to these by name.
df_applicants.columns

Index(['전체순번', '수험번호내 순번', '수험번호', '이름', '영문명', '논문제목', '게재일자', '지원자 입력 DOI',
       '수정 DOI', 'SCIE구분', '역할', '게재지명', '출판사', 'ISSN', '논문구분\n(SCIE)',
       'Publication Date', '#citation',
       'Publication Year journal impact factor',
       '2020\njournal\nimpact\nfactor',
       '2020 journal impact factor percentile', '1st Author',
       '1ST AUTHOR\n(Y/N)', 'Reprint Author', 'REPRINT AUTHOR\n(Y/N)',
       'Source\n(Journal)', 'volume', 'issue', 'Notes'],
      dtype='object')

In [22]:
%%time
xmls = []

for i, (doi, title) in enumerate(zip(list_doi, list_title)):
    
    # iteration
    if i%100 == 0 and i > 0:
        print(f"# working on {i}th article....")
        
    # notes
    notes = []
    
    # article
    s = ScopusSearch(f"DOI ({doi})", download=True, subscriber=subscriber).results
    if s == None:
        try:
            s = ScopusSearch(f"TITLE ({title})", download=True, subscriber=subscriber).results
            if s != None:
                doi = s[0].doi
                df_applicants["Notes"][i] = f"DOI 오류: {doi}"
            else:
                print(f"- No.{i} is invalid.")
                notes.append("doi 및 title 확인 필요")
                df_applicants["Notes"][i] = "\n".join(notes)
                continue
        except:
            print(f"- No.{i} is not accessible in Scopus.")
            notes.append("Scopus에서 접근 불가. 확인 필요")
            df_applicants["Notes"][i] = "\n".join(notes)
            continue
            
        
    info_scopus = s[0]

    index_remark = get_pub_index(info_scopus.publicationName, dfs_JCR_SCIE[str(YEAR_REMARK)])
    if len(index_remark) == 0:
        index_remark = get_pub_index_eissn(info_scopus.eIssn, dfs_JCR_SCIE[str(YEAR_REMARK)])

    scie_yn = 'N'
    if len(index_remark) > 0:
        scie_yn = 'Y'
    else:
        scie_yn = 'N'
        notes.append("JCR 목록에 없음")
        df_applicants["Notes"][i] = "\n".join(notes)
        continue
    
    df_applicants["논문구분\n(SCIE)"][i] = scie_yn
    
    year, date = regularize_date_publication(info_scopus.coverDisplayDate)
    df_applicants["Publication Date"][i] = date
    df_applicants["#citation"][i] = str(info_scopus.citedby_count)
    
    if year < YEAR_START:
        notes.append(f"{YEAR_START-1} 이전 논문")
        df_applicants["Notes"][i] = "\n".join(notes)
        continue
    
    elif year < YEAR_THIS:
        list_n = get_pub_index(info_scopus.publicationName, dfs_JCR_SCIE[str(year)])
        if len(list_n) > 0:
            index_n = list_n[0]
        else:
            list_n = get_pub_index_eissn(info_scopus.eIssn, dfs_JCR_SCIE[str(year)])
            if len(list_n) > 0:
                index_n = list_n[0]
            else:
                notes.append(f"IF를 찾지 못함")
                df_applicants["Notes"][i] = "\n".join(notes)
                continue
        jif_n = str(dfs_JCR_SCIE[str(year)]["IMPACT_FACTOR"][index_n])
    else:
        jif_n = f"{YEAR_THIS}년 출판으로 IF미발급"
        
    df_applicants["Publication Year journal impact factor"][i] = jif_n
    df_applicants["2020\njournal\nimpact\nfactor"][i] = str(dfs_JCR_SCIE[str(YEAR_REMARK)]["IMPACT_FACTOR"][index_remark[0]])
    
    ifp = dfs_JCR_SCIE[str(YEAR_REMARK)].loc[index_remark, "mrnIF"]
    try: 
        df_applicants["2020 journal impact factor percentile"][i] = str(ifp.values[0])
    except:
        df_applicants["2020 journal impact factor percentile"][i] = "mrnIF 없음"
    
    ### Author data
    authors_raw = np.array([regularize(n) for n in info_scopus.author_names.split(";")])
    
    list_name_eni = list_name_en[i]
    if "," not in list_name_eni: # 이름 성 
        list_name_eni_ = deepcopy(list_name_eni)
        list_name_eni_ = list_name_eni_.split(" ")
        list_name_eni = ", ".join(list_name_eni_[1:] + list_name_eni_[:1])
        
    try:
        idx_author = np.where(regularize(list_name_eni) == authors_raw)[0][0]
    except IndexError: # swap family and last name
        try:
            idx_author = np.where(regularize("".join(list_name_eni.split(",")[::-1])) == authors_raw)[0][0]
        except IndexError : # possibly disturbed by middle names
            idx_author=None
            author_name_words = regularize_space(list_name_eni).split(" ")
            name_words = [regularize_space(name).rstrip(" ").split(" ") for name in info_scopus.author_names.split(";")]
            for j, name_word in enumerate(name_words):
                name_check = list(set(author_name_words) - set(name_word))
                if len(name_check) == 0:
                    idx_author = j
                    notes.append("지원자 성명 확인 필요")
                    
    if idx_author == None:
        # 이니셜로만 되어있는건 아닌지 확인
        familyname = list_name_eni.split(", ")[0].lower()
        firstname = list_name_eni.split(", ")[1].lower()
        name_TF = [True if ((n[0] == familyname and len(set(n[1:])-set(firstname))==0) or 
                            (n[-1]==familyname and len(set(n[:-1])-set(firstname))==0)) 
                   else False 
                   for n in name_words]
        if len(np.where(np.array(name_TF)==True)[0]) > 0:
            idx_author = np.where(np.array(name_TF)==True)[0][0]
            notes.append("논문 저자 이름이 약어로 표현됨. 확인 필요.")
        else:
            notes.append("지원자가 저자 명단에 없음")
            
        
    # first author    
    if idx_author == 0:
        first_author = list_name_eni
        first_author_yn = "Y"
    else:
        first_author = info_scopus.author_names.split(";")[0]
        first_author_yn = "N"    
    
    df_applicants["1st Author"][i] = first_author
    df_applicants["1ST AUTHOR\n(Y/N)"][i] = first_author_yn    
    df_applicants["1st Author"][i] = first_author
    
        
    ### Publication
    df_applicants["Source\n(Journal)"][i] = info_scopus.publicationName.upper()
    df_applicants["volume"][i] = info_scopus.volume
    
    if info_scopus.issueIdentifier != None:
        issue = info_scopus.issueIdentifier
    else:
        issue = ''
    df_applicants["issue"][i] = issue
    
    # PDF download
    accept = "application/pdf"
    HEADERS = {
        'X-ELS-APIKEY': APIKeys[-1],
        'Accept': accept
    }
    url = f'http://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    try:
        r = requests.get(url, stream=True, headers=HEADERS)
        if r.status_code == 200: # download supported in Scopus
            if accept == "application/pdf":
                for chunk in r.iter_content(chunk_size=1024*1024):
                    with open(f"./pdf/paper_{i+1}.pdf", "wb") as f:
                        f.write(chunk)
            elif accept == "text/xml":
                xml = BeautifulSoup(r.content, "html5lib")
                xmls.append(xml)
                with open(f"./xml/xml_{i+1}.xml", "w") as f:
                    f.write(xml.prettify())
            
        # XML data mining
        url = f"https://doi.org/{doi}"
        r = requests.get(url)
        xml = BeautifulSoup(r.content, "html5lib")
        xmls.append(xml)            
        with open(f"./xml/xml_{i+1}.xml", "w") as f:
            f.write(xml.prettify())
    except:
        pass

    # write in every step
    df_applicants["Notes"][i] = "\n".join(notes)
    df_applicants.to_excel("applicants_fill.xlsx", index=False)
    
print(len(xmls))


- No.57 is invalid.
- No.92 is not accessible in Scopus.
# working on 100th article....
# working on 200th article....
- No.210 is invalid.
- No.227 is invalid.
# working on 300th article....
- No.367 is invalid.
# working on 400th article....
- No.475 is invalid.
# working on 500th article....
- No.533 is invalid.
# working on 600th article....
- No.682 is invalid.
# working on 700th article....
- No.717 is invalid.
- No.788 is invalid.
- No.789 is invalid.
# working on 800th article....
- No.876 is invalid.
# working on 900th article....
- No.904 is invalid.
- No.926 is invalid.
- No.954 is not accessible in Scopus.
# working on 1000th article....
- No.1023 is not accessible in Scopus.
- No.1045 is invalid.
- No.1081 is invalid.
- No.1082 is not accessible in Scopus.
- No.1085 is not accessible in Scopus.
- No.1088 is invalid.
- No.1090 is invalid.
- No.1099 is invalid.
# working on 1100th article....
- No.1100 is invalid.
- No.1148 is invalid.
# working on 1200th article....
- No.12