* code by Sihyun You (2021.12.28.)
* edit by Jehyun Lee (2021.12.30.)
* revised for mrnIF by Jehyun Lee (2022.01.08.)
* bugfix for mrnIF by Jehyun Lee (2022.01.20.)
* use the English names submitted by applicants (2022.06.17.)

# Environment settings

* import libraries

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pybliometrics.scopus import ScopusSearch
from bs4 import BeautifulSoup
import requests, json
from docx import Document
from copy import deepcopy

* Scopus API Keys

In [2]:
from my_apikeys import APIKeys

# Probe the Scopus API with one known DOI. The resulting `subscriber` flag is
# passed to every later ScopusSearch call in this notebook.
try:
    s_sample = ScopusSearch("DOI (10.1021/acsnano.9b08494)").results[0]
    subscriber = True
except Exception:
    # Any failure (no key, no network, empty result) counts as "no access".
    # `except Exception` still lets KeyboardInterrupt / SystemExit through,
    # which a bare `except:` would have swallowed.
    subscriber = False

print(f"subscriber={subscriber}")

subscriber=True


* settings

In [3]:
# pandas options: no chained-assignment warning, no limit on displayed columns.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# Ready for retrieving

* **IMPORTANT: Retrieving Range**

In [4]:
# Years covered by the available JCR data (YEAR_END is inclusive).
YEAR_START, YEAR_END = 2016, 2020

# Reference year used when evaluating papers.
YEAR_REMARK = 2020

* JCR data load

In [5]:
# Load one worksheet per year ("JCR <year>") into a dict keyed by the year as a string.
data_JCR = "./data/JCR_SCIE_(2016-2020)_merged.xlsx"
dfs_JCR_SCIE = {}

for y in range(YEAR_START, YEAR_END + 1):
    print(f"{y}년도 시트를 로딩중입니다.")
    dfs_JCR_SCIE[str(y)] = pd.read_excel(data_JCR, sheet_name=f"JCR {y}")

2016년도 시트를 로딩중입니다.
2017년도 시트를 로딩중입니다.
2018년도 시트를 로딩중입니다.
2019년도 시트를 로딩중입니다.
2020년도 시트를 로딩중입니다.


* 확보한 JCR data에 2019, 2020년 기준으로 EISSN 데이터 추가

In [6]:
# Step 1: whatever its capitalisation, the EISSN column is renamed to exactly "EISSN".
for k in list(dfs_JCR_SCIE):
    jcr_sheet = dfs_JCR_SCIE[k]
    col_capitals = [c.upper() for c in jcr_sheet.columns]
    if "EISSN" not in col_capitals:
        continue
    idx_eissn = col_capitals.index("EISSN")
    dfs_JCR_SCIE[k] = jcr_sheet.rename(columns={jcr_sheet.columns[idx_eissn]: "EISSN"})

# Step 2: build a journal -> EISSN lookup from the 2019 and 2020 sheets,
# keeping the first row seen for each ISSN.
eissn_columns = ["Title20", "ISO_ABBREV", "TITLE", "ISSN", "EISSN"]
EISSN_2019 = dfs_JCR_SCIE["2019"][eissn_columns]
EISSN_2020 = dfs_JCR_SCIE["2020"][eissn_columns]
EISSN = pd.concat([EISSN_2019, EISSN_2020], axis=0).drop_duplicates("ISSN")

# Step 3: sheets that have no EISSN column get one through a left merge on the
# journal-identifying columns.
for k in list(dfs_JCR_SCIE):
    col_capitals = [c.upper() for c in dfs_JCR_SCIE[k].columns]
    if "EISSN" in col_capitals:
        continue
    dfs_JCR_SCIE[k] = dfs_JCR_SCIE[k].merge(EISSN, how="left", on=eissn_columns[:-1])

* natural language processor

In [7]:
def regularize_date_publication(_str):
    """Normalise a publication-date string.

    Commas are ignored and any amount of whitespace separates the tokens.
    Recognised layouts:

    * ``"2021"``            -> ``(2021, "2021")``
    * ``"26 August 2021"``  -> ``(2021, "AUG 26 2021")``
    * ``"August 26, 2021"`` -> ``(2021, "AUG 26 2021")``
    * ``"August 2021"``     -> ``(2021, "AUG 2021")``

    Args:
        _str: date text as delivered by the bibliographic source.

    Returns:
        ``(year, text)`` where ``year`` is an ``int`` and ``text`` is the
        normalised date with a three-letter upper-case month.

    Raises:
        ValueError: if ``_str`` holds no tokens or the year token is not an integer.
        IndexError: if a day-first date has fewer than three tokens.
    """
    # split() without an argument collapses repeated blanks and drops leading/
    # trailing ones; split(' ') produced empty tokens for such input.
    token_date = _str.replace(",", "").split()
    if not token_date:
        raise ValueError(f"empty publication date: {_str!r}")

    if len(token_date) == 1:    # year only
        year = s = token_date[0]
    elif re.match('[0-9]', token_date[0]):
        # '26 August 2021' -> 'AUG 26 2021'
        day, month, year = token_date[0], token_date[1][:3].upper(), token_date[2]
        s = ' '.join([month, day, year])
    elif (re.match('[A-Za-z]', token_date[0])
          and re.match('[0-9]', token_date[1])
          and int(token_date[1]) < 32):
        # 'August 26, 2021' -> 'AUG 26 2021'
        month, day, year = token_date[0][:3].upper(), token_date[1], token_date[2]
        s = ' '.join([month, day, year])
    else:
        # 'August 2021' -> 'AUG 2021'
        month, year = token_date[0][:3].upper(), token_date[1]
        s = ' '.join([month, year])

    return (int(year), s)

def get_pub_index(_pub_name, _df):
    """Find the rows of ``_df`` whose ``TITLE`` equals ``_pub_name``.

    Both sides are compared after ``regularize``. Returns the zero-based row
    positions of the matches as an array, or an empty array when none match.
    """
    titles_reg = _df['TITLE'].map(regularize)
    wanted = regularize(_pub_name)
    hits = np.where(wanted == titles_reg)[0]
    return hits if len(hits) > 0 else np.array([])

def get_pub_index_eissn(_pub_eissn, _df):
    """Find the rows of ``_df`` whose ``EISSN`` equals ``_pub_eissn``.

    Hyphens are removed from the ``EISSN`` column before comparing;
    ``_pub_eissn`` is used as given. Returns the zero-based row positions of
    the matches as an array, or an empty array when none match.
    """
    eissn_plain = _df['EISSN'].str.replace("-", "").values
    hits = np.where(_pub_eissn == eissn_plain)[0]
    if len(hits) == 0:
        return np.array([])
    return hits

def regularize(_str):
    """Return ``_str`` lower-cased with '&' spelled 'and' and every character
    other than ASCII letters and digits removed."""
    spelled_out = re.sub('&', 'and', _str)
    alnum_only = re.sub('[^A-Za-z0-9]+', '', spelled_out)
    return alnum_only.lower()

def regularize_space(_str):
    """Return ``_str`` lower-cased with '&' spelled 'and' and every run of
    characters other than ASCII letters and digits replaced by one space."""
    spelled_out = re.sub('&', 'and', _str)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', spelled_out)
    return spaced.lower()

# Working on Applicants data

## Scopus

* 지원자 데이터 불러오기

In [310]:
# Applicants' paper list. Column labels are read from the sheet's second row
# (header=1) and the "UT" column is read as text.
data_applicants = "./220616_2nd/validation_source.xlsx"
df_applicants = pd.read_excel(
    data_applicants,
    header=1,
    dtype=dict(UT=str),
)

# Quick look at what was loaded: column labels, then (rows, columns).
# (The head() value is not shown because it is not the cell's last expression.)
print(df_applicants.columns)
df_applicants.head(3)
print(df_applicants.shape)

Index(['전체순번', '수험번호 내 순번', '수험번호', '성명', '영문명', '논문 내 영문명', '논문명', '게재일자',
       'DOI', '저널구분', '저자구분', '저널명', '출판사', 'ISSN', '저자순위', '게재여부',
       '논문구분\n(SCIE)', 'Publication Date', '#citation',
       'Publication Year journal impact factor',
       '2020\njournal\nimpact\nfactor',
       '2020 journal impact factor percentile', '1st Author',
       '1ST AUTHOR\n(Y/N)', 'Reprint Author', 'REPRINT AUTHOR\n(Y/N)',
       'Source\n(Journal)', 'volume', 'issue', 'Notes'],
      dtype='object')
(1621, 30)


* 논문 내 영문명 결측치를 영문명으로 메움

In [311]:
# Where "논문 내 영문명" (name as printed in the paper) is missing, use "영문명".
# The whole column is reassigned instead of `df[col].loc[rows] = ...`: that
# chained form writes to an intermediate Series, and pandas does not guarantee
# the change reaches `df_applicants`.
idx_name_en_missing = np.where(df_applicants["논문 내 영문명"].isna())[0]
df_applicants["논문 내 영문명"] = df_applicants["논문 내 영문명"].fillna(df_applicants["영문명"])

* paper data retrieve

In [312]:
# %%time

# Scopus pass: for every applicant row in [start, end) look the paper up in
# Scopus (by DOI, then by title), fill the bibliometric columns of
# `df_applicants` from Scopus + JCR, and save the sheet after every row.

# Row range. `start` is also the point to resume from after an interruption.
start = 1587
end = df_applicants.shape[0]
end = 1588    # NOTE(review): overrides the full range set on the previous line

# Columns written by this pass. They are cast to object up front so that
# writing text or numbers does not depend on pandas implicitly upcasting
# columns that were read as all-NaN floats.
_RESULT_COLUMNS = [
    "Notes", "논문구분\n(SCIE)", "Publication Date", "#citation",
    "Publication Year journal impact factor",
    "2020\njournal\nimpact\nfactor",
    "2020 journal impact factor percentile",
    "1st Author", "1ST AUTHOR\n(Y/N)",
    "Source\n(Journal)", "volume", "issue",
]
df_applicants[_RESULT_COLUMNS] = df_applicants[_RESULT_COLUMNS].astype(object)

for i in range(start, end):

    # verbose: iteration
    if i % 100 == 0 and i > 0:
        print(f"# working on {i}th article....")

    # One row of the sheet, as a Series.
    df = df_applicants.loc[i]

    # data extraction.
    doi = df["DOI"]
    print(f"{doi}")
    if isinstance(doi, str):
        # Remove a leading "https://doi.org/", "dx.doi.org/" or "doi:" prefix.
        # (str.lstrip() takes a set of characters, not a prefix.)
        doi = re.sub(r"^\s*(?:(?:https?://)?(?:dx\.)?doi\.org/|doi:\s*)", "", doi,
                     flags=re.IGNORECASE)

    title = df["논문명"]
    no = df["전체순번"]

    ### article retrieval
    notes = []    # remarks collected for this row; joined into the "Notes" cell
    s = ScopusSearch(f"DOI ({doi})", download=True, subscriber=subscriber).results
    if s is None:
        try:
            s = ScopusSearch(f"TITLE ({title})", download=True, subscriber=subscriber).results
            # Not found by DOI but found by title.
            if s is not None:
                doi = s[0].doi
                notes.append(f"DOI 오류: {doi}")
            # Found by neither DOI nor title.
            else:
                print(f"- 전체순번 {no} is invalid.")
                notes.append("Scopus에서 doi 및 title 검색 불가: 확인 필요")
                df_applicants.loc[i, "Notes"] = "\n".join(notes)
                continue
        # The title search itself failed.
        except Exception:
            print(f"- No.{i} is not accessible in Scopus.")
            notes.append("Scopus에서 접근 불가. 확인 필요")
            df_applicants.loc[i, "Notes"] = "\n".join(notes)
            continue

    info_scopus = s[0]

    ### SCIE (Y/N)
    # Row position of the journal in the reference-year JCR table:
    # by title first, by EISSN if the title is not found.
    jcr_remark = dfs_JCR_SCIE[str(YEAR_REMARK)]
    index_remark = get_pub_index(info_scopus.publicationName, jcr_remark)
    if len(index_remark) == 0:
        index_remark = get_pub_index_eissn(info_scopus.eIssn, jcr_remark)

    # decision
    if len(index_remark) > 0:
        df_applicants.loc[i, "논문구분\n(SCIE)"] = "Y"
    else:
        df_applicants.loc[i, "논문구분\n(SCIE)"] = "N"
        notes.append("JCR 목록에 없음")

    ### Publication Date
    year, date = regularize_date_publication(info_scopus.coverDisplayDate)
    df_applicants.loc[i, "Publication Date"] = date
    df_applicants.loc[i, "#citation"] = str(info_scopus.citedby_count)

    ### Impact Factor @Publication year
    # Reset for every row: previously a paper older than YEAR_START left
    # `jif_n` unset and the previous row's value was written into this row.
    jif_n = None
    if year < YEAR_START:
        notes.append(f"{YEAR_START-1} 이전 논문")

    elif year <= YEAR_END:
        jcr_year = dfs_JCR_SCIE[str(year)]
        index_pub = get_pub_index(info_scopus.publicationName, jcr_year)
        if len(index_pub) == 0:
            index_pub = get_pub_index_eissn(info_scopus.eIssn, jcr_year)

        if len(index_pub) > 0:
            # The lookup helpers return row positions, hence .iloc.
            jif_n = str(jcr_year["IMPACT_FACTOR"].iloc[index_pub[0]])
        else:
            # The journal is not in that year's JCR data.
            notes.append("IF를 찾지 못함")
            jif_n = "IF 미발급: JCR Data에 없음"

    else:
        jif_n = f"IF 미발급: {year}년 출판"

    if jif_n is not None:
        df_applicants.loc[i, "Publication Year journal impact factor"] = jif_n

    ### Impact Factor and percentile @remark year
    if index_remark.size > 0:
        pos_remark = index_remark[0]
        df_applicants.loc[i, "2020\njournal\nimpact\nfactor"] = str(jcr_remark["IMPACT_FACTOR"].iloc[pos_remark])
        df_applicants.loc[i, "2020 journal impact factor percentile"] = str(jcr_remark["JIF_PERCENTILE"].iloc[pos_remark])
    else:
        df_applicants.loc[i, "2020 journal impact factor percentile"] = "mrnIF 없음"

    ### Author data
    # authors in paper
    if info_scopus.author_names:
        authors_raw = info_scopus.author_names.split(";")
        authors_reg = [regularize(n) for n in authors_raw]

        # Spellings of the applicant's name to try.
        name = df["영문명"]
        name_paper = df["논문 내 영문명"]
        name_swap = ", ".join(name.split(" ")[1:] + name.split(" ")[:1]).rstrip(",")
        name_paper_swap = " ".join([name_paper.split(" ")[-1]] + name_paper.split(" ")[:-1])
        name_reg = regularize(name)
        name_paper_reg = regularize(name_paper)
        # Was built from the characters of `name` rather than from `name_swap`.
        name_swap_reg = regularize(name_swap)
        name_paper_swap_reg = regularize(name_paper_swap)

        # Name matching in priority order: raw spellings first, then regularized.
        candidates = [
            (name_paper, authors_raw),
            (name, authors_raw),
            (name_swap, authors_raw),
            (name_paper_swap, authors_raw),
            (name_paper_reg, authors_reg),
            (name_reg, authors_reg),
            (name_swap_reg, authors_reg),
            (name_paper_swap_reg, authors_reg),
        ]
        idx_author = None
        for needle, haystack in candidates:
            hits = [needle in a for a in haystack]
            if any(hits):
                idx_author = hits.index(True)
                break
        else:
            # No spelling matched: check whether the author list uses initials.
            if "," in name:    # "Family, Given"
                familyname, _, givenname = name.partition(",")
                familyname, givenname = familyname.strip().lower(), givenname.strip().lower()
            else:              # "Given Family"
                name_parts = name.split(" ")
                familyname, givenname = name_parts[-1].lower(), "".join(name_parts[:-1]).lower()

            # An author matches when its first or last word is the family name
            # and every letter of its remaining words occurs in the given name.
            # (The old test compared a single character with the family name.)
            given_letters = set(regularize(givenname))
            name_TF = []
            for a in authors_raw:
                tokens = regularize_space(a).split()
                name_TF.append(
                    len(tokens) > 1 and (
                        (tokens[0] == familyname and set("".join(tokens[1:])) <= given_letters)
                        or (tokens[-1] == familyname and set("".join(tokens[:-1])) <= given_letters)
                    )
                )
            name_scopus_err_reg = regularize(" ".join([*givenname.split(" ")[1:], familyname, givenname.split(" ")[0]]))

            if any(name_TF):
                idx_author = name_TF.index(True)
                notes.append("확인 필요: 논문 저자 이름이 약어로 표현됨. 확인 필요.")

            # Case where "Lim, Suk Hyun" is stored in Scopus as "Hyun Lim, Suk":
            # 10.1016/j.fuel.2021.122481
            elif name_scopus_err_reg in authors_reg:
                idx_author = authors_reg.index(name_scopus_err_reg)
                notes.append("확인 필요: ScopusDB에 이름이 한 글자씩 밀려서 잘못 입력된 듯함.")
            else:
                idx_author = None
                notes.append("확인 필요: 저자 명단에서 지원자를 찾지 못함.")

        # first author
        first_author = authors_raw[0]
        first_author_yn = "Y" if idx_author == 0 else "N"

        df_applicants.loc[i, "1st Author"] = first_author
        df_applicants.loc[i, "1ST AUTHOR\n(Y/N)"] = first_author_yn

    ### Publication
    df_applicants.loc[i, "Source\n(Journal)"] = info_scopus.publicationName.upper()
    df_applicants.loc[i, "volume"] = info_scopus.volume

    issue = info_scopus.issueIdentifier if info_scopus.issueIdentifier is not None else ''
    df_applicants.loc[i, "issue"] = issue

    # Save after every row so an interrupted run loses nothing.
    df_applicants.loc[i, "Notes"] = "\n".join(notes)
    df_applicants.to_excel(f"applicants_fill_from{start}_to{end}.xlsx", index=False)

10.1016/j.jechem.2021.09.004


## Semantic Scholar

In [300]:
# Reload the sheet saved by the Scopus pass (file name derived from start/end).
scopus_output = f"applicants_fill_from{start}_to{end}.xlsx"
df_applicants = pd.read_excel(scopus_output)
df_applicants.head(3)

Unnamed: 0,전체순번,수험번호 내 순번,수험번호,성명,영문명,논문 내 영문명,논문명,게재일자,DOI,저널구분,저자구분,저널명,출판사,ISSN,저자순위,게재여부,논문구분\n(SCIE),Publication Date,#citation,Publication Year journal impact factor,2020\njournal\nimpact\nfactor,2020 journal impact factor percentile,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue,Notes
0,1,1,0100-000001,고윤지,"Ko, Younji",Younji Ko,Layer-by-Layer Assembly-Based Electrocatalytic...,2021.08.26,10.1002/adfm.202102530,국외SCIE,주저자,"Advanced Functional Materials 31, 2102530, 2021",Wiley-VCH,1616-301X,(1/11),게재,Y,AUG 26 2021,0.0,IF 미발급: 2021년 출판,18.808,95.6586826,"Ko, Younji",Y,,,ADVANCED FUNCTIONAL MATERIALS,31.0,35.0,
1,2,2,0100-000001,고윤지,"Ko, Younji",Younji Ko,Hydrophobic and Hydrophilic Nanosheet Catalyst...,2018.04.01,10.1016/j.apsusc.2017.12.037,국외SCIE,주저자,"Applied Surface Science 436, 791, 2018",Elsevier,0169-4332,(1/4),게재,Y,APR 1 2018,3.0,5.155,6.707,97.6190476,"Ko, Younji",Y,,,APPLIED SURFACE SCIENCE,436.0,,
2,3,3,0100-000001,고윤지,"Ko, Younji",Younji Ko,Carbonization/Interfacial Assembly-Driven Elec...,2022.05.10,Under Revision,국외SCIE,주저자,Energy & Environmental Science,Royal Society of Chemistry,-,(2/5),제출,,,,,,,,,,,,,,Scopus에서 doi 및 title 검색 불가: 확인 필요


In [302]:
# Second-pass targets: rows whose Notes contain "불가", i.e. the rows the
# Scopus pass marked as not searchable / not accessible.
unresolved = df_applicants["Notes"].str.contains("불가", na=False)
df_2nd = df_applicants.loc[unresolved]
print(df_2nd.shape)
df_2nd.head(3)

(118, 30)


Unnamed: 0,전체순번,수험번호 내 순번,수험번호,성명,영문명,논문 내 영문명,논문명,게재일자,DOI,저널구분,저자구분,저널명,출판사,ISSN,저자순위,게재여부,논문구분\n(SCIE),Publication Date,#citation,Publication Year journal impact factor,2020\njournal\nimpact\nfactor,2020 journal impact factor percentile,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue,Notes
2,3,3,0100-000001,고윤지,"Ko, Younji",Younji Ko,Carbonization/Interfacial Assembly-Driven Elec...,2022.05.10,Under Revision,국외SCIE,주저자,Energy & Environmental Science,Royal Society of Chemistry,-,(2/5),제출,,,,,,,,,,,,,,Scopus에서 doi 및 title 검색 불가: 확인 필요
101,102,5,0100-000036,최백범,"Choi, Baeck",Baeck Choi,Developing High-Performance Polymer Electrolyt...,2021.06.30,10.18770/KEPCO.2021.06.30.001,국내 일반학술지,주저자,KEPCO Journal on Electric Power and Energy 7(1...,한국전력공사,2466-0124,(1/7),게재,,,,,,,,,,,,,,Scopus에서 doi 및 title 검색 불가: 확인 필요
159,160,1,0100-000046,정성철,"Jeong, Seongcheol",Seongcheol Jeong,Robust Dynamic Output Feedback Event-Triggerin...,2022.05.09,10.1109/ACCESS.2022.3173648,국외SCIE,주저자,"IEEE Access, Volume: 10, Page(s) : 51261 - 512...",IEEE,2169-3536,(1/2),게재,,,,,,,,,,,,,,Scopus에서 doi 및 title 검색 불가: 확인 필요


### doi

In [303]:
# Positional range of df_2nd rows handled by the second pass (here: all of them).
start_2nd, end_2nd = 0, len(df_2nd)

In [304]:
# Display the field labels of `df`, the single-row Series left over from the
# last iteration of the Scopus loop above.
df.index

Index(['전체순번', '수험번호 내 순번', '수험번호', '성명', '영문명', '논문 내 영문명', '논문명', '게재일자',
       'DOI', '저널구분', '저자구분', '저널명', '출판사', 'ISSN', '저자순위', '게재여부',
       '논문구분\n(SCIE)', 'Publication Date', '#citation',
       'Publication Year journal impact factor',
       '2020\njournal\nimpact\nfactor',
       '2020 journal impact factor percentile', '1st Author',
       '1ST AUTHOR\n(Y/N)', 'Reprint Author', 'REPRINT AUTHOR\n(Y/N)',
       'Source\n(Journal)', 'volume', 'issue', 'Notes'],
      dtype='object')

In [305]:
# Semantic Scholar pass: for every row Scopus could not resolve, query the
# Semantic Scholar Graph API by DOI and fill the same columns from its answer.

# Columns written by this pass, cast to object so that writing text or numbers
# does not depend on pandas implicitly upcasting all-NaN float columns.
_SS_RESULT_COLUMNS = [
    "Notes", "논문구분\n(SCIE)", "Publication Date", "#citation",
    "Publication Year journal impact factor",
    "2020\njournal\nimpact\nfactor",
    "2020 journal impact factor percentile",
    "1st Author", "1ST AUTHOR\n(Y/N)",
    "Source\n(Journal)",
]
df_applicants[_SS_RESULT_COLUMNS] = df_applicants[_SS_RESULT_COLUMNS].astype(object)

for i in range(start_2nd, end_2nd):

    # verbose: iteration
    if i % 100 == 0 and i > 0:
        print(f"# working on {i}th article....")

    df = df_2nd.iloc[i]

    idx = df.name    # label of this row in df_applicants
    doi = df["DOI"]
    notes = ["Scopus에서 doi 및 title 검색 불가: SemanticScholar 활용 시도"]
    URL = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=title,venue,year,citationCount,authors,tldr"
    try:
        # A timeout keeps one stalled request from blocking the whole run.
        response = requests.get(URL, timeout=30)

        # `if response.status_code:` was always true; only parse a successful reply.
        if response.ok:
            ### read data
            ss_info = response.json()
            publicationName = ss_info["venue"]
            year = ss_info["year"]
            citation = ss_info["citationCount"]
            authors_raw = [a["name"] for a in ss_info["authors"]]
            authors_reg = [regularize(n) for n in authors_raw]

            ### SCIE (Y/N)
            # Row position of the journal in the reference-year JCR table.
            jcr_remark = dfs_JCR_SCIE[str(YEAR_REMARK)]
            index_remark = get_pub_index(publicationName, jcr_remark)

            # decision
            if len(index_remark) > 0:
                df_applicants.loc[idx, "논문구분\n(SCIE)"] = "Y"
            else:
                df_applicants.loc[idx, "논문구분\n(SCIE)"] = "N"
                notes.append("JCR 목록에 없음")

            ### Publication Date
            df_applicants.loc[idx, "Publication Date"] = year
            df_applicants.loc[idx, "#citation"] = citation

            ### Impact Factor @Publication year
            # Reset for every row so a value from an earlier row cannot leak in.
            jif_n = None
            if year < YEAR_START:
                notes.append(f"{YEAR_START-1} 이전 논문")

            elif year <= YEAR_END:
                jcr_year = dfs_JCR_SCIE[str(year)]
                index_pub = get_pub_index(publicationName, jcr_year)
                if len(index_pub) > 0:
                    # The lookup helper returns row positions, hence .iloc.
                    jif_n = str(jcr_year["IMPACT_FACTOR"].iloc[index_pub[0]])
                else:
                    # The journal is not in that year's JCR data.
                    notes.append("IF를 찾지 못함")
                    jif_n = "IF 미발급: JCR Data에 없음"

            else:
                jif_n = f"IF 미발급: {year}년 출판"

            if jif_n is not None:
                df_applicants.loc[idx, "Publication Year journal impact factor"] = jif_n

            ### Impact Factor and percentile @remark year
            if index_remark.size > 0:
                pos_remark = index_remark[0]
                df_applicants.loc[idx, "2020\njournal\nimpact\nfactor"] = str(jcr_remark["IMPACT_FACTOR"].iloc[pos_remark])
                df_applicants.loc[idx, "2020 journal impact factor percentile"] = str(jcr_remark["JIF_PERCENTILE"].iloc[pos_remark])
            else:
                df_applicants.loc[idx, "2020 journal impact factor percentile"] = "mrnIF 없음"

            ### Author data
            # Gate on this paper's own author list. The old test read
            # `info_scopus`, a leftover object from the Scopus loop.
            if authors_raw:

                # Spellings of the applicant's name to try.
                name = df["영문명"]
                name_paper = df["논문 내 영문명"]
                name_swap = ", ".join(name.split(" ")[1:] + name.split(" ")[:1]).rstrip(",")
                name_paper_swap = " ".join([name_paper.split(" ")[-1]] + name_paper.split(" ")[:-1])
                name_reg = regularize(name)
                name_paper_reg = regularize(name_paper)
                # Was built from the characters of `name` rather than from `name_swap`.
                name_swap_reg = regularize(name_swap)
                name_paper_swap_reg = regularize(name_paper_swap)

                # Name matching in priority order: raw spellings first, then regularized.
                candidates = [
                    (name_paper, authors_raw),
                    (name, authors_raw),
                    (name_swap, authors_raw),
                    (name_paper_swap, authors_raw),
                    (name_paper_reg, authors_reg),
                    (name_reg, authors_reg),
                    (name_swap_reg, authors_reg),
                    (name_paper_swap_reg, authors_reg),
                ]
                idx_author = None
                for needle, haystack in candidates:
                    hits = [needle in a for a in haystack]
                    if any(hits):
                        idx_author = hits.index(True)
                        break
                else:
                    # No spelling matched: check whether the author list uses initials.
                    if "," in name:    # "Family, Given"
                        familyname, _, givenname = name.partition(",")
                        familyname, givenname = familyname.strip().lower(), givenname.strip().lower()
                    else:              # "Given Family"
                        name_parts = name.split(" ")
                        familyname, givenname = name_parts[-1].lower(), "".join(name_parts[:-1]).lower()

                    # An author matches when its first or last word is the family
                    # name and every letter of its remaining words occurs in the
                    # given name. (The old test compared a single character with
                    # the family name.)
                    given_letters = set(regularize(givenname))
                    name_TF = []
                    for a in authors_raw:
                        tokens = regularize_space(a).split()
                        name_TF.append(
                            len(tokens) > 1 and (
                                (tokens[0] == familyname and set("".join(tokens[1:])) <= given_letters)
                                or (tokens[-1] == familyname and set("".join(tokens[:-1])) <= given_letters)
                            )
                        )
                    name_scopus_err_reg = regularize(" ".join([*givenname.split(" ")[1:], familyname, givenname.split(" ")[0]]))

                    if any(name_TF):
                        idx_author = name_TF.index(True)
                        notes.append("확인 필요: 논문 저자 이름이 약어로 표현됨. 확인 필요.")

                    # Case where "Lim, Suk Hyun" is stored as "Hyun Lim, Suk":
                    # 10.1016/j.fuel.2021.122481
                    elif name_scopus_err_reg in authors_reg:
                        idx_author = authors_reg.index(name_scopus_err_reg)
                        notes.append("확인 필요: ScopusDB에 이름이 한 글자씩 밀려서 잘못 입력된 듯함.")
                    else:
                        idx_author = None
                        notes.append("확인 필요: 저자 명단에서 지원자를 찾지 못함.")

                # first author
                first_author = authors_raw[0]
                first_author_yn = "Y" if idx_author == 0 else "N"

                df_applicants.loc[idx, "1st Author"] = first_author
                df_applicants.loc[idx, "1ST AUTHOR\n(Y/N)"] = first_author_yn

            ### Publication
            df_applicants.loc[idx, "Source\n(Journal)"] = publicationName.upper()

    except Exception as err:
        # Still best-effort (the row keeps whatever was filled and the loop
        # goes on), but the failure is reported instead of silently dropped.
        print(f"- index {idx}: Semantic Scholar lookup failed: {err!r}")

    # Save after every row so an interrupted run loses nothing.
    df_applicants.loc[idx, "Notes"] = "\n".join(notes)
    df_applicants.to_excel(f"applicants_fill_from{start}_to{end}_ss.xlsx", index=False)

# working on 100th article....
