* code by Sihyun You (2021.12.28.)
* edit by Jehyun Lee (2021.12.30.)

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pybliometrics.scopus import ScopusSearch

In [17]:
def regularize_date_publication(_str):
    """Normalize a publication-date string such as Scopus' ``coverDisplayDate``.

    Accepted shapes (tokens separated by any amount of whitespace):

    * ``"<day> <month> <year>"``, e.g. ``"1 June 2021"``  -> ``"1 JUN 2021"``
    * ``"<month> <year>"``,       e.g. ``"June 2021"``    -> ``"JUN 2021"``
    * ``"<year>"``,               e.g. ``"2021"``         -> ``"2021"``

    The month token is cut to its first three characters and upper-cased.

    Parameters
    ----------
    _str : str
        Raw date string.

    Returns
    -------
    tuple[int, str]
        ``(year, normalized_date)``.

    Raises
    ------
    ValueError
        If the string is empty, starts with a digit but has exactly two
        tokens, or the year token is not an integer.
    """
    # split() without an argument also swallows leading/trailing/repeated
    # blanks, which split(' ') would turn into empty tokens.
    token_date = _str.split()
    if not token_date:
        raise ValueError("empty publication date")

    if len(token_date) == 1:
        # Year-only date: nothing to abbreviate.
        parts = [token_date[0]]
    elif re.match('[0-9]', token_date[0]):
        # Leading digit means the first token is the day of month.
        if len(token_date) < 3:
            raise ValueError(f"unrecognized publication date: {_str!r}")
        parts = [token_date[0], token_date[1][:3].upper(), token_date[2]]
    else:
        parts = [token_date[0][:3].upper(), token_date[1]]

    # The year is always the last normalized part.
    return (int(parts[-1]), ' '.join(parts))

def get_title_index(_pub_name, _df):
    """Find the first row of ``_df`` whose ``TITLE`` matches ``_pub_name``.

    Both sides are compared after ``regularize`` so that case, punctuation,
    spacing and ``&`` vs ``and`` differences do not matter.

    Parameters
    ----------
    _pub_name : str
        Journal name to look for.
    _df : pandas.DataFrame
        Table with a ``TITLE`` column.

    Returns
    -------
    int
        Zero-based *position* of the first matching row (0 is a valid hit),
        or ``-1`` when nothing matches or ``_pub_name`` is not a string.
    """
    if not isinstance(_pub_name, str):
        return -1

    # Normalize the needle once instead of once per row.
    target = regularize(_pub_name)
    for i, title in enumerate(_df['TITLE'].values):
        # Blank spreadsheet cells are not strings (e.g. NaN); skip them
        # rather than letting the regex call fail.
        if isinstance(title, str) and regularize(title) == target:
            return i
    return -1

def regularize(_str):
    """Reduce a string to a lowercase key made of ASCII letters and digits.

    Every ``&`` is first spelled out as ``and``; afterwards every character
    outside ``A-Z``, ``a-z`` and ``0-9`` is removed and the result is
    lower-cased.
    """
    ampersand_expanded = re.sub('&', 'and', _str)
    alnum_only = re.sub('[^A-Za-z0-9]+', '', ampersand_expanded)
    return alnum_only.lower()

In [3]:
# --- pandas display / warning options ---------------------------------
# Turn off the chained-assignment warning and show every column.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", None)

# --- JCR year range -----------------------------------------------------
# One JCR SCIE sheet is loaded per year in [YEAR_START, YEAR_END];
# YEAR_REMARK is the reference year whose sheet is looked up for every paper.
YEAR_START = 2016
YEAR_REMARK = 2019
YEAR_END = 2020

# Filled by the loading cell: one DataFrame per year, ordered by year.
dfs_JCR_SCIE = []

In [4]:
# Load one JCR SCIE workbook per year, ordered from YEAR_START to YEAR_END.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every sheet.
dfs_JCR_SCIE = []
for jcr_year in range(YEAR_START, YEAR_END + 1):
    print(f"{jcr_year}년도 시트를 로딩중입니다.")
    dfs_JCR_SCIE.append(pd.read_excel(f"./data/JCR_SCIE_{jcr_year}.xlsx"))

2016년도 시트를 로딩중입니다.
2017년도 시트를 로딩중입니다.
2018년도 시트를 로딩중입니다.
2019년도 시트를 로딩중입니다.
2020년도 시트를 로딩중입니다.


In [5]:
# Read the applicant workbook; header=1 takes the column names from the
# second spreadsheet row (row numbering is zero-based).
df_applicants = pd.read_excel(io="applicants.xlsx", header=1)

# Preview the first three rows.
df_applicants.head(n=3)

Unnamed: 0,순번,지원번호 내 순번,지원번호\nApplication No.,이름(국문),이름 (영문)\nName,논문명 (Title),게재일자\nPublication Date,학술지구분,저자구분,DOIs (Final),SCIE (Y/N),Publication Year,Publication Date,#citation,Publication Year journal impact factor,2019\njournal\nimpact\nfactor,2019 journal impact factor percentile,CNCI,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue
0,1,1,0026-000001,박성일,"Park, Seong-Il",Estimating blue carbon accumulated in a haloph...,2021.05.21,국외SCIE,주저자,10.1007/s11852-021-00828-1,Y,2021.0,JUN 2021,0.0,,1.374,43.457944,0.0,"Park, Seong-Il",Y,"Um, Jung-Sup",N,JOURNAL OF COASTAL CONSERVATION,25.0,3.0
1,2,3,0026-000003,김기덕,"Kim, Ki-Duk",Beneficial Roles of Carbon Black Additives in ...,2020.10.25,국외SCIE,주저자,10.1016/j.apcata.2020.117837,,,,,,,,,,,,,,,
2,3,7,0026-000009,김효원,"Kim, HyoWon",Stabilizing role of Mo in TiO2-MoOx supported ...,2020.08.14,국외SCIE,기타,10.1016/j.apcatb.2020.119433,,,,,,,,,,,,,,,


In [6]:
# Drop the CNCI column. errors="ignore" keeps the cell re-runnable: on a
# second run the column is already gone and a plain drop would raise KeyError.
df_applicants = df_applicants.drop(columns=["CNCI"], errors="ignore")

# Plain arrays of the applicants' English names and DOIs, in row order.
list_name_en = df_applicants["이름 (영문)\nName"].values
list_doi = df_applicants["DOIs (Final)"].values

In [25]:
# Fill the bibliometric columns of df_applicants with one Scopus lookup per DOI.

# Position of the reference-year (YEAR_REMARK) sheet inside dfs_JCR_SCIE.
k2 = YEAR_REMARK - YEAR_START
df_jcr_remark = dfs_JCR_SCIE[k2]

# These columns receive string values below. Cast them to object once so a
# column that pandas read as numeric (e.g. all-NaN) accepts the strings.
cols_str = [
    "SCIE (Y/N)",
    "Publication Date",
    "#citation",
    "Publication Year journal impact factor",
    "2019\njournal\nimpact\nfactor",
    "2019 journal impact factor percentile",
]
df_applicants[cols_str] = df_applicants[cols_str].astype(object)

for i, doi in enumerate(list_doi):
    if pd.isna(doi):
        # No DOI in the sheet: nothing to query.
        print(f"#check : row {i} has no DOI")
        continue

    results = ScopusSearch(f"DOI ({doi})", download=True, subscriber=False).results
    if not results:
        # NOTE(review): assumes .results is None/empty when Scopus has no hit;
        # confirm against the pybliometrics documentation.
        print(f"#check : {doi}")
        continue
    info_scopus = results[0]

    index_remark = get_title_index(info_scopus.publicationName, df_jcr_remark)

    # get_title_index returns -1 for "not found"; 0 is a valid hit (first
    # row), so the test must be "< 0", not "<= 0".
    if index_remark < 0:
        # Record the negative result before moving on to the next DOI.
        df_applicants.loc[i, "SCIE (Y/N)"] = 'N'
        print("SCIE 논문이 아닙니다.")
        continue
    df_applicants.loc[i, "SCIE (Y/N)"] = 'Y'

    year, date = regularize_date_publication(info_scopus.coverDisplayDate)
    df_applicants.loc[i, "Publication Year"] = year
    df_applicants.loc[i, "Publication Date"] = date
    df_applicants.loc[i, "#citation"] = str(info_scopus.citedby_count)

    # Impact factor of the publication year: only available when that year has
    # a loaded sheet AND the journal is listed in it. The lower bound matters:
    # a year before YEAR_START would otherwise give a negative list index and
    # silently pick a sheet from the end of the list.
    jif_n = ""
    if YEAR_START <= year <= YEAR_END:
        df_jcr_year = dfs_JCR_SCIE[year - YEAR_START]
        index_n = get_title_index(info_scopus.publicationName, df_jcr_year)
        if index_n >= 0:
            # get_title_index returns a position, hence .iloc.
            jif_n = str(df_jcr_year["IMPACT_FACTOR"].iloc[index_n])

    df_applicants.loc[i, "Publication Year journal impact factor"] = jif_n
    df_applicants.loc[i, "2019\njournal\nimpact\nfactor"] = str(
        df_jcr_remark["IMPACT_FACTOR"].iloc[index_remark]
    )
    df_applicants.loc[i, "2019 journal impact factor percentile"] = str(
        df_jcr_remark["JIF_PERCENTILE"].iloc[index_remark]
    )

    # First-author check.
    # NOTE(review): first_author / first_author_yn are computed but not written
    # back to df_applicants anywhere in the visible cells - confirm which
    # columns they belong to. In the recorded run author_names was None for
    # every DOI, so this branch never produced a value.
    author_names = info_scopus.author_names
    if author_names is None:
        print(f"#check : {info_scopus.doi}")
    else:
        author_cru = author_names.split(";")[0]
        print(author_names)
        print(author_cru)
        if regularize(author_cru) == regularize(list_name_en[i]):
            first_author = list_name_en[i]
            first_author_yn = "Y"
        else:
            first_author = author_cru
            first_author_yn = "N"

#check : 10.1007/s11852-021-00828-1
#check : 10.1016/j.apcata.2020.117837
#check : 10.1016/j.apcatb.2020.119433
#check : 10.1016/j.renene.2020.07.002
#check : 10.1016/j.coco.2020.100499
#check : 10.3390/en11020447
#check : 10.3390/en13174479
#check : 10.1016/j.cej.2021.130445
#check : 10.1088/0964-1726/25/4/045021
#check : 10.1021/acsami.7b11938


In [21]:
# Show the last Scopus record left over from the lookup loop.
info_scopus

Document(eid='2-s2.0-85106932269', doi='10.1007/s11852-021-00828-1', pii=None, pubmed_id=None, title='Estimating blue carbon accumulated in a halophyte community using UAV imagery: a case study of the southern coastal wetlands in South Korea', subtype='ar', subtypeDescription='Article', creator='Park S.I.', afid=None, affilname='Kyungpook National University', affiliation_city='Daegu', affiliation_country='South Korea', author_count=None, author_names=None, author_ids=None, author_afids=None, coverDate='2021-06-01', coverDisplayDate='June 2021', publicationName='Journal of Coastal Conservation', issn='14000350', source_id='27372', eIssn='18747841', aggregationType='Journal', volume='25', issueIdentifier='3', article_number='38', pageRange=None, description=None, authkeywords=None, citedby_count='3', openaccess='0', fund_acr=None, fund_no=None, fund_sponsor=None)

In [26]:
# Number of DOIs taken from the applicant sheet.
len(list_doi)

10

In [15]:
# Publication year stored for the row the lookup loop handled last
# (i is the loop's leftover row label).
df_applicants.loc[i, "Publication Year"]

2017.0

In [None]:
# NOTE(review): this looks like a pasted repr of a Scopus search result, kept
# for reference in a cell that was never executed. `Document` is not imported
# in the visible cells, so running this cell would presumably raise
# NameError - consider deleting it or moving it to a markdown cell.
Document(eid='2-s2.0-85106932269', doi='10.1007/s11852-021-00828-1', pii=None, pubmed_id=None, title='Estimating blue carbon accumulated in a halophyte community using UAV imagery: a case study of the southern coastal wetlands in South Korea', subtype='ar', subtypeDescription='Article', creator='Park S.I.', afid=None, affilname='Kyungpook National University', affiliation_city='Daegu', affiliation_country='South Korea', author_count=None, author_names=None, author_ids=None, author_afids=None, coverDate='2021-06-01', coverDisplayDate='June 2021', publicationName='Journal of Coastal Conservation', issn='14000350', source_id='27372', eIssn='18747841', aggregationType='Journal', volume='25', issueIdentifier='3', article_number='38', pageRange=None, description=None, authkeywords=None, citedby_count='3', openaccess='0', fund_acr=None, fund_no=None, fund_sponsor=None)

In [23]:
# Did Scopus return author names for the last record? Identity comparison is
# the reliable way to test for None (`!=` can be overridden by the operand).
info_scopus.author_names is not None

False