* code by Sihyun You (2021.12.28.)
* edit by Jehyun Lee (2021.12.30.)

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pybliometrics.scopus import ScopusSearch
from bs4 import BeautifulSoup
import requests, json
from docx import Document
from docx.shared import Inches

In [2]:
from my_apikeys import APIKeys

In [3]:
# Probe Scopus once with a known DOI to decide whether searches can be run
# with subscriber=True. The resulting flag is reused by later ScopusSearch calls.
try:
    # Indexing [0] also fails when the search yields no results, which is
    # treated the same as "no subscriber access".
    s_sample = ScopusSearch("DOI (10.1038/s41598-021-83315-9)").results[0]
    subscriber = True
except Exception:
    # Exception rather than a bare except, so that Ctrl-C / kernel interrupts
    # are not silently turned into subscriber=False.
    subscriber = False

print(f"subscriber={subscriber}")

subscriber=True


In [4]:
def regularize_date_publication(_str):
    """Normalize a display date string into ``(year, label)``.

    Tokens are separated by whitespace. Three layouts are understood:

    * ``"1 December 2021"`` -> ``(2021, "1 DEC 2021")``  (day month year)
    * ``"December 2021"``   -> ``(2021, "DEC 2021")``    (month year)
    * ``"2021"``            -> ``(2021, "2021")``        (year only)

    The month is cut to its first three characters and upper-cased.

    Parameters
    ----------
    _str : str
        Date text such as the ``coverDisplayDate`` of a Scopus search result.

    Returns
    -------
    tuple[int, str]
        The publication year and the normalized date label.

    Raises
    ------
    IndexError
        If the string is empty or has fewer tokens than its layout requires.
    ValueError
        If the token in the year position is not an integer.
    """
    # split() without an argument tolerates repeated or leading/trailing spaces,
    # which split(' ') would turn into empty tokens.
    token_date = _str.split()

    if len(token_date) == 1:
        # Year-only date: there is no month token to abbreviate.
        year = token_date[0]
        s = year
    elif re.match('[0-9]', token_date[0]):
        # Leading digit means the first token is the day of month.
        day, month, year = token_date[0], token_date[1][:3].upper(), token_date[2]
        s = ' '.join([day, month, year])
    else:
        month, year = token_date[0][:3].upper(), token_date[1]
        s = ' '.join([month, year])

    return (int(year), s)

def get_pub_index(_pub_name, _df):
    """Find the rows of ``_df`` whose ``TITLE`` matches ``_pub_name``.

    Both sides are passed through ``regularize`` before comparison, so case,
    spacing and punctuation differences do not prevent a match.

    Parameters
    ----------
    _pub_name : str
        Publication (journal) name to look up.
    _df : pandas.DataFrame
        Table with a ``TITLE`` column of strings.

    Returns
    -------
    numpy.ndarray
        Zero-based row positions of every matching title. The array is empty
        when nothing matches. It is always an array (never a scalar sentinel)
        so that callers can safely use ``len(result)`` and ``result[0]``;
        the previous ``-1`` return made ``len(...)`` raise ``TypeError``.
    """
    wanted = regularize(_pub_name)
    list_title = _df['TITLE'].map(regularize)
    return np.where(wanted == list_title)[0]

def regularize(_str):
    """Reduce a string to a lowercase ASCII-alphanumeric comparison key.

    Every ``&`` is first spelled out as ``and``; afterwards all characters
    other than ``A-Z``, ``a-z`` and ``0-9`` are removed and the remainder is
    lower-cased.
    """
    ampersand_spelled_out = re.sub('&', 'and', _str)
    alnum_only = re.sub('[^A-Za-z0-9]+', '', ampersand_spelled_out)
    return alnum_only.lower()

In [5]:
# pandas options for the whole notebook: no chained-assignment warning,
# and no limit on the number of columns shown when a frame is displayed.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None

# Year (as a string) -> JCR SCIE sheet for that year; filled by the loading loop.
dfs_JCR_SCIE = {}

# Sheets are loaded for YEAR_START up to (not including) YEAR_THIS;
# YEAR_REMARK selects the sheet used as the reference year.
YEAR_START = 2016
YEAR_REMARK = 2019
YEAR_THIS = 2021

In [6]:
# Read one JCR SCIE workbook per year in [YEAR_START, YEAR_THIS) and store it
# in dfs_JCR_SCIE under the year written as a string.
for i in range(YEAR_START, YEAR_THIS):
    print(f"{i}년도 시트를 로딩중입니다.")
    dfs_JCR_SCIE[str(i)] = pd.read_excel(f"./data/JCR_SCIE_{i}.xlsx")

2016년도 시트를 로딩중입니다.
2017년도 시트를 로딩중입니다.
2018년도 시트를 로딩중입니다.
2019년도 시트를 로딩중입니다.
2020년도 시트를 로딩중입니다.


In [7]:
# Sanity check: display the year keys currently present in dfs_JCR_SCIE.
dfs_JCR_SCIE.keys()

dict_keys(['2016', '2017', '2018', '2019', '2020'])

In [8]:
# Applicant sheet; header=1 takes the column names from the second row.
df_applicants = pd.read_excel("applicants.xlsx", header=1)

# Add three author-metric columns, initially all NaN; the main loop fills them.
n_applicant_rows = len(df_applicants)
for metric_column in ("h-index", "cited-by-count", "document-count"):
    df_applicants[metric_column] = [np.nan] * n_applicant_rows

df_applicants.head(3)

Unnamed: 0,순번,지원번호 내 순번,지원번호\nApplication No.,이름(국문),이름 (영문)\nName,논문명 (Title),게재일자\nPublication Date,학술지구분,저자구분,DOIs (Final),SCIE (Y/N),Publication Year,Publication Date,#citation,Publication Year journal impact factor,2019\njournal\nimpact\nfactor,2019 journal impact factor percentile,CNCI,1st Author,1ST AUTHOR\n(Y/N),Reprint Author,REPRINT AUTHOR\n(Y/N),Source\n(Journal),volume,issue,h-index,cited-by-count,document-count
0,1,1,0026-000001,박성일,"Park, Seong-Il",Estimating blue carbon accumulated in a haloph...,2021.05.21,국외SCIE,주저자,10.1007/s11852-021-00828-1,Y,2021.0,JUN 2021,0.0,,1.374,43.457944,0.0,"Park, Seong-Il",Y,"Um, Jung-Sup",N,JOURNAL OF COASTAL CONSERVATION,25.0,3.0,,,
1,2,3,0026-000003,김기덕,"Kim, Ki-Duk",Beneficial Roles of Carbon Black Additives in ...,2020.10.25,국외SCIE,주저자,10.1016/j.apcata.2020.117837,,,,,,,,,,,,,,,,,,
2,3,7,0026-000009,김효원,"Kim, HyoWon",Stabilizing role of Mo in TiO2-MoOx supported ...,2020.08.14,국외SCIE,기타,10.1016/j.apcatb.2020.119433,,,,,,,,,,,,,,,,,,


In [9]:
# (disabled) df_applicants = df_applicants.drop(["CNCI"], axis = 1)

# Pull the Korean name, English name and DOI columns out as plain arrays so the
# main loop can index them by row position.
list_name_kr, list_name_en, list_doi = (
    df_applicants[column_label].values
    for column_label in ("이름(국문)", "이름 (영문)\nName", "DOIs (Final)")
)

In [10]:
def get_scopus_info(SCOPUS_ID):
    """Fetch one Scopus abstract record and format it as a citation line.

    Parameters
    ----------
    SCOPUS_ID : str
        Identifier appended verbatim to the
        ``/content/abstract/scopus_id/`` endpoint path.

    Returns
    -------
    tuple[str, str]
        ``(pubtype, citation)`` where ``pubtype`` is the record's
        ``prism:aggregationType`` and ``citation`` is a single line of the form
        ``"authors, title, journal, volume, pages-or-article-number, (date).
        doi:... (cited N times).\\n"``.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not answer within the timeout.
    KeyError
        If the record lacks one of the mandatory fields read below
        (for example ``prism:volume`` or ``prism:doi``).
    """
    # HTTPS so the API key in the request header is not sent in clear text.
    url = ("https://api.elsevier.com/content/abstract/scopus_id/"
           + SCOPUS_ID
           + "?field=authors,title,publicationName,volume,issueIdentifier,"
           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType")
    resp = requests.get(url,
                        headers={'Accept': 'application/json',
                                 'X-ELS-APIKey': APIKeys[-3]},
                        timeout=30)  # requests waits forever without a timeout
    # Fail with the HTTP status instead of a confusing KeyError on an error body.
    resp.raise_for_status()

    record = resp.json()['abstracts-retrieval-response']
    coredata = record['coredata']

    authors = ', '.join([au['ce:indexed-name'] for au in record['authors']['author']])
    title = coredata['dc:title']
    pubtype = coredata['prism:aggregationType']
    journal = coredata['prism:publicationName']
    volume = coredata['prism:volume']
    # Prefer the page range; fall back to the article number when it is absent.
    articlenum = coredata.get('prism:pageRange') or coredata.get('article-number')
    date = coredata['prism:coverDate']
    doi = 'doi:' + coredata['prism:doi']
    cites = int(coredata['citedby-count'])

    return pubtype, f'{authors}, {title}, {journal}, {volume}, {articlenum}, ({date}). {doi} (cited {cites} times).\n'

In [11]:
# Main pass over the applicant sheet. For every DOI it
#   1. looks the paper up in Scopus,
#   2. checks the journal against the YEAR_REMARK JCR SCIE sheet,
#   3. fills publication / impact-factor / author columns of df_applicants,
#   4. writes the applicant's journal publication list to ./pubs/*.docx,
#   5. tries to download the full text to ./pdf (or a landing page to ./xml).
# Finally the completed sheet is saved as applicants_fill.xlsx.

xmls = []  # BeautifulSoup documents collected while saving files under ./xml

# Several columns receive strings although they may have been read (or created)
# as float columns. Make them object dtype up front so the per-cell writes below
# do not depend on pandas silently upcasting the column.
for _col in ("SCIE (Y/N)", "Publication Date", "#citation",
             "Publication Year journal impact factor",
             "2019\njournal\nimpact\nfactor",
             "2019 journal impact factor percentile",
             "1st Author", "1ST AUTHOR\n(Y/N)",
             "h-index", "cited-by-count", "document-count",
             "Source\n(Journal)", "volume", "issue"):
    df_applicants[_col] = df_applicants[_col].astype(object)

for i, doi in enumerate(list_doi):
    s = ScopusSearch(f"DOI ({doi})", download=True, subscriber=subscriber).results
    if s is None:
        print(f"- No.{i} is invalid.")
        continue

    info_scopus = s[0]

    ### SCIE check against the reference-year JCR sheet
    df_jcr_remark = dfs_JCR_SCIE[str(YEAR_REMARK)]
    index_remark = get_pub_index(info_scopus.publicationName, df_jcr_remark)

    # get_pub_index may report "no match" as the scalar -1 instead of an empty
    # array; np.ndim distinguishes the two so len() is never applied to an int.
    if np.ndim(index_remark) == 0 or len(index_remark) == 0:
        # NOTE(review): the row is skipped without writing 'N' to "SCIE (Y/N)",
        # as before - confirm whether 'N' should be recorded here.
        print("SCIE 논문이 아닙니다.")
        continue

    # .loc writes into df_applicants itself; chained df[col][i] = ... may write
    # to a temporary copy and is a no-op under pandas Copy-on-Write.
    df_applicants.loc[i, "SCIE (Y/N)"] = 'Y'

    year, date = regularize_date_publication(info_scopus.coverDisplayDate)
    df_applicants.loc[i, "Publication Year"] = year
    df_applicants.loc[i, "Publication Date"] = date
    df_applicants.loc[i, "#citation"] = str(info_scopus.citedby_count)

    # Impact factor of the publication year: left empty when no JCR sheet is
    # loaded for that year (too recent or older than YEAR_START) or when the
    # journal is not listed in that year's sheet.
    jif_n = ""
    df_jcr_year = dfs_JCR_SCIE.get(str(year))
    if df_jcr_year is not None:
        index_n = get_pub_index(info_scopus.publicationName, df_jcr_year)
        if np.ndim(index_n) > 0 and len(index_n) > 0:
            # get_pub_index yields row positions, hence .iloc.
            jif_n = str(df_jcr_year["IMPACT_FACTOR"].iloc[index_n[0]])

    df_applicants.loc[i, "Publication Year journal impact factor"] = jif_n
    df_applicants.loc[i, "2019\njournal\nimpact\nfactor"] = str(df_jcr_remark["IMPACT_FACTOR"].iloc[index_remark[0]])
    # A journal can match several rows; the highest percentile is kept.
    df_applicants.loc[i, "2019 journal impact factor percentile"] = str(df_jcr_remark["JIF_PERCENTILE"].iloc[index_remark].max())

    ### Author data
    authors_raw = np.array([regularize(n) for n in info_scopus.author_names.split(";")])
    applicant_key = regularize(list_name_en[i])

    # first author
    # NOTE(review): when the applicant is not the first author, the normalized
    # key (e.g. lowercase, no punctuation) is stored rather than the display
    # name - unchanged from the original behavior.
    if authors_raw[0] == applicant_key:
        first_author = list_name_en[i]
        first_author_yn = "Y"
    else:
        first_author = authors_raw[0]
        first_author_yn = "N"

    df_applicants.loc[i, "1st Author"] = first_author
    df_applicants.loc[i, "1ST AUTHOR\n(Y/N)"] = first_author_yn

    # h-index, citedby-count, document-count
    idx_matches = np.where(applicant_key == authors_raw)[0]
    if len(idx_matches) == 0:
        # Without a matching author there is no Scopus author id; skip the
        # author metrics and publication list instead of aborting the whole run.
        print(f"- No.{i}: applicant name not found among the Scopus authors; author metrics skipped.")
    else:
        idx_author = idx_matches[0]
        author_id = info_scopus.author_ids.split(";")[idx_author]

        author_r = requests.get(f"https://api.elsevier.com/content/author?author_id={author_id}&view=metrics",
                                headers={'Accept': 'application/json', 'X-ELS-APIKey': APIKeys[-2]})
        # Use the decoded JSON directly: eval() on a re-serialized response
        # executes remote data and fails on JSON null/true/false.
        author_data = author_r.json()
        author_entry = author_data['author-retrieval-response'][0]
        h_index = author_entry["h-index"]
        citedby_count_total = author_entry["coredata"]['cited-by-count']
        document_count_total = author_entry["coredata"]['document-count']

        df_applicants.loc[i, "h-index"] = h_index
        df_applicants.loc[i, 'cited-by-count'] = citedby_count_total
        df_applicants.loc[i, 'document-count'] = document_count_total

        # Author's all publications (the query asks for at most 100 entries)
        author_pubs_ = requests.get(f"https://api.elsevier.com/content/search/scopus?query=AU-ID({author_id})&field=dc:identifier&count=100",
                                    headers={'Accept': 'application/json',
                                             'X-ELS-APIKey': APIKeys[-3]})

        author_pubs = author_pubs_.json()
        # Entries without an identifier (e.g. an error placeholder) are skipped.
        scopus_ids = [p['dc:identifier'] for p in author_pubs['search-results']["entry"]
                      if 'dc:identifier' in p]

        pubs = []
        for sid in scopus_ids:
            try:
                pub_type, pub_info = get_scopus_info(sid)
            except Exception as err:
                # Best effort: one failing record must not stop the list,
                # but the failure is reported instead of silently dropped.
                print(f"- No.{i}: {sid} could not be retrieved ({err!r}); skipped.")
                continue
            if pub_type == 'Journal':
                pubs.append(pub_info)

        # export to .docx file
        document = Document()
        document.add_heading(f"{i+1}. {list_name_kr[i]}: {list_name_en[i]} ({len(pubs)} publications)\n")

        for pub in pubs:
            document.add_paragraph(pub, style='List Number')
        document.save(f'./pubs/{i+1}_{list_name_en[i]}.docx')

    ### Publication
    df_applicants.loc[i, "Source\n(Journal)"] = info_scopus.publicationName.upper()
    df_applicants.loc[i, "volume"] = info_scopus.volume

    if info_scopus.issueIdentifier is not None:
        issue = info_scopus.issueIdentifier
    else:
        issue = ''
    df_applicants.loc[i, "issue"] = issue

    # PDF download
    accept = "application/pdf"  # switch to "text/xml" to store XML instead
    HEADERS = {
        'X-ELS-APIKEY': APIKeys[-1],
        'Accept': accept
    }
    url = f'https://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    with requests.get(url, stream=True, headers=HEADERS) as r:
        if r.status_code == 200:  # download supported in Scopus
            if accept == "application/pdf":
                # Open the file once and append every chunk. Re-opening with
                # "wb" per chunk truncated the file, keeping only the last chunk.
                with open(f"./pdf/paper_{i}.pdf", "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        f.write(chunk)
            elif accept == "text/xml":
                xml = BeautifulSoup(r.content, "html5lib")
                xmls.append(xml)
                # Explicit UTF-8: the platform default encoding may not be able
                # to represent every character of the document.
                with open(f"./xml/xml_{i}.xml", "w", encoding="utf-8") as f:
                    f.write(xml.prettify())

        elif r.status_code == 404:  # not supported in Scopus
            print(f"# No.{i} is not available in Scopus. searching directly.")
            url = f"https://doi.org/{doi}"
            with requests.get(url) as r_doi:
                xml = BeautifulSoup(r_doi.content, "html5lib")
            xmls.append(xml)
            with open(f"./xml/xml_{i}.xml", "w", encoding="utf-8") as f:
                f.write(xml.prettify())

        else:
            # Any other status (e.g. authentication or quota errors) was
            # previously ignored without a trace.
            print(f"# No.{i}: full-text request returned HTTP {r.status_code}; nothing saved.")

print(len(xmls))
df_applicants.to_excel("applicants_fill.xlsx", index=False)

# No.0 is not available in Scopus. searching directly.
# No.5 is not available in Scopus. searching directly.
# No.6 is not available in Scopus. searching directly.
# No.8 is not available in Scopus. searching directly.
# No.9 is not available in Scopus. searching directly.
5


In [12]:
# Inspect the Scopus IDs left over from the main loop: this is the publication
# list of the last iteration that reached the author-publication step.
scopus_ids

['SCOPUS_ID:85119584239',
 'SCOPUS_ID:85119823596',
 'SCOPUS_ID:85109217192',
 'SCOPUS_ID:85114871292',
 'SCOPUS_ID:85112323380',
 'SCOPUS_ID:85102940814',
 'SCOPUS_ID:85103781003',
 'SCOPUS_ID:85092139890',
 'SCOPUS_ID:85089195388',
 'SCOPUS_ID:85085263812',
 'SCOPUS_ID:85092185864',
 'SCOPUS_ID:85094908394',
 'SCOPUS_ID:85077599073',
 'SCOPUS_ID:85061842047',
 'SCOPUS_ID:85075267093',
 'SCOPUS_ID:85053061906',
 'SCOPUS_ID:85055054103',
 'SCOPUS_ID:85035014016',
 'SCOPUS_ID:85035083616',
 'SCOPUS_ID:85034636133',
 'SCOPUS_ID:84976877727',
 'SCOPUS_ID:84992212803',
 'SCOPUS_ID:84979235620',
 'SCOPUS_ID:84940378462',
 'SCOPUS_ID:84942880326',
 'SCOPUS_ID:84941942504',
 'SCOPUS_ID:84938418313',
 'SCOPUS_ID:84935009389',
 'SCOPUS_ID:84921800858',
 'SCOPUS_ID:84948760500',
 'SCOPUS_ID:84906063023',
 'SCOPUS_ID:84903592019',
 'SCOPUS_ID:68249124833',
 'SCOPUS_ID:38049113589']