In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
import calendar
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval

from tqdm import tqdm

# Show up to 30 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 30

# NOTE(review): `filename` is not referenced by any cell visible here —
# confirm it is still needed before relying on it.
filename = "./energy_factory.txt"

In [2]:
# Lookup table: language code (as read from `ab.language` in the retrieval
# loop) -> human-readable language name. Codes missing here are reported as
# "unknown" by the loop.
dic_language = dict(
    eng="English",
    kor="Korean",
)


In [3]:
# Load the article table that drives the retrieval loop below; the loop reads
# its "eid", "title" and "article_number" columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by a trusted source.
df_art = pd.read_pickle("./df_factory.pkl")
# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df_art.shape)
df_art.head()

(1017, 34)


Unnamed: 0,eid,doi,pii,pubmed_id,title,subtype,subtypeDescription,creator,afid,affilname,affiliation_city,affiliation_country,author_count,author_names,author_ids,...,issn,source_id,eIssn,aggregationType,volume,issueIdentifier,article_number,pageRange,description,authkeywords,citedby_count,openaccess,fund_acr,fund_no,fund_sponsor
0,2-s2.0-85111496548,10.1186/s12934-021-01639-7,,,Enforcing ATP hydrolysis enhanced anaerobic gl...,ar,Article,Dai Z.,60027363;60021016;60019499;60012932,University of Chinese Academy of Sciences;Inst...,Beijing;Beijing;Beijing;Clayton,China;China;China;Australia,6,"Dai, Zongjie;Zhu, Yan;Dong, Hongjun;Zhao, Chun...",36089670400;57189383541;35603311500;5695965610...,...,,18593,14752859.0,Journal,20,1.0,149,,Background: The intracellular ATP level is an ...,ABE fermentation | Acidogenesis | Anaerobic fe...,0,1,NSFC,31670048,National Natural Science Foundation of China
1,2-s2.0-85110191717,10.1016/j.jobe.2021.102955,S2352710221008135,,Seismic fragility analysis of steel intermedia...,ar,Article,Yu E.,60024872;60012291;60000872,Hanyang University;Incheon National University...,Seoul;Incheon;Chuncheon,South Korea;South Korea;South Korea,3,"Yu, Eunjong;Kim, Taewan;Park, Ji Hun",8639910700;57194881369;32867930500,...,,21100389518,23527102.0,Journal,44,,102955,,Steel intermediate moment frames (SIMFs) with ...,Beam splice slip | Collapse probability | Colu...,0,0,,undefined,
2,2-s2.0-85101125317,10.1038/s41598-020-80217-0,,33574347.0,Bond order redefinition needed to reduce inher...,ar,Article,Syuhada I.,60069382,Institut Teknologi Bandung,Bandung,Indonesia,6,"Syuhada, Ibnu;Hauwali, Nikodemus Umbu Janga;Ro...",57189732352;57211662136;57189732507;3616070250...,...,,21100200805,20452322.0,Journal,11,1.0,3674,,"In this work, we present the bond order redefi...",,0,1,,undefined,
3,2-s2.0-85110536409,10.1016/j.enbuild.2021.111192,S037877882100476X,,Validation of a building energy model of a hyd...,ar,Article,Liebman-Pelaez M.,60022195,Massachusetts Institute of Technology,Cambridge,United States,4,"Liebman-Pelaez, Mariana;Kongoletos, Johnathan;...",57226110657;57225274529;7004632109;7004974696,...,3787788.0,29359,,Journal,250,,111192,,Plant factories have developed within urban co...,Building energy model | Energy use | Energy va...,0,0,MIT,undefined,Massachusetts Institute of Technology
4,2-s2.0-85109095996,10.1016/j.energy.2021.121186,S0360544221014341,,Plate heat exchanger design for the utilisatio...,ar,Article,Arsenyeva O.,60108596;60104464;60020130;123624719,"Brno University of Technology, Faculty of Mech...",Brno;Kharkiv;Kharkiv;Kharkiv,Czech Republic;Ukraine;Ukraine;Ukraine,6,"Arsenyeva, Olga;Klemeš, Jiří Jaromír;Kapustenk...",22033364500;56903012000;57202785267;5722311165...,...,3605442.0,29348,,Journal,233,,121186,,The vapour condensation is typical for process...,Condensing in two-phase flow | Plate heat exch...,0,0,EC,CZ.02.1.01/0.0/0.0/15_003/0000456,European Commission


In [4]:
# Field tags used as the column names of `df_ab`, in output order
# (one row of five tags per line, mirroring the order used in the loop).
columns = (
    "eid PT AU AF TI "
    "SO SO_abb LA DT DE "
    "ID AB C1 RP EM "
    "CR NR TC Z9 SN "
    "J9 JI PD PY VL "
    "AR DI SC"
).split()

# Empty accumulator table; the retrieval loop adds one row per document.
df_ab = pd.DataFrame(columns=columns)

In [7]:
def _format_addresses(affiliations, authors):
    """Build the C1 (author address) entries for one document.

    Parameters
    ----------
    affiliations : sequence or None
        Affiliation records (``ab.affiliation``); the fields ``id``, ``name``,
        ``city`` and ``country`` are read.
    authors : sequence or None
        Author records (``ab.authors``); the fields ``auid``, ``indexed_name``
        and ``affiliation`` are read.

    Returns
    -------
    list
        One ``[auids, indexed_names, address]`` entry per affiliation whose id
        (as a string) equals the ``affiliation`` value of at least one author.
        ``address`` is ``"id, name, city, country"``. Empty list when either
        input is missing/empty or when nothing matches.
    """
    if not affiliations or not authors:
        return []

    df_aff = pd.DataFrame(affiliations)
    # Affiliation ids are compared as strings against the authors' values.
    df_aff["id"] = df_aff["id"].astype(str)

    # One row per affiliation value, every other author field collected in a list.
    df_authgroup = pd.DataFrame(authors).groupby("affiliation").agg(list).reset_index()

    df_aff = pd.merge(
        df_aff, df_authgroup, left_on="id", right_on="affiliation"
    ).drop(["affiliation"], axis=1)
    if df_aff.empty:
        # No author matched any affiliation id; a row-wise apply on an empty
        # frame does not yield a usable column, so stop here.
        return []

    df_aff["address"] = df_aff[["id", "name", "city", "country"]].apply(
        lambda row: ", ".join(row.astype(str)), axis=1
    )
    return df_aff[["auid", "indexed_name", "address"]].values.tolist()


def _format_references(references):
    """Format cited references as CR lines.

    Parameters
    ----------
    references : sequence or None
        Reference records (``ab.references``); the fields ``authors``,
        ``publicationyear``, ``sourcetitle``, ``volume``, ``first`` and ``doi``
        are read.

    Returns
    -------
    list of str
        One line per reference: authors (or ``"[Anonymous]"``), then year,
        source title, volume and first page when present, then ``DOI <doi>``
        when present. The first line is prefixed with ``"CR "``, the others
        with three spaces. Empty list when there are no references.
    """
    lines = []
    # Iterate over the references actually present instead of trusting the
    # separately reported reference count, which may not match the list length.
    for position, ref in enumerate(references or []):
        parts = [ref.authors if ref.authors is not None else "[Anonymous]"]
        parts.extend(
            str(item)
            for item in (ref.publicationyear, ref.sourcetitle, ref.volume, ref.first)
            if item is not None
        )
        entry = ", ".join(parts)
        if ref.doi is not None:
            entry += f", DOI {ref.doi}"

        prefix = "CR " if position == 0 else "   "
        lines.append(f"{prefix}{entry}")
    return lines


# Row labels of df_art to process; raise `starting_index` to resume a run.
starting_index = 0
data_index = list(range(starting_index, df_art.shape[0]))

# Rows are collected here and added to df_ab in one step (row-by-row
# DataFrame.append is quadratic and no longer exists in pandas >= 2.0).
records = []

try:
    for art in tqdm(data_index, desc="looping over df_art"):

        eid = df_art.loc[art, "eid"]
        try:
            ab = AbstractRetrieval(eid, view="FULL")
        except Exception:
            # Best effort: skip documents that cannot be retrieved, but let
            # KeyboardInterrupt / SystemExit stop the loop.
            print(f"# {art}: abstract retrieval error")
            continue

        # 1. PT: publication type (kept separate from DT so it is not overwritten)
        pub_type_ = ab.srctype

        # 2. AU: index name
        index_name_ = [a.indexed_name for a in ab.authors] if ab.authors else [None]

        # 3. AF: author name
        author_name_ = (
            [f"{a.surname}, {a.given_name}" for a in ab.authors] if ab.authors else [None]
        )

        # 4. TI: document title
        docu_title_ = df_art.loc[art, "title"]

        # 5. SO: publication name
        src_title_ = ab.publicationName

        # 5-1. publication abbr.
        src_abb_ = ab.sourcetitle_abbreviation

        # 6. LA: language (codes missing from dic_language become "unknown")
        try:
            language_ = dic_language[ab.language]
        except Exception:
            language_ = "unknown"

        # 7. DT: document type
        docu_type_ = ab.subtype

        # 8. DE: author keywords
        auth_kw_ = ab.authkeywords
        auth_kw_ = "; ".join(auth_kw_) if auth_kw_ else "None"

        # 9. ID: keyword plus (not populated)
        kw_plus_ = "None"

        # 10. AB: abstract
        abstract_ = ab.abstract

        # 11. C1: author address
        addresss_ = _format_addresses(ab.affiliation, ab.authors)

        # 12. RP: reprint address (not populated)
        rep_addr_ = "None"

        # 13. EM: e-mail address (not populated)
        em_addr_ = "None"

        # 14. CR: cited references
        refs_ = _format_references(ab.references)

        # 15. NR: cited reference count
        nr_ = ab.refcount

        # 16. TC: times cited count
        tc_ = ab.citedby_count

        # 17. Z9: total times cited count (same value as TC)
        cc_ = tc_

        # 23. SN: ISSN, e.g. 0959-6526
        sn_ = "None" if ab.issn is None else ab.issn

        # 26. JI: source abbreviation, e.g. J. Clean Prod.
        ji_ = src_abb_ if src_abb_ is not None else "None"

        # 25. J9: upper-cased source abbreviation, e.g. J CLEAN PROD
        # (the "None" placeholder is upper-cased as well, as before)
        j9_ = ji_.upper()

        # 27. PD: publication date, e.g. JUL 1
        date_parts = ab.coverDate.split("-")
        month = date_parts[1]
        date = date_parts[2]
        pd_ = f"{calendar.month_name[int(month)][:3].upper()} {int(date)}"

        # 28. PY: publication year, e.g. 2020
        py_ = date_parts[0]

        # 29. VL: volume, e.g. 260
        vl_ = ab.volume

        # 30. AR: article number, e.g. 121059
        # pd.isna catches both None and NaN, which `!= None` did not.
        ar_ = df_art.loc[art, "article_number"]
        if pd.isna(ar_):
            ar_ = "None"

        # 31. DI: digital object identifier, e.g. 10.1016/j.jclepro.2020.121059
        doi_ = ab.doi

        # 34. SC: research areas, "; "-joined
        # NOTE(review): the fallback is the list [None] while the normal value
        # is a string; left unchanged because downstream consumers are not
        # visible here — confirm which placeholder they expect.
        if ab.subject_areas:
            sc_ = "; ".join([subject.area for subject in ab.subject_areas])
        else:
            sc_ = [None]

        # Fields U1, U2, PU, PI, PA, EI, PG, WC, GA, UT and DA are not populated.

        # summation: values in the same order as `columns`
        data = [eid, pub_type_, index_name_, author_name_, docu_title_,
                src_title_, src_abb_, language_, docu_type_, auth_kw_,
                kw_plus_, abstract_, addresss_, rep_addr_, em_addr_,
                refs_, nr_, tc_, cc_, sn_,
                j9_, ji_, pd_, py_, vl_,
                ar_, doi_, sc_]

        records.append(dict(zip(columns, data)))
finally:
    # Runs even when the loop is interrupted, so finished rows are kept.
    # Existing rows of df_ab are preserved, which keeps resumed runs additive.
    if records:
        # dtype=object keeps cell values (lists, ints, None) exactly as collected.
        new_rows = pd.DataFrame(records, columns=columns, dtype=object)
        df_ab = new_rows if df_ab.empty else pd.concat([df_ab, new_rows], ignore_index=True)

looping over df_art: 100%|██████████| 1017/1017 [09:22<00:00,  1.81it/s]


In [8]:
# Persist the assembled table, then display its (rows, columns) shape as the
# cell output.
df_ab.to_pickle("./df_ab.pkl")
df_ab.shape

(1017, 28)