In [8]:
from copy import deepcopy
import numpy as np
import pandas as pd
import calendar
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval

from tqdm import tqdm

# Show at most 30 columns when pandas displays a DataFrame.
pd.set_option("display.max_columns", 30)

# Paths of the two text files written by the export cells:
# one for all records, one for the rooftop subset.
filename = "./210803/BIPV_ML.txt"
filename_rooftop = "./210803/rooftop_ML.txt"

In [9]:
# Load the pickled df_bipv table and preview its first three rows.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_bipv = pd.read_pickle("./210803/df_bipv.pkl")
df_bipv.head(3)

Unnamed: 0,eid,doi,pii,pubmed_id,title,subtype,subtypeDescription,creator,afid,affilname,affiliation_city,affiliation_country,author_count,author_names,author_ids,...,issn,source_id,eIssn,aggregationType,volume,issueIdentifier,article_number,pageRange,description,authkeywords,citedby_count,openaccess,fund_acr,fund_no,fund_sponsor
0,2-s2.0-85110764627,10.1016/j.renene.2021.07.075,S0960148121010806,,Markov chains estimation of the optimal period...,ar,Article,Sánchez-Barroso G.,60017838;126596986,Universidad de Extremadura;InnoDesarrollo S.L.,Badajoz;Badajoz,Spain;Spain,4,"Sánchez-Barroso, Gonzalo;González-Domínguez, J...",57216405703;57216406595;57197703890;57197479389,...,9601481.0,27569,18790682.0,Journal,179,,,537-549,The European Dehesa has a very high potential ...,Cleaning frequency | Cleaning photovoltaic sys...,0,1,ESF,PD18047,European Social Fund
1,2-s2.0-85110299293,10.1016/j.energy.2021.121415,S0360544221016637,,Stochastic modelling of variable renewables in...,ar,Article,Seljom P.,60004205;101126845,SINTEF Foundation for Scientific and Industria...,Trondheim;Kjeller,Norway;Norway,5,"Seljom, Pernille;Kvalbein, Lisa;Hellemo, Lars;...",37108388300;57226090492;26767755100;2043623340...,...,3605442.0,29348,,Journal,236,,121415.0,,Variable electricity generation from wind and ...,Energy-system modelling | Satellite-data | Sce...,0,0,,268097,Norges Forskningsråd
2,2-s2.0-85107321575,10.1038/s41467-021-23592-0,,34059672.0,Revealing composition and structure dependent ...,ar,Article,Lian W.,60089931;125236348,Hefei National Laboratory for Physical Science...,Hefei;Hefei,China;China,8,"Lian, Weitao;Jiang, Chenhui;Yin, Yiwei;Tang, R...",57204036372;57202454733;57203962991;5719647276...,...,,19700182758,20411723.0,Journal,12,1.0,3260.0,,Antimony trisulfide (Sb2S3) is a kind of emerg...,,0,1,IUSS,22005293,National Outstanding Youth Science Fund Projec...


In [12]:
# Display every field of the row labelled 0, one field per line.
df_bipv.loc[0]

eid                                                   2-s2.0-85110764627
doi                                         10.1016/j.renene.2021.07.075
pii                                                    S0960148121010806
pubmed_id                                                           None
title                  Markov chains estimation of the optimal period...
subtype                                                               ar
subtypeDescription                                               Article
creator                                               Sánchez-Barroso G.
afid                                                  60017838;126596986
affilname                 Universidad de Extremadura;InnoDesarrollo S.L.
affiliation_city                                         Badajoz;Badajoz
affiliation_country                                          Spain;Spain
author_count                                                           4
author_names           Sánchez-Barroso, Gonzalo;Gon

In [10]:
# Load the pickled df_ab table (the frame the export cells read from)
# and show its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_ab = pd.read_pickle("./210803/df_ab.pkl")
df_ab.shape

(14385, 29)

In [11]:
# publication type
# "J" = Journal
# "B" = Book
# "S" = Series
# "P" = Patent
#
# One row per two-letter subtype code (e.g. 'ar'):
#   (code, publication-type letter, document-type label)
# Both lookup dictionaries below are derived from this single table so the
# codes, letters and labels cannot drift apart.
_subtype_table = (
    ('ar', 'J', 'Article'),
    ('ab', 'J', 'Abstract Report'),
    ('bk', 'B', 'Book'),
    ('ch', 'B', 'Book Chapter'),
    ('bz', 'J', 'Business Article'),
    ('cp', 'J', 'Conference Paper'),
    ('cr', 'J', 'Conference Review'),
    ('dp', 'J', 'Data Paper '),   # label kept exactly as is, trailing space included
    ('ed', 'S', 'Editorial'),
    ('er', 'J', 'Erratum'),
    ('le', 'J', 'Letter'),
    ('no', 'S', 'Note'),
    ('pr', 'S', 'Press Release'),
    ('rp', 'J', 'Report'),
    ('tb', 'J', 'Retracted'),
    ('re', 'J', 'Review'),
    ('sh', 'J', 'Short Survey'),
    ('ip', 'P', 'Patent'),
)

# subtype code -> publication-type letter (written after the "PT" tag)
dict_pubtype = {code: letter for code, letter, _label in _subtype_table}

# documentation type
# subtype code -> document-type label (written after the "DT" tag)
dict_docutype = {code: label for code, _letter, label in _subtype_table}

# text cleaning
# HTML entity -> plain character. The entries are applied in this order
# (dicts iterate in insertion order), and "&amp;" must stay LAST:
# replacing it first would turn a double-escaped "&amp;lt;" into "&lt;"
# and then wrongly into "<".
dict_clean_text = {"&nbsp;": " ",
                   "&lt;": "<",
                   "&gt;": ">",
                   "&amp;": "&"
                  }
def get_clean_text(text):
    """Replace the HTML entities listed in ``dict_clean_text`` in ``text``.

    Each entity is unescaped exactly once, so text that was escaped twice
    (e.g. ``"&amp;lt;"``) comes back as the literal entity (``"&lt;"``).

    Parameters
    ----------
    text : str
        The string to clean; it must support ``str.replace``.

    Returns
    -------
    str
        ``text`` with every listed entity replaced by its plain character.
    """
    for k, v in dict_clean_text.items():
        text = text.replace(k, v)
    return text

In [13]:
# all

import os, calendar

# Write every row of df_ab to `filename` as one tagged record per row:
# a two-letter field tag followed by its value, continuation lines indented
# by three spaces, each record closed by "ER" and the file by "EF".
# The encoding is passed explicitly so that non-ASCII text (e.g. accented
# author names) is written the same way on every platform instead of
# depending on the locale's default encoding.
with open(filename, "w", encoding="utf-8") as datafile:
    datafile.write("FN Clarivate Analytics Web of Science\nVR 1.0")

    # iterrows() hands over each row directly, so no row is looked up again
    # by label (a lookup that returns a DataFrame for a repeated label).
    for art, df_row in df_ab.iterrows():

        # 1. PT: publication type
        pubtype_ = dict_pubtype[df_row['PT']]
        datafile.write(f"\nPT {pubtype_}")

        # 2. AU: author names
        # all() is False as soon as one entry is empty/None; in that case
        # the tag is written without any names.
        author_name_ = df_row['AU']
        author_name = "\nAU "
        if all(author_name_):
            author_name += "\n   ".join(author_name_)

        datafile.write(author_name)

        # 3. AF: affiliations (same all() guard as for AU)
        affiliation_ = df_row['AF']
        affiliation = "\nAF "
        if all(affiliation_):
            affiliation += "\n   ".join(affiliation_)
        datafile.write(affiliation)

        # 4. TI: document title (HTML entities cleaned when a title is present)
        title = '\nTI '
        title_ = df_row['TI']
        if title_:
            title += get_clean_text(title_)
        datafile.write(title)

        # 5. SO: publication name (HTML entities cleaned when a name is present)
        so = '\nSO '
        so_ = df_row['SO']
        if so_:
            so += get_clean_text(so_)
        datafile.write(so)

        # 6. LA : Language
        datafile.write("\nLA " + df_row["LA"])

        # 7. DT : Document Type
        docutype_ = dict_docutype[df_row['DT']]
        datafile.write(f"\nDT {docutype_}")

        # 8. DE : Author Keywords
        de_ = df_row['DE']
        datafile.write(f"\nDE {de_}")

        # 9. ID : Keyword Plus
        id_ = df_row['ID']
        datafile.write(f"\nID {id_}")

        # 10. AB: Abstract
        ab_ = df_row['AB']
        datafile.write(f"\nAB {ab_}")

        # 11. C1 : Author Address
        # Each entry c contributes "[<c[1] joined with '; '>] <c[2] without its
        # first ', '-separated piece>"; entries are separated by ".\n   ".
        c1_ = df_row['C1']
        c1 = [f"[{'; '.join(c[1])}] {', '.join(c[2].split(', ')[1:])}" for c in c1_]
        c1 = ".\n   ".join(c1) + "."
        datafile.write("\nC1 " + c1)

        # 12. RP : Reprint Address
        datafile.write("\nRP None")

        # 13. EM : E-mail Address
        datafile.write("\nEM None")

        # 14. CR : Cited References
        # Work on a copy: assigning to cr_[0] on the stored list would
        # silently modify the data held in df_ab.
        cr_ = list(df_row["CR"])
        if len(cr_) > 0:
            # overwrite the first two characters of the first reference with spaces
            cr_[0] = "  " + cr_[0][2:]
            # drop entries of five characters or fewer (also keeps c[3] below safe)
            cr_ = [c for c in cr_ if len(c) > 5]
            # drop an entry only when its 4th character is ',' and it has no "DOI"
            cr_ = [c.lstrip(" ") for c in cr_ if (c[3] != ',') or ("DOI" in c)]
        datafile.write("\nCR "+"\n   ".join(cr_))

        # 15. NR : Cited Reference Count
        datafile.write(f"\nNR {df_row['NR']}")

        # 16. TC : Web of Science Core Collection Times Cited Count
        datafile.write(f"\nTC {df_row['TC']}")

        # 17. Z9 : Total Times Cited Count
        z9_ = df_row['Z9']
        datafile.write(f"\nZ9 {z9_}")

        # 18. U1 : Usage Count (Last 180 Days)
        # 19. U2 : Usage Count (Since 2013)
        # 20. PU : Publisher = ELSEVIER SCI LTD
        # 21. PI : Publisher City = OXFORD
        # 22. PA : Publisher Address = THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFORD OX5 1GB, OXON, ENGLAND
        # 23. SN : International Standard Serial Number (ISSN) = 0959-6526
        sn_ = df_row["SN"]
        datafile.write(f"\nSN {sn_}")

        # 24. EI : Electronic International Standard Serial Number (eISSN) = 1879-1786
        # 25. J9 : 29-Character Source Abbreviation = J CLEAN PROD
        j9_ = df_row["J9"]
        datafile.write(f"\nJ9 {j9_}")

        # 26. JI : ISO Source Abbreviation = J. Clean Prod.
        ji_ = df_row["JI"]
        datafile.write(f"\nJI {ji_}")

        # 27. PD : Publication Date = JUL 1
        pd_ = df_row["PD"]
        datafile.write(f"\nPD {pd_}")

        # 28. PY : Publication Year = 2020
        py_ = df_row["PY"]
        datafile.write(f"\nPY {py_}")

        # 29. VL : Volume = 260
        vl_ = df_row["VL"]
        datafile.write(f"\nVL {vl_}")

        # 30. AR : Article Number = 121059
        ar_ = df_row["AR"]
        datafile.write(f"\nAR {ar_}")

        # 31. DI : Digital Object Identifier = 10.1016/j.jclepro.2020.121059
        doi_ = df_row["DI"]
        datafile.write(f"\nDI {doi_}")

        # 32. PG : Page Count = 14
        # 33. WC : Web of Science Categories = Green & Sustainable Science & Technology; Engineering, Environmental; Environmental Sciences
        # 34. SC : Research Areas = Science & Technology - Other Topics; Engineering; Environmental Sciences & Ecology
        sc_ = df_row["SC"]
        datafile.write(f"\nSC {sc_}")

        # end of record
        datafile.write("\nER\n")

    # end of file
    datafile.write("\nEF\n")

In [15]:
# Display only the rows of df_ab whose is_rooftop column is True.
df_ab.query("is_rooftop == True")

Unnamed: 0,eid,PT,AU,AF,TI,SO,SO_abb,LA,DT,DE,ID,AB,C1,RP,EM,CR,NR,TC,Z9,SN,J9,JI,PD,PY,VL,AR,DI,SC,is_rooftop
0,2-s2.0-85110764627,ar,"[Sanchez-Barroso G., Gonzalez-Dominguez J., Ga...","[Sánchez-Barroso, Gonzalo, González-Domínguez,...",Markov chains estimation of the optimal period...,Renewable Energy,Renew. Energy,English,ar,Cleaning frequency; Cleaning photovoltaic syst...,,© 2021 The AuthorsThe European Dehesa has a ve...,"[[[57216405703, 57216406595, 57197703890], [Sa...",,,"[ , 2012, , 2020, Spain's Electricity Gri...",44,0,0,18790682 09601481,RENEW. ENERGY,Renew. Energy,DEC 1,01,179,,10.1016/j.renene.2021.07.075,"Renewable Energy, Sustainability and the Envir...",True
3,2-s2.0-85107176116,ar,"[Perrakis G., Tasolamprou A.C., Kenanakis G., ...","[Perrakis, George, Tasolamprou, Anna C., Kenan...",Combined nano and micro structuring for enhanc...,Scientific Reports,Sci. Rep.,English,ar,,,"© 2021, The Author(s).Outdoor devices comprisi...","[[[25925551900, 21234204900], [Tasolamprou A.C...",,,"[ Raman, A.P.; Anoma, M.A.; Zhu, L.; Rephael...",45,0,0,20452322,SCI. REP.,Sci. Rep.,DEC 1,01,11,11552,10.1038/s41598-021-91061-1,Multidisciplinary,True
34,2-s2.0-85108691578,ar,"[Christiaanse T.V., Loonen R.C.G.M., Evins R.]","[Christiaanse, T. V., Loonen, R. C.G.M., Evins...",Techno-economic optimization for grid-friendly...,Sustainable Energy Technologies and Assessments,Sustainable Energy Technol. Assess.,English,ar,Bi-level optimization; British Columbia; Energ...,,© 2021 Elsevier LtdHigh levels of non-dispatch...,"[[[55504415000], [Loonen R.C.G.M.], 60032882, ...",,,"[ Wiginton, L.K.; Nguyen, H.T.; Pearce, J.M....",36,0,0,22131388,SUSTAINABLE ENERGY TECHNOL. ASSESS.,Sustainable Energy Technol. Assess.,OCT 1,01,47,101320,10.1016/j.seta.2021.101320,"Renewable Energy, Sustainability and the Envir...",True
83,2-s2.0-85107622660,ar,"[Hong Y., Ezeh C.I., Deng W., Hong S.-H., Ma Y...","[Hong, Y., Ezeh, Collins I., Deng, W., Hong, S...",Coordinated energy-environmental-economic opti...,Energy Conversion and Management,Energy Convers. Manage.,English,ar,Life-cycle cost analysis; Low-rise; Office bui...,,© 2021 Elsevier LtdGiven that energy-efficienc...,"[[[57191907391, 56647024700, 57222633102], [De...",,,"[ Nägeli, C., 2019, Build Environ, Zheng,...",40,0,0,01968904,ENERGY CONVERS. MANAGE.,Energy Convers. Manage.,SEP 1,01,243,114327,10.1016/j.enconman.2021.114327,"Renewable Energy, Sustainability and the Envir...",True
89,2-s2.0-85105000579,ar,"[Rahmani F., Robinson M.A., Barzegaran M.R.]","[Rahmani, Fatemeh, Robinson, Mark Alan, Barzeg...",Cool roof coating impact on roof-mounted photo...,International Journal of Electrical Power and ...,Int J Electr Power Energy Syst,English,ar,Cool roof coating; DC-coupled solar photovolta...,,© 2021 Elsevier LtdCool-roofing is effective i...,"[[[57198277655, 26535502400], [Rahmani F., Bar...",,,"[ Akbari, H.; Levinson, R., 2008, Adv Build ...",27,0,0,01420615,INT J ELECTR POWER ENERGY SYST,Int J Electr Power Energy Syst,SEP 1,01,130,106932,10.1016/j.ijepes.2021.106932,Energy Engineering and Power Technology; Elect...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14192,2-s2.0-0023545718,ar,[Muneer T.],"[Muneer, T.]",Hourly diffuse and global solar irradiation: F...,Building Services Engineering Research & Techn...,Build Serv Eng Res Technol,English,ar,,,Following the correlation equations between ho...,"[[[7006647735], [Muneer T.], 60011885, Robert ...",,,"[ Muneer, T.; Saluja, G.S., 1986, Building S...",8,9,9,14770849 01436244,BUILD SERV ENG RES TECHNOL,Build Serv Eng Res Technol,JAN 1,01,8,,10.1177/014362448700800403,Building and Construction,True
14197,2-s2.0-0023382414,ar,"[Lindquist S.-E., Lindgren A., Leygraf C.]","[Lindquist, Sten Eric, Lindgren, Anders, Leygr...",Effects of electrochemical reduction of polycr...,Solar Energy Materials,,English,ar,,,Polycrystalline TiO2 film electrodes prepared ...,"[[[7004947028], [Leygraf C.], 60079613, Swerea...",,,"[ , 1979, Journal of The Electrochemical Soc...",24,21,21,01651633,NONE,,JAN 1,01,15,,10.1016/0165-1633(87)90057-8,Engineering (all),True
14226,2-s2.0-0022329766,cp,[Ingersoll John G.],"[Ingersoll, John G.]",SIMPLIFIED PREDICTION OF ANNUAL ENERGY GENERAT...,Conference Record of the IEEE Photovoltaic Spe...,Conference Record of the IEEE Photovoltaic Spe...,English,cp,,,A simplified but accurate model has been devel...,"[[[57197097430], [Ingersoll John G.], 60005304...",,,[],12,4,4,01608371,CONFERENCE RECORD OF THE IEEE PHOTOVOLTAIC SPE...,Conference Record of the IEEE Photovoltaic Spe...,DEC 1,01,,,,Control and Systems Engineering; Industrial an...,True
14288,2-s2.0-0021774758,cp,"[Girgis Magdy A., Chandra Subrato, Khattar Muk...","[Girgis, Magdy A., Chandra, Subrato, Khattar, ...",THERMAL PERFORMANCE OF THE FSEC PV HOUSE UNDER...,,,English,cp,,,An energy efficient photovoltaic (PV) house ha...,"[[[7006352007, 7401671386, 7004438367], [Girgi...",,,[],4,0,0,,NONE,,JAN 1,01,,,,Engineering (all),True


In [14]:
# rooftop

import os, calendar

# Write the rows of df_ab flagged is_rooftop to `filename_rooftop` as one
# tagged record per row: a two-letter field tag followed by its value,
# continuation lines indented by three spaces, each record closed by "ER"
# and the file by "EF".
# The encoding is passed explicitly so that non-ASCII text (e.g. accented
# author names) is written the same way on every platform instead of
# depending on the locale's default encoding.
with open(filename_rooftop, "w", encoding="utf-8") as datafile:
    datafile.write("FN Clarivate Analytics Web of Science\nVR 1.0")

    df_ab_rooftop = df_ab.query("is_rooftop == True")
    # Rows are taken from the filtered frame itself; iterrows() hands over
    # each row directly, so no row is looked up again by label (a lookup
    # that returns a DataFrame for a repeated label).
    for art, df_row in df_ab_rooftop.iterrows():

        # 1. PT: publication type
        pubtype_ = dict_pubtype[df_row['PT']]
        datafile.write(f"\nPT {pubtype_}")

        # 2. AU: author names
        # all() is False as soon as one entry is empty/None; in that case
        # the tag is written without any names.
        author_name_ = df_row['AU']
        author_name = "\nAU "
        if all(author_name_):
            author_name += "\n   ".join(author_name_)

        datafile.write(author_name)

        # 3. AF: affiliations (same all() guard as for AU)
        affiliation_ = df_row['AF']
        affiliation = "\nAF "
        if all(affiliation_):
            affiliation += "\n   ".join(affiliation_)
        datafile.write(affiliation)

        # 4. TI: document title (HTML entities cleaned when a title is present)
        title = '\nTI '
        title_ = df_row['TI']
        if title_:
            title += get_clean_text(title_)
        datafile.write(title)

        # 5. SO: publication name (HTML entities cleaned when a name is present)
        so = '\nSO '
        so_ = df_row['SO']
        if so_:
            so += get_clean_text(so_)
        datafile.write(so)

        # 6. LA : Language
        datafile.write("\nLA " + df_row["LA"])

        # 7. DT : Document Type
        docutype_ = dict_docutype[df_row['DT']]
        datafile.write(f"\nDT {docutype_}")

        # 8. DE : Author Keywords
        de_ = df_row['DE']
        datafile.write(f"\nDE {de_}")

        # 9. ID : Keyword Plus
        id_ = df_row['ID']
        datafile.write(f"\nID {id_}")

        # 10. AB: Abstract
        ab_ = df_row['AB']
        datafile.write(f"\nAB {ab_}")

        # 11. C1 : Author Address
        # Each entry c contributes "[<c[1] joined with '; '>] <c[2] without its
        # first ', '-separated piece>"; entries are separated by ".\n   ".
        c1_ = df_row['C1']
        c1 = [f"[{'; '.join(c[1])}] {', '.join(c[2].split(', ')[1:])}" for c in c1_]
        c1 = ".\n   ".join(c1) + "."
        datafile.write("\nC1 " + c1)

        # 12. RP : Reprint Address
        datafile.write("\nRP None")

        # 13. EM : E-mail Address
        datafile.write("\nEM None")

        # 14. CR : Cited References
        # Work on a copy: assigning to cr_[0] on the stored list would
        # silently modify the data held in df_ab.
        cr_ = list(df_row["CR"])
        if len(cr_) > 0:
            # overwrite the first two characters of the first reference with spaces
            cr_[0] = "  " + cr_[0][2:]
            # drop entries of five characters or fewer (also keeps c[3] below safe)
            cr_ = [c for c in cr_ if len(c) > 5]
            # drop an entry only when its 4th character is ',' and it has no "DOI"
            cr_ = [c.lstrip(" ") for c in cr_ if (c[3] != ',') or ("DOI" in c)]
        datafile.write("\nCR "+"\n   ".join(cr_))

        # 15. NR : Cited Reference Count
        datafile.write(f"\nNR {df_row['NR']}")

        # 16. TC : Web of Science Core Collection Times Cited Count
        datafile.write(f"\nTC {df_row['TC']}")

        # 17. Z9 : Total Times Cited Count
        z9_ = df_row['Z9']
        datafile.write(f"\nZ9 {z9_}")

        # 18. U1 : Usage Count (Last 180 Days)
        # 19. U2 : Usage Count (Since 2013)
        # 20. PU : Publisher = ELSEVIER SCI LTD
        # 21. PI : Publisher City = OXFORD
        # 22. PA : Publisher Address = THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFORD OX5 1GB, OXON, ENGLAND
        # 23. SN : International Standard Serial Number (ISSN) = 0959-6526
        sn_ = df_row["SN"]
        datafile.write(f"\nSN {sn_}")

        # 24. EI : Electronic International Standard Serial Number (eISSN) = 1879-1786
        # 25. J9 : 29-Character Source Abbreviation = J CLEAN PROD
        j9_ = df_row["J9"]
        datafile.write(f"\nJ9 {j9_}")

        # 26. JI : ISO Source Abbreviation = J. Clean Prod.
        ji_ = df_row["JI"]
        datafile.write(f"\nJI {ji_}")

        # 27. PD : Publication Date = JUL 1
        pd_ = df_row["PD"]
        datafile.write(f"\nPD {pd_}")

        # 28. PY : Publication Year = 2020
        py_ = df_row["PY"]
        datafile.write(f"\nPY {py_}")

        # 29. VL : Volume = 260
        vl_ = df_row["VL"]
        datafile.write(f"\nVL {vl_}")

        # 30. AR : Article Number = 121059
        ar_ = df_row["AR"]
        datafile.write(f"\nAR {ar_}")

        # 31. DI : Digital Object Identifier = 10.1016/j.jclepro.2020.121059
        doi_ = df_row["DI"]
        datafile.write(f"\nDI {doi_}")

        # 32. PG : Page Count = 14
        # 33. WC : Web of Science Categories = Green & Sustainable Science & Technology; Engineering, Environmental; Environmental Sciences
        # 34. SC : Research Areas = Science & Technology - Other Topics; Engineering; Environmental Sciences & Ecology
        sc_ = df_row["SC"]
        datafile.write(f"\nSC {sc_}")

        # end of record
        datafile.write("\nER\n")

    # end of file
    datafile.write("\nEF\n")