# 1. Keyword cleansing from Web of Science search result
* search result = `keywords` + `abstract`

## 1. Preparation
### 1.1. Import Libraries

In [16]:
from copy import deepcopy
import numpy as np
import pandas as pd
import json, re, nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

import itertools

from nltk.stem.wordnet import WordNetLemmatizer
Lem = WordNetLemmatizer()

In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("talk")

### 1.2. filename settings

In [18]:
scopus = pd.read_csv("df_scopus.csv")
keyword_org_name = "keywords.txt"
keyword_abb_name = "keywords_abb.json"
keyword_dash_name = "keywords_dash.json"
keyword_dict_name = "keywords_dict.json"
keyword_single_name = "keywords_single.txt"
keyword_plural_name = "keywords_plural.json"
unicode_name = "unicode_json"

## 2. Data Crawling

### 2.1. Search in Scopus
* receive `scopus.csv` via email

### 2.2. Data Retrieval from scopus

In [23]:
from pybliometrics.scopus import AbstractRetrieval
scopus = pd.read_csv("df_scopus.csv")

# e1fc56d1d9fcc86e6998a7fd79faed23
# dcbed1c3abb36a8e80addbf185f38671
# 78d8e510bda00c5530f560f02bc051bc

In [315]:
import os, calendar

mode = "a" if os.path.exists(filename) else "w"

if mode == "w":
    with open(filename, mode) as datafile:
        datafile.write("FN Clarivate Analytics Web of Science\nVR 1.0\n")

In [24]:
filename = "./BIPV_ML_all.txt"
# artno = 0

### document type
# "J" = Journal
# "B" = Book
# "S" = Series
# "P" = Patent
dic_docu_type = {'Article': "J", 
                 'Review': "J", 
                 'Conference Paper': "J",
                 'Conference Review': "J",
                 'Book Chapter': "B",
                 'Erratum': "J", 
                 'Letter': "J",
                 '[No source information available]': "J", 
                 'Note': "J", 
                 'Short Survey': "J",
                 'Book': "B", 
                 'Retracted': "B", 
                 'Editorial': "S"}

### language
dic_language = {'eng': "English",
                'kor': "Korean"
               }


In [25]:
# this cell dies when the number of retieved paper > 10000/week

for artno in range(scopus.shape[0]):
    ab = AbstractRetrieval(scopus["EID"].iloc[artno], view="FULL")
    
    if scopus["Authors"].iloc[artno] != '[No author name available]':   
        mode = "a" if os.path.exists(filename) else "w"
        with open(filename, mode) as datafile:
            # 1. PT: publication type
            docu_type_ = scopus["Document Type"].iloc[artno]
            try:
                docu_type_0 = dic_docu_type[docu_type_]
            except:
                docu_type_0 = "unknown"
            datafile.write(f"PT {docu_type_0}\n")

            # 2. AU: author names
            indexed_name_ = pd.DataFrame(ab.authors)['indexed_name'].tolist()    
            datafile.write(f"AU {indexed_name_[0]}\n")
            for i in range(1, len(indexed_name_)):
                datafile.write(f"   {indexed_name_[i]}\n")

            # 3. AF: author names, full
            full_name_ = pd.DataFrame(ab.authors)[["surname", "given_name"]].apply(lambda s: f"{s[0]}, {s[1]}", axis=1).tolist()
            datafile.write(f"AF {full_name_[0]}\n")
            for i in range(1, len(full_name_)):
                datafile.write(f"   {full_name_[i]}\n")

            # 4. TI: document title
            docu_title_ = scopus["Title"].iloc[artno]
            datafile.write(f"TI {docu_title_}\n")

            # 5. SO: publication name
            src_title_ = scopus["Source title"].iloc[artno]
            datafile.write(f"SO {src_title_}\n")

            # 6. LA : Language
            try:
                language_ = dic_language[ab.language]
            except:
                language_ = "unknown"
            datafile.write(f"LA {language_}\n")

            # 7. DT : Document Type
            docu_type_ = scopus["Document Type"].iloc[artno]
            datafile.write(f"DT {docu_type_}\n")

            # 8. DE : Author Keywords
            auth_kw_ = ab.authkeywords
            if auth_kw_ == None:
                datafile.write("DE None\n")
            else:
                datafile.write(f"DE {'; '.join(auth_kw_)}\n")

            # 9. ID : Keyword Plus
            datafile.write("ID None\n")

            # 10. AB : Abstract
            datafile.write(f"AB {ab.abstract}\n")

            # 11. C1 : Author Address
            tmp = pd.DataFrame(ab.authorgroup)
            grouped = tmp.groupby('organization')
            aff_ids = tmp["organization"].unique()

            if len(aff_ids) > 0:
                for i, aff_id in enumerate(aff_ids):
                    if aff_id != None:
                        group = grouped.get_group(aff_id)
                        names = group[["surname", "given_name"]].apply(lambda s: f"{s[0]}, {s[1]}", axis=1).tolist()
                        aff = group[["organization", "city", "postalcode", "addresspart", "country"]].apply(lambda s: f"{s[0]}, {s[1]}, {s[2]}, {s[3]}, {s[4]}", axis=1).iloc[0]
                        if i == 0:
                            datafile.write('C1 [' + '; '.join(names) + f'] {aff}\n')
                        else:
                            datafile.write('   [' + '; '.join(names) + f'] {aff}\n')

            # 12. RP : Reprint Address
            datafile.write(f"RP None\n")

            # 13. EM : E-mail Address
            datafile.write(f"EM None\n")

            # 14. CR : Cited References
            if ab.references != None:
                tmp = pd.DataFrame(ab.references)
                refcount = int(ab.refcount)

                for i in range(refcount):
                    tmp_ = tmp.iloc[i]
                    tmp_authors = tmp_['authors']
                    if tmp_authors == None:
                        tmp_authors = "[Anonymous]"
                    tmp_year = tmp_['publicationyear']
                    tmp_src = tmp_['sourcetitle']
                    tmp_vol = tmp_['volume']
                    tmp_page = tmp_['first']
                    tmp_doi = tmp_['doi']

                    ref = tmp_authors
                    for item in [tmp_year, tmp_src, tmp_vol, tmp_page]:
                        if item != None:
                            ref = ', '.join([ref, item])
                    if tmp_doi != None:
                        ref = ref + f", DOI {tmp_doi}"

                    if i == 0:
                        datafile.write(f"CR {ref}\n")
                    else:
                        datafile.write(f"   {ref}\n")

            # 15. NR : Cited Reference Count
            datafile.write(f"NR {refcount}\n")

            # 16. TC : Web of Science Core Collection Times Cited Count
            citecount = ab.citedby_count
            datafile.write(f"TC {citecount}\n")

            # 17. Z9 : Total Times Cited Count
            datafile.write(f"Z9 {citecount}\n")

            # 18. U1 : Usage Count (Last 180 Days)
            # 19. U2 : Usage Count (Since 2013)
            # 20. PU : Publisher = ELSEVIER SCI LTD
            # 21. PI : Publisher City = OXFORD
            # 22. PA : Publisher Address = THE BOULEVARD, LANGFORD LANE, KIDLINGTON, OXFORD OX5 1GB, OXON, ENGLAND
            # 23. SN : International Standard Serial Number (ISSN) = 0959-6526
            if ab.issn != None:
                datafile.write(f"SN {ab.issn}\n")

            # 24. EI : Electronic International Standard Serial Number (eISSN) = 1879-1786
            # 25. J9 : 29-Character Source Abbreviation = J CLEAN PROD
            if ab.sourcetitle_abbreviation != None:
                datafile.write(f"J9 {ab.sourcetitle_abbreviation.upper()}\n")

            # 26. JI : ISO Source Abbreviation = J. Clean Prod.
                datafile.write(f"JI {ab.sourcetitle_abbreviation}\n")

            # 27. PD : Publication Date = JUL 1
            if ab.coverDate != None:
                month = ab.coverDate.split('-')[1]
                date = ab.coverDate.split('-')[2]
                datafile.write(f"PD {calendar.month_name[int(month)][:3].upper()} {int(date)}\n")

            # 28. PY : Publication Year = 2020
            if scopus['Year'].iloc[artno] != None:
                datafile.write(f"PY {scopus['Year'].iloc[artno]}\n")

            # 29. VL : Volumn = 260
            if scopus['Volume'].iloc[artno] != None:
                datafile.write(f"VL {scopus['Volume'].iloc[artno]}\n")

            # 30. AR : Article Number = 121059
            if scopus['Art. No.'].iloc[artno] != None:
                datafile.write(f"AR {scopus['Art. No.'].iloc[artno]}\n")

            # 31. DI : Digital Object Identifier = 10.1016/j.jclepro.2020.121059
            if scopus['DOI'].iloc[artno] != None:
                datafile.write(f"DI {scopus['DOI'].iloc[artno]}\n")

            # 32. PG : Page Count = 14
            # 33. WC : Web of Science Categories = Green & Sustainable Science & Technology; Engineering, Environmental; Environmental Sciences
            # 34. SC : Research Areas = Science & Technology - Other Topics; Engineering; Environmental Sciences & Ecology
            if ab.subject_areas != None:
                tmp = pd.DataFrame(ab.subject_areas)
                tmp_ = tmp["area"].tolist()
                datafile.write("SC " + "; ".join(tmp_) + "\n")
            # 35. GA : Document Delivery Number = LL4XH
            # 36. UT : Accession Number = WOS:000531559900003
            # 37. DA : Date this report was generated. = 2020-06-14
            datafile.write(f"ER\n\n")

with open(filename, mode) as datafile:
    datafile.write(f"EF")

Scopus401Error: The requestor is not authorized to access the requested view or fields of the resource

### 2.3. Extract `keywords`
* crawling WOS file and save as "keywords.txt"

In [25]:
scopus = pd.read_csv("scopus.csv")

keywords = []
line = ''
count = 0

outfile = open("keywords.txt", "w")
outfile.close()

with open("BIPV_ML_all.txt") as f:
    while line != "EF":
        line = f.readline()
        if line[:3] == "DE ":
            count += 1
            if count%1000 == 0:
                print(f"count= {count}")
            keywords_new = line[3:].rstrip('\n').split("; ")
            
            with open("keywords.txt", "a") as outfile:
                outfile.write(f'# {count}\n')
                for keyword_new in keywords_new:
                    outfile.write(f'{keyword_new}\n')
            
            keywords += keyword_new

count= 1000
count= 2000
count= 3000
count= 4000
count= 5000
count= 6000
count= 7000
count= 8000


## 3. Keyword Cleaninsing

* store keywords in `keywords`
* save `keywords` as `"keywords.txt"`

In [5]:
keywords = []
line = ''

with open("BIPV_ML_all.txt") as f:
    while line != "EF":
        line = f.readline()
        if line[:3] == "DE ":
            keywords_new = line[3:].rstrip('\n').split(";")
            for keyword_new in keywords_new:
                keywords += [keyword_new.rstrip(' ').lstrip(' ')]

keywords = sorted(keywords)
with open(keyword_org_name, "w", encoding="utf-8-sig") as f:
    for keyword in keywords:
        f.write(f"{keyword}\n")

* Precedure


1. get unique keywords
2. detect and extract abbreviations as word in pharenthesis, starting from upper character: store in `keywords_abb_key_cand`.
3. delete "dirties" on keywords : "-" and ":" on each side of them, and store in `keywords_value`
4. create abbreviation dictionary: `keywords_abb`
5. remove mathematics and unicodes from `keywords_value`
6. remove needless blanks from `keywords_value`
7. create single-word keywords, remove plural forms and save as `keywords_single.txt`
8. save as files
  - `keywords_abb`: "keywords_abb.json"
  - `keywords_single`: "keywords_single.txt"
  - `keywords_dict`: "keywords_dict.json"

In [4]:
# unicode dictionary
with open('unicode_dict.json') as j:
    unicode_dict = json.load(j)

In [6]:
# retreive keywords
keywords_raw_ = np.genfromtxt("keywords.txt", dtype="str", delimiter="\n", encoding='utf-8-sig') #utf-8-sig for \ufeff removal

# get unique ones
keywords_raw = np.unique(keywords_raw_)

# abbriviations detection (key candidates)
keywords_abb = {}
keywords_abb_key_cand = []
for keyword_raw_ in keywords_raw_:
    kwr = keyword_raw_.split()
    for kw in kwr:
        kw = kw.lstrip("(").rstrip(")")
        if (kw == kw.upper()) \
           and (kw.lstrip("(").rstrip(")") not in keywords_abb_key_cand) \
           and (ord(kw[0]) >= ord('A') and ord(kw[0]) <= ord('Z')) \
           and (len(kw) > 1):
            keywords_abb_key_cand.append(kw.lstrip("(").rstrip(")").rstrip(":").rstrip(","))

# convert to lower cases
# keywords_raw = [kw.lower() for kw in keywords_raw_]

# duplicate for values, and convert to lower cases
keywords_value = deepcopy(keywords_raw)

# convert to lower cases
keywords_value = [kw.lower() for kw in keywords_value]

# remove staring characters
keywords_value = [kw.lstrip('-') for kw in keywords_value]
keywords_value = [kw.rstrip('-') for kw in keywords_value]
keywords_value = [kw.lstrip(':') for kw in keywords_value]
keywords_value = [kw.rstrip(':') for kw in keywords_value]

# remove brakets "()"
for kc in keywords_abb_key_cand:
    kc_ = kc.lower()
    for kw in keywords_value:
        if kc_ in kw.replace('(', "").replace(')', "").split():
            kc_value = re.sub(r'\([^)]*\)', "", kw).rstrip(' ').lstrip(' ')
            if (kc_ not in kc_value) and \
               (kc_value[-1] != ")") and \
               (len(kc_value.split()) > 1) and \
               ((kc not in list(keywords_abb.keys())) or (kc in list(keywords_abb.keys()) and len(kc_value) < len(keywords_abb[kc]) and len(kc_value.split()) > 1)):
                kc_value = kc_value.replace("  ", " ")
                kc_value = ' '.join(kc_value.split(' ')[:-1] + [Lem.lemmatize(kc_value.split(' ')[-1])])
                keywords_abb.update({kc:kc_value})

keywords_value = [re.sub(r'\([^)]*\)', "", kw).rstrip(' ').lstrip(' ').lstrip(')').lstrip('(') for kw in keywords_value]

# remove unicodes –
unicode_keys = list(unicode_dict.keys())
for ukey in unicode_keys:
        keywords_value = [kw.replace(ukey, unicode_dict[ukey]) for kw in keywords_value]

# remove mathematics
keywords_value = [kw.replace("\\infty", "infinity") for kw in keywords_value]
keywords_value = [kw.replace("\\mathrm{", "") for kw in keywords_value]
keywords_value = [kw.replace("{", "") for kw in keywords_value]
keywords_value = [kw.replace("}", "") for kw in keywords_value]

# remove '"'
keywords_value = [kw.replace('"', "") for kw in keywords_value]

# reduce needless blanks.
keywords_value = [re.sub("\s+", " ", kw) for kw in keywords_value]
keywords_value = [kw.replace(" -", "-") for kw in keywords_value]
keywords_value = [kw.replace("- ", "-") for kw in keywords_value]
keywords_value = [kw.replace(" //", "//") for kw in keywords_value]
keywords_value = [kw.replace("// ", "//") for kw in keywords_value]

# create single-word keyword
keywords_single = [k for k in list(keywords_abb.keys())]
for kws in keywords_value:
    if len(kws.split(" ")) == 1 and len(kw) <= 4:
        keywords_single.append(kws)
    for kw in kws:
        if len(kw.split("-")) == 1 and len(kw) <= 4 and len(kw) > 1:
            keywords_single.append(kw)

keywords_single = list(np.unique(sorted(keywords_single)))

keywords_single_ = deepcopy(keywords_single)       
for kw in keywords_single_:
    if kw+"s" in keywords_single and len(kw) >= 4:
        print(kw+"s")
        keywords_single.remove(kw+"s")
    if kw[-1]+"ies" in keywords_single and kw[-1] == "y" and len(kw >=4):
        print(kw+"s")
        keywords_single.remove(kw+"s")

# create abbreviation dictionary
abb_sorted = dict(sorted(keywords_abb.items()))
with open(keyword_abb_name, "w") as j:
    json.dump(abb_sorted, j, ensure_ascii=False, indent=2)
    
# create "single" keywords list
with open(keyword_single_name, "w", encoding="utf-8-sig") as f:
    for keyword in keywords_single:
        f.write(f"{keyword}\n")

# create "value"
keywords_dict = dict(zip(keywords_raw, keywords_value))
with open(keyword_dict_name, "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)        

aerosols
aggregators
agrivoltaics
algorithms
barriers
bifurcations
buildings
characteristics
choppers
clouds
consumers
controllers
converters
correlations
costs
delays
dielectrics
diodes
dislocations
dynamics
economics
eigenvalues
electrolyzers
emissions
ensembles
experiments
failures
faults
feed-in-tariffs
forecasts
greenhouses
harmonics
heterojunctions
households
hurricanes
ibscs
igbts
imports
inverters
irradiances
measurements
metaheuristics
micro-grids
microgrids
microinverters
microturbines
mini-grids
modes
models
modules
multi-junctions
nano-grids
nanofluids
nanowires
optoelectronics
performances
perovskites
photodetectors
photodiodes
photovoltaics
prosumers
relays
renewables
resonances
scenarios
semiconductors
sensors
simulations
simulators
supercapacitors
systems
techno-economics
technoeconomics
thermodynamics
thermophotovoltaics
transients
trends
voltages
waves
wavelets


### 3.1. [manual] refine `keywords_single`
* remove plural words not to be kept

### 3.2. create single-plural words dictionary
* convert plural words in `keywords_dict`, overwrite as `keywords_dict.json`
* create plural-singular dictionary: `keywords_single.txt`

In [7]:
# retrieve keywords_value
with open("keywords_dict.json", "r") as j:
    keywords_dict = json.load(j)
keywords_value = list(keywords_dict.values())
keywords_key = list(keywords_dict.keys())

# retreive keywords_single
keywords_single = np.genfromtxt("keywords_single_refine.txt", dtype="str", delimiter="\n", encoding='utf-8-sig') #utf-8-sig for \ufeff removal
keywords_single = [k.lower() for k in keywords_single]

# Convert plural to singular
Lem = WordNetLemmatizer()
keywords_plural = {}
keywords_value_ = []
for kws in keywords_value:
    if (kws not in keywords_single) and (len(kws.split(" ")) > 1):
        kws_0 = kws.split(" ")
        
        for kws_1 in kws_0:
            kws_2 = kws_1.split("-")
            for kw in kws_2:
                if (kw not in keywords_single) and (kw != Lem.lemmatize(kw)):
                    kws = kws.replace(kw, Lem.lemmatize(kw))
                    keywords_plural.update({kw: Lem.lemmatize(kw)})
        keywords_value_.append(kws)
    else:
        keywords_value_.append(kws)

keywords_value = deepcopy(keywords_value_)
keywords_value = [kw.lstrip(' ').rstrip(' ') for kw in keywords_value]

    
# create plural-singular dictionary
plural_sorted = dict(sorted(keywords_plural.items()))
with open(keyword_plural_name, "w") as j:
    json.dump(plural_sorted, j, ensure_ascii=False, indent=2)
    
# create "value"
keywords_dict = dict(zip(keywords_key, keywords_value))
with open(keyword_dict_name, "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)        

### 3.3. create dash-words dictionary
* extract words with dash('-') from keywords_dict, save as `keywords_dash.json`

In [8]:
# retrieve keywords_value
with open("keywords_dict.json", "r") as j:
    keywords_dict = json.load(j)
keywords_value = list(keywords_dict.values())

# create dash dictionary
keywords_dash = []
for kw in keywords_value:
    kw_ = kw.split(" ")
    for kw__ in kw_:
        if ('-' in kw__) or ('//' in kw__):
            keywords_dash.append(kw__)
    
keywords_dash = np.unique(keywords_dash)

# create "dash" keywords dictionary - manual refinement required
keywords_dash_dict = dict(zip(keywords_dash, keywords_dash))
with open(keyword_dash_name, "w") as j:
    json.dump(keywords_dash_dict, j, ensure_ascii=False, indent=2)



### 3.4. [manual] keywords-abb refinement
* incorrect names, typo errors
* unifying same words

### 3.5. [manual] keywords-dash refinement
* incorrect names, typo errors
* unifying same words (ex. photo-voltaic and photovoltaic)

In [9]:
with open("keywords_plural.json", "r") as j:
    keywords_plural = json.load(j)

with open("keywords_abb_refine.json", "r") as j:
    keywords_abb_refine = json.load(j)

with open("keywords_dash_refine.json", "r") as j:
    keywords_dash_refine = json.load(j)
    

In [10]:
abb_refine_keys = list(keywords_abb_refine.keys())
dash_refine_keys = list(keywords_dash_refine.keys())

In [11]:
keywords_dash_refine2 = {}

plural_keys = list(keywords_plural.keys())

for dkey in dash_refine_keys:
    dvalues = keywords_dash_refine[dkey].split("; ")
    
    dvalues_abb = []
    dvalues_rms = []
    for dvalue in dvalues:
        dvalue_rem = deepcopy(dvalue)
        for akey in abb_refine_keys:
            if dvalue == akey.lower():
                dvalues.remove(dvalue)
                dvalues.append(keywords_abb_refine[akey])
            else:
                [dvalues_abb.append(keywords_abb_refine[d.upper()]) for d in dvalue.split("-") if d == akey.lower()]
                [dvalues_abb.append(keywords_abb_refine[d.upper()]) for d in dvalue.split("/") if d == akey.lower()]

    keywords_dash_refine2.update({dkey:dvalues+dvalues_abb})

with open("keywords_dash_refine2.json", "w") as j:
    json.dump(keywords_dash_refine2, j, ensure_ascii=False, indent=2)
                
            

### 3.6. [manual] keywords_dash2 refinement
* incorrect abbs treatement

### 3.7. apply keywords-dash on keywords_value
* (1) replace keywords_dash_refine.keys() to keywords_dash_refine.values()
* (2) if keywords_dash_refine2.key() has more than 2 elements, add them.

In [12]:
with open("keywords_dash_refine2.json", "r") as j:
    keywords_dash_refine2 = json.load(j)
    

keywords_value_ = []
for kws in keywords_value:
    dash_flag = 0
    kws_ = []
    for dkey in dash_refine_keys:
        
        if dkey in kws:
            kws_ += [kws.replace(dkey, keywords_dash_refine[dkey])]
            dash_flag += 1
            
            if len(keywords_dash_refine2[dkey]) > 1:
                [kws_.append(v) for v in keywords_dash_refine2[dkey][1:]]
                
    if dash_flag == 0:
        kws_ = [kws]
    
    keywords_value_.append(list(np.unique(kws_)))

keywords_dict = dict(zip(list(keywords_dict.keys()), keywords_value_))
with open(keyword_dict_name, "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)      

### 3.8. find and extract dash and abbriviations
* unify in-fact same words, and create `keywords_dict_add.json`

In [13]:
%%time

import itertools

abb_refine_keys = [k.lower() for k in np.unique(list(keywords_abb_refine.keys()))]
abb_refine_values = [v.lower() for v in np.unique(list(keywords_abb_refine.values()))]

with open("keywords_dict.json", "r") as j:
    keywords_dict = json.load(j)    
keywords_value = list(keywords_dict.values())

keywords_value_add = []
for kw in keywords_value:
    kw_0 = kw
    kw_1 = list(itertools.chain.from_iterable([kw_.split() for kw_ in kw]))
    kw_2 = list(itertools.chain.from_iterable([kw_.split("-") for kw_ in kw]))
    kw_3 = list(itertools.chain.from_iterable([kw_.split("/") for kw_ in kw]))
    
    keywords_value_add_ = []
    
    # "dash"
    for dash_keys in dash_refine_keys:
        if dash_keys in kw_1:
            [keywords_value_add_.append(kw) for kw in keywords_dash_refine2[dash_keys]]
        
        dash_keys_ = dash_keys.replace("-", "")
        if dash_keys_ in kw_1:
            if not (dash_keys_ in abb_refine_keys): # original abbriviation should be kept
                [keywords_value_add_.append(kw) for kw in keywords_dash_refine2[dash_keys]]

        dash_keys_ = dash_keys.split("-")
        if not (dash_keys_ in abb_refine_keys): # original abbriviation should be kept
            try:
                tmp = [kw_1.index(dash_key_) for dash_key_ in dash_keys_]
                if tmp == sorted(tmp) and (len(np.unique(tmp)) == len(tmp)):
                    [keywords_value_add_.append(kw) for kw in keywords_dash_refine2[dash_keys]]
            except ValueError:
                try:
                    tmp = [kw_2.index(dash_key_) for dash_key_ in dash_keys_]
                    if tmp == sorted(tmp) and (len(np.unique(tmp)) == len(tmp)):
                        [keywords_value_add_.append(kw) for kw in keywords_dash_refine2[dash_keys]]
                except ValueError:
                    pass
    
    # "abb"
    for abb_vals in abb_refine_values:
        if abb_vals in kw and \
           (abb_vals.replace(" ", "-") not in keywords_value_add) and \
           (abb_vals.replace("-", " ") not in keywords_value_add):
            keywords_value_add_.append(abb_vals)
    
    for abb_keys in abb_refine_keys:
        if abb_keys in (kw_0 + kw_1 + kw_2 + kw_3):
            keywords_value_add_.append(keywords_abb_refine[abb_keys.upper()])
    
    keywords_value_add.append(list(np.unique(keywords_value_add_)))

# create "additional" keywords dictionary
keywords_dict_add_dashabb = dict(zip(keywords_raw, keywords_value_add))
with open("keywords_dict_add_dashabb.json", "w") as j:
    json.dump(keywords_dict_add_dashabb, j, ensure_ascii=False, indent=2)  

CPU times: user 6min 46s, sys: 78.2 ms, total: 6min 46s
Wall time: 6min 46s


### 3.9. [manual] create dictionary for manual replacement.
* `keywords_manual.json`

### 3.10. update manual change on keywords
* Update `keywords_manual` on `keywords_dict` and `keywords_dict_add_dashabb`, save as `keywords_dict2.json` and `keywords_dict_add_dashabb2.json`

In [14]:
with open("keywords_manual.json", "r") as j:
    keywords_manual = json.load(j)
    
with open("keywords_dict_add_dashabb.json", "r") as j:
    keywords_dict_add_dashabb = json.load(j)
    
with open("keywords_dict.json", "r") as j:
    keywords_dict = json.load(j)    

In [15]:
manual_keys = list(keywords_manual.keys())
dict_values = list(keywords_dict.values())
add_dashabb_values = list(keywords_dict_add_dashabb.values())

In [16]:
dict_values_update = []

for values in dict_values:
    value_new = []
    for value in values:
        for mkey in manual_keys:
            mkey_ = mkey.lower()
            value = value.replace(mkey_, keywords_manual[mkey])
        value_new.append(value)
    dict_values_update.append(value_new)
    
keywords_dict = dict(zip(keywords_raw, dict_values_update))
with open("keywords_dict2.json", "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)        

In [17]:
add_dashabb_values_update = []

for values in add_dashabb_values:
    value_new = []
    for value in values:
        for mkey in manual_keys:
            mkey_ = mkey.lower()
            value = value.replace(mkey_, keywords_manual[mkey])
        value_new.append(value)
    add_dashabb_values_update.append(value_new)
    
keywords_dict_add_dashabb = dict(zip(keywords_raw, add_dashabb_values_update))
with open("keywords_dict_add_dashabb2.json", "w") as j:
    json.dump(keywords_dict_add_dashabb, j, ensure_ascii=False, indent=2)      

### 3.10. [manual] check still remaining same words

In [18]:
import itertools

merged = list(itertools.chain.from_iterable([dict_values_update, add_dashabb_values_update]))
merged = list(itertools.chain.from_iterable(merged))
merged_u = np.unique(merged, return_counts=True)

words_u = merged_u[0]
   

df_words = pd.DataFrame({"keywords": merged_u[0],
                         "counts": merged_u[1]
                        }).sort_values("counts", ascending=False)
df_words.head(10)

Unnamed: 0,keywords,counts
10224,photovoltaic,1731
5291,generation,257
8125,micro-grid,155
11612,pv-system,112
4287,energy-storage,108
6915,irradiance,106
5601,grid-connected,92
10982,power-point,91
7930,maximum-power-point,85
8585,mpp tracker,85


#### 3.10.(1) same words with and without dash("-"): insert dash

In [19]:
keywords_dash_update = {}

count = 0
for word in words_u:
    word_wo_dash = word.replace("-", "")
    if (word_wo_dash in words_u) and ("-" in word) :
        count += 1
        
        det_0 = all([len(w) > 5 for w in word.split(" ") if "-" in w])
        det_1 = all([len(w) >= 2 for w in word.split("-")]) 
#         print(count, word, det_0, det_1)
        
        if det_0 and det_1:
            keywords_dash_update.update({word_wo_dash: word})

# print(keywords_dash_update)
print(count)

125


In [20]:
list(keywords_dash_update.keys())[:10]

['ac microgrid',
 'antireflection',
 'antireflection coating',
 'asymmetrical fault',
 'bidirectional',
 'bidirectional converter',
 'bidirectional dc/dc converter',
 'bifacial',
 'cogeneration',
 'cosimulation']

In [21]:
dash_update_keys = list(keywords_dash_update.keys())
dict_values_update_ = deepcopy(dict_values_update)

count = 0
for i, vs in enumerate(dict_values_update):
    for j, v in enumerate(vs):
        for key in dash_update_keys:
            if v == key:
                count += 1
                dict_values_update_[i][j] = keywords_dash_update[key]

print(count)
dict_values_update = deepcopy(dict_values_update_)

keywords_dict = dict(zip(keywords_raw, dict_values_update_))
with open("keywords_dict3.json", "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)      

167


In [22]:
dash_update_keys = list(keywords_dash_update.keys())
add_dashabb_values_update_ = deepcopy(add_dashabb_values_update)

count = 0
for i, vs in enumerate(add_dashabb_values_update):
    for j, v in enumerate(vs):
        for key in dash_update_keys:
            if v == key:
                count += 1
                add_dashabb_values_update_[i][j] = keywords_dash_update[key]

print(count)
add_dashabb_values_update = deepcopy(add_dashabb_values_update_)

keywords_dict_add_dashabb = dict(zip(keywords_raw, add_dashabb_values_update_))
with open("keywords_dict_add_dashabb3.json", "w") as j:
    json.dump(keywords_dict_add_dashabb, j, ensure_ascii=False, indent=2)      

39


In [23]:
merged = list(itertools.chain.from_iterable([dict_values_update_, add_dashabb_values_update_]))
merged = list(itertools.chain.from_iterable(merged))
merged_u = np.unique(merged, return_counts=True)

words_u = merged_u[0]
   

df_words = pd.DataFrame({"keywords": merged_u[0],
                         "counts": merged_u[1]
                        }).sort_values("counts", ascending=False)
df_words.head(10)

Unnamed: 0,keywords,counts
10153,photovoltaic,1731
5265,generation,257
8089,micro-grid,167
11536,pv-system,112
4266,energy-storage,108
6884,irradiance,106
5575,grid-connected,93
10907,power-point,91
7894,maximum-power-point,85
8542,mpp tracker,85


#### 3.10.(2) same words with and without space(" "): remove space

In [24]:
keywords_space_update = {}

count = 0
for word in words_u:
    word_wo_space = word.replace(" ", "")
    if (word_wo_space in words_u) and (" " in word) :
        count += 1
        
        det_0 = all([len(w) > 5 for w in word.split(" ") if "-" in w])
        det_1 = all([len(w) >= 2 for w in word.split("-")]) 
#         print(count, word, det_0, det_1)
        
        if det_0 and det_1:
            keywords_space_update.update({word: word_wo_space})

# print(keywords_space_update)
print(count)

28


In [25]:
keywords_space_update

{'auto regressive': 'autoregressive',
 'back propagation': 'backpropagation',
 'co2 emission': 'co2emission',
 'dig silent': 'digsilent',
 'energy plan': 'energyplan',
 'energy plus': 'energyplus',
 'global grid': 'globalgrid',
 'hydro power': 'hydropower',
 'iec 61850': 'iec61850',
 'lab view': 'labview',
 'levenberg- marquardt': 'levenberg-marquardt',
 'light gbm': 'lightgbm',
 'mat lab': 'matlab',
 'matlab / simulink': 'matlab/simulink',
 'matlab/ simulink': 'matlab/simulink',
 'micro controller': 'microcontroller',
 'micro converter': 'microconverter',
 'micro generation': 'microgeneration',
 'micro source': 'microsource',
 'nano fluid': 'nanofluid',
 'p o': 'po',
 'perturb & observe': 'perturb&observe',
 'photo voltaic': 'photovoltaic',
 'photovoltaic/ thermal': 'photovoltaic/thermal',
 'power hardware-in-the-loop': 'powerhardware-in-the-loop',
 'ret screen': 'retscreen',
 'smart grid': 'smartgrid'}

In [26]:
with open("keywords_dict3.json", "r") as j:
    keywords_dict = json.load(j)    
    
space_update_keys = list(keywords_space_update.keys())
dict_values_update_ = deepcopy(dict_values_update)

count = 0
for i, vs in enumerate(dict_values_update):
    for j, v in enumerate(vs):
        for key in space_update_keys:
            if v == key:
                count += 1
                dict_values_update_[i][j] = keywords_space_update[key]

print(count)
dict_values_update = deepcopy(dict_values_update_)

keywords_dict = dict(zip(keywords_raw, dict_values_update_))
with open("keywords_dict3.json", "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)      

48


In [27]:
with open("keywords_dict_add_dashabb3.json", "r") as j:
    keywords_dict_dashabb3 = json.load(j)  

dash_update_keys = list(keywords_dash_update.keys())
add_dashabb_values_update_ = deepcopy(add_dashabb_values_update)

count = 0
for i, vs in enumerate(add_dashabb_values_update):
    for j, v in enumerate(vs):
        for key in dash_update_keys:
            if v == key:
                count += 1
                add_dashabb_values_update_[i][j] = keywords_dash_update[key]

print(count)
add_dashabb_values_update = deepcopy(add_dashabb_values_update_)

keywords_dict_add_dashabb = dict(zip(keywords_raw, add_dashabb_values_update_))
with open("keywords_dict_add_dashabb3.json", "w") as j:
    json.dump(keywords_dict_add_dashabb, j, ensure_ascii=False, indent=2)    

0


In [28]:
merged = list(itertools.chain.from_iterable([dict_values_update_, add_dashabb_values_update_]))
merged = list(itertools.chain.from_iterable(merged))
merged_u = np.unique(merged, return_counts=True)

words_u = merged_u[0]
   

df_words = pd.DataFrame({"keywords": merged_u[0],
                         "counts": merged_u[1]
                        }).sort_values("counts", ascending=False)
df_words.head(10)

Unnamed: 0,keywords,counts
10133,photovoltaic,1736
5260,generation,257
8072,micro-grid,167
11514,pv-system,112
4261,energy-storage,108
6876,irradiance,106
5569,grid-connected,93
10885,power-point,91
7881,maximum-power-point,85
8525,mpp tracker,85


#### 3.10.(3) same words with dash("-") and space(" "): insert dash

In [29]:
keywords_dash_update = {}

count = 0
for word in words_u:
    word_wo_dash = word.replace("-", " ")
    if (word_wo_dash in words_u) and ("-" in word) :
        count += 1
        
        det_0 = all([len(w) > 5 for w in word.split(" ") if "-" in w])
        det_1 = all([len(w) >= 2 for w in word.split("-")]) 
#         print(count, word, det_0, det_1)
        
        if det_0 and det_1:
            keywords_dash_update.update({word_wo_dash: word})

# print(keywords_dash_update)
print(count)

380


In [30]:
list(keywords_dash_update.keys())[:10]

['agent based model',
 'agent based modeling',
 'air conditioning',
 'amorphous silicon',
 'arc flash',
 'back to back converter',
 'black box model',
 'black box modeling',
 'black start',
 'boost converter']

In [31]:
with open("keywords_dict3.json", "r") as j:
    keywords_dict = json.load(j)    
    
dash_update_keys = list(keywords_dash_update.keys())
dict_values_update_ = deepcopy(dict_values_update)

count = 0
for i, vs in enumerate(dict_values_update):
    for j, v in enumerate(vs):
        for key in dash_update_keys:
            if v == key:
                count += 1
                dict_values_update_[i][j] = keywords_dash_update[key]

print(count)
dict_values_update = deepcopy(dict_values_update_)

keywords_dict = dict(zip(keywords_raw, dict_values_update_))
with open("keywords_dict4.json", "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)      

796


In [32]:
with open("keywords_dict_add_dashabb3.json", "r") as j:
    keywords_dict_dashabb3 = json.load(j)  

dash_update_keys = list(keywords_dash_update.keys())
add_dashabb_values_update_ = deepcopy(add_dashabb_values_update)

count = 0
for i, vs in enumerate(add_dashabb_values_update):
    for j, v in enumerate(vs):
        for key in dash_update_keys:
            if v == key:
                count += 1
                add_dashabb_values_update_[i][j] = keywords_dash_update[key]

print(count)
add_dashabb_values_update = deepcopy(add_dashabb_values_update_)

keywords_dict_add_dashabb = dict(zip(keywords_raw, add_dashabb_values_update_))
with open("keywords_dict_add_dashabb4.json", "w") as j:
    json.dump(keywords_dict_add_dashabb, j, ensure_ascii=False, indent=2)    

371


In [34]:
merged = list(itertools.chain.from_iterable([dict_values_update, add_dashabb_values_update]))
merged = list(itertools.chain.from_iterable(merged))
merged_u = np.unique(merged, return_counts=True)

words_u = merged_u[0]
   

df_words = pd.DataFrame({"keywords": merged_u[0],
                         "counts": merged_u[1]
                        }).sort_values("counts", ascending=False)
df_words.head(10)

Unnamed: 0,keywords,counts
9924,photovoltaic,1736
5159,generation,257
7901,micro-grid,176
4187,energy-storage,120
11270,pv-system,118
7715,maximum-power-point,113
6737,irradiance,106
10144,photovoltaic-thermal,103
13329,solar-photovoltaic,96
5450,grid-connected,96


#### 3.10.(4) apply `unique()` on values
* save results in `keywords_dict_u.json`

In [45]:
with open("keywords_dict4.json", "r") as j:
    keywords_dict = json.load(j)   
dict_values = list(keywords_dict.values())
dict_keys = list(keywords_dict.keys())

with open("keywords_dict_add_dashabb4.json", "r") as j:
    keywords_dict_dashabb = json.load(j)  
dict_dashabb_values = list(keywords_dict_dashabb.values())

value_unique = []
for i, (dict_value, dashabb_value) in enumerate(zip(dict_values, dict_dashabb_values)):
    v1 = list(itertools.chain.from_iterable([v.split("; ") for v in dict_value]))
    v2 = list(itertools.chain.from_iterable([v.split("; ") for v in dashabb_value]))
    value_unique.append(list(set(v1 + v2)))
    
keywords_dict_u = dict(zip(keywords_raw, value_unique))
with open("keywords_dict_u.json", "w") as j:
    json.dump(keywords_dict_u, j, ensure_ascii=False, indent=2)        

### 3.11. importing "knowledge"
#### 3.11.(1) create knowledge graph

In [117]:
keywords_knowledge_init = {
    "statistics": [
        "analysis of variance", 
        "autoregressive integrated moving average", 
        "autoregressive moving average", 
        "markov", 
        "kalman", 
        "bayes", 
        "gaussian process", 
        "autoregression", 
        "autoregressive", 
        "probabilistic", 
        "k-fold", 
        "ensemble", 
        "kruskal-wallis" 
    ], 

    "method": [
        "taguchi", 
        "averaging point method", 
        "describing function method", 
        "finite difference time domain method", 
        "finite element method", 
        "group method of data handling", 
        "incremental conductance method", 
        "match evaluation method", 
        "multiple-shifted-frequency method", 
        "numerical method", 
        "oblique asymptote method", 
        "response surface methodology", 
        "steepest descent method", 
        "nelder-mead", 
        "newton-raphson", 
        "runge-kutta", 
        "conditional interpolation", 
        "expectation-maximization", 
        "dynamic simulation", 
        "empirical", 
        "emulation", 
        "gradient descent", 
        "algorithm" 
    ],

    "metric": [
        "mean square error", 
        "mean absolute error", 
        "mean absolute percentage error", 
        "cross-entropy", 
        "least-square",
        "mean bias error"
    ], 

    "machine learning": [
        "clustering", 
        "regression", 
        "classification", 
        "dimension reduction", 
        "reinforcement learning", 
        "ensemble", 
        "natural language processing"
    ], 

    "doc2vec":[
        "natural language processing"
    ], 

    "reinforcement learning": [
        "sarsa", 
        "markov"
    ],
    
    "ensemble": [
        "random forest", 
        "adaboost",
        "bagging",
        "bootstrap", 
        "lightgbm", 
        "xgboost"
    ], 
    
    "data": [
        "database",
        "data acquisition",
        "data-mining",
        "data-driven",
        "data-based"
    ],
    
    "database": [
        "mapreduce",
        "sql",
        "hadoop"
    ],
    
    "classification": [
        "k-nearest neighbor",
        "support vector machine",
        "neural network"
    ],
    
    "regression": [ 
        "k-nearest neighbor",
        "support vector machine",
        "neural network",
        "ridge",
        "lasso",
        "autoregressive",
        "support vector regression"
    ],
    
    "clustering": [
        "k means", 
        "dbscan"
    ],
    
    "neural network": [
        "artificial neural network",
        "learning vector quantization",
        "recurrent neural network",
        "convolutional neural network",
        "autoencoder",
        "adaline",
        "artificial neural fuzzy inference system",
        "elman",
        "attention",
        "extreme learning machine",
        "multilayer perceptron",
    ],
    
    "recurrent neural network": [
        "long short-term memory",
        "boltzmann machine",
    ],
    
    "convolutional neural network": [
        "googlenet"
    ], 

    "dimension reduction": [
        "principal component analysis",
        "factor analysis",
        "autoencoder"
    ],
    
    "algorithm": [
        "agent-based",
        "ant colony", 
        "ant lion", 
        "artificial bee colony", 
        "artificial fish swarm", 
        "backtracking search", 
        "bacterial foraging", 
        "bat", 
        "bee pollinator", 
        "binary search", 
        "bio-inspired", 
        "bucket elimination", 
        "crow search", 
        "elite retention", 
        "evolutionary", 
        "firefly", 
        "fireworks explosion", 
        "flower pollination", 
        "fruitfly", 
        "genetic algorithm", 
        "golden section", 
        "grasshopper", 
        "gravity search", 
        "grey wolf", 
        "imperialist competition", 
        "jaya", 
        "leapfrog", 
        "honey bee mating", 
        "interior search", 
        "invasive weed", 
        "elephant herding", 
        "particle swarm", 
        "pattern search", 
        "perturb and observe", 
        "shuffled frog leaping", 
        "versatile threshold", 
        "monte carlo", 
        "rule-based", 
        "dynamic programming"
    ],
    
    "dynamic programming": [
        "reinforcement learning"
    ],
    
    "building-integrated": [
        "bapv",
        "bipv",
        "biss",
        "bispv",
        "bispvt",
        "bipv",
        "bipvt",
        "bipv/t",
        "bist",
        "bists"
    ],
    
    "thermal": [
        "bapvt",
        "bapv/t",
        "bispvt",
        "bipvt",
        "bipv/t",
        "bist",
        "bists"
    ],
    
    "photovoltaic": [
        "bapv",
        "bispv",
        "bispvt",
        "bipv",
        "biss",
        "bipv",
        "bipvt",
        "bipv/t",
        "bist",
        "bists",
        "agrivoltaic"
    ],
    
    "agriculture": [
        "agrivoltaic"
    ]
    
    
}

with open("keywords_knowledge_init.json", "w") as j:
    json.dump(keywords_knowledge_init, j, ensure_ascii=False, indent=2) 

In [118]:
kn_keys_init = list(keywords_knowledge_init.keys())
kn_values_init = list(keywords_knowledge_init.values())

In [119]:
kn_values_all = list(itertools.chain.from_iterable(kn_values_init))

#### 3.11.(2) Create knowledge graph
* apply DFS to create knowledge graph

In [120]:
def find_key(value):
    result_key = []
    for k in kn_keys_init:
        if value in keywords_knowledge_init[k]:
            result_key.append(k)
            
    return result_key

def dfs(graph, start_node):
    visit = []
    stack = []
    
    stack.append(start_node)
    
    while stack:
        node = stack.pop()
        if node not in visit:
            visit.append(node)
            stack.extend(find_key(node))
    return visit

In [121]:
keywords_knowledge = {}
for kn in kn_values_all:
    value_add = dfs(keywords_knowledge_init, kn)
    keywords_knowledge.update({kn: value_add})

In [122]:
keywords_knowledge

{'analysis of variance': ['analysis of variance', 'statistics'],
 'autoregressive integrated moving average': ['autoregressive integrated moving average',
  'statistics'],
 'autoregressive moving average': ['autoregressive moving average',
  'statistics'],
 'markov': ['markov',
  'reinforcement learning',
  'dynamic programming',
  'algorithm',
  'method',
  'machine learning',
  'statistics'],
 'kalman': ['kalman', 'statistics'],
 'bayes': ['bayes', 'statistics'],
 'gaussian process': ['gaussian process', 'statistics'],
 'autoregression': ['autoregression', 'statistics'],
 'autoregressive': ['autoregressive',
  'regression',
  'machine learning',
  'statistics'],
 'probabilistic': ['probabilistic', 'statistics'],
 'k-fold': ['k-fold', 'statistics'],
 'ensemble': ['ensemble', 'machine learning', 'statistics'],
 'kruskal-wallis': ['kruskal-wallis', 'statistics'],
 'taguchi': ['taguchi', 'method'],
 'averaging point method': ['averaging point method', 'method'],
 'describing function met

#### 3.11.(3) Create knowledge dictionary and save
* save as `keywords_dict_kn.json`

In [123]:
dict_values_kn = []
for values in dict_values:
    dict_values_kn_ = []

    for value in values:
        for kn_key in kn_values_all:        
            # multiple words (ex. monte carlo, rule-based):
            if len(kn_key.replace("-", " ").split(" ")) > 1:
                kn_key0 = "-".join(kn_key.replace("-"," ").replace("_", " ").split(" "))
                if kn_key0 in value.replace(" ","-").replace("_", "-"):
                    dict_values_kn_.extend(keywords_knowledge[kn_key])
                
            # single word (ex. ensemble)
            else:
                if kn_key in value.replace("-"," ").split(" "):
                    dict_values_kn_.extend(keywords_knowledge[kn_key])
            
    dict_values_kn.append(list(set(dict_values_kn_)))


keywords_dict_kn = dict(zip(dict_keys, dict_values_kn))
with open("keywords_dict_kn.json", "w") as j:
    json.dump(keywords_dict_kn, j, ensure_ascii=False, indent=2) 

### 3.12. Update frequent values
* "frequent" keywords should not be overlapped with "dash-words", "abbriviations" and "knowledge graph"

#### 3.12.(1) load data
* refine words in (dict values - dashabb - knowledge)

In [124]:
with open("keywords_dict.json", "r") as j:
    keywords_dict = json.load(j)   
dict_values = list(keywords_dict.values())
dict_keys = list(keywords_dict.keys())

with open("keywords_abb_refine.json", "r") as j:
    keywords_dict_abb = json.load(j)  
dict_abb_keys = list(keywords_dict_abb.keys())
dict_abb_values = list(keywords_dict_abb.values())

with open("keywords_dash_refine2.json", "r") as j:
    keywords_dict_dash = json.load(j)  
dict_dash_values = list(keywords_dict_dash.values())
dict_dash_values_u = list(itertools.chain.from_iterable(dict_dash_values))

with open("keywords_dict_kn.json", "r") as j:
    keywords_dict_kn = json.load(j)  
dict_kn_values = list(keywords_dict_kn.values())

with open("keywords_manual.json", "r") as j:
    keywords_dict_man = json.load(j)  
dict_man_keys = list(keywords_dict_man.keys())

In [125]:
dict_values[90:110]

[['ac impedance model'],
 ['ac micro-grid'],
 ['ac microgrid'],
 ['ac microgrids'],
 ['ac microgrids'],
 ['ac modelling'],
 ['ac module'],
 ['ac shunted system and matlab/simulink'],
 ['ac small-signal model'],
 ['ac-dc microgrid'],
 ['ac-dc power converter'],
 ['ac-stacked pv inverter'],
 ['ac-stacked pv inverter'],
 ['ac-stacked inverter'],
 ['ac-stacked inverter'],
 ['ac-stacked photovoltaic inverter'],
 ['ac/dc grid'],
 ['ac/dc hybrid microgrid'],
 ['ac/dc hybrid system'],
 ['ac algorithm']]

In [126]:
dict_dash_values[90:110]

[['boost-self'],
 ['boost-type'],
 ['bos-costs'],
 ['bottom-up'],
 ['bouguer-lambert'],
 ['box-behnken'],
 ['artificial neural network'],
 ['bpfpa-bee'],
 ['branch-and-bound'],
 ['bridge-link'],
 ['bridge-linked'],
 ['brute-force'],
 ['buck-boost'],
 ['building-applied'],
 ['building-integrated'],
 ['building-integration'],
 ['built-in'],
 ['buk-boost'],
 ['buoyancy-induced'],
 ['c-c']]

#### 3.12.(2) extract keywords

In [127]:
values_freq = []
for values in dict_values:
    value_net = []
    dash_net = []
    abb_net = []
    
    values_summary = []
    for value in values:
        
        # abbriviation detection
        value_ = value.replace(" ", "-").replace("/", "-")
        value_split = value_.split("-")
        value_assemble = []
        abb_removal = []
        for value_s in value_split:
            for abb in dict_abb_keys:
                abb_ = abb.lower()
                if abb_ in value_split and keywords_dict_abb[abb] not in abb_net:
                    abb_net.append(keywords_dict_abb[abb])
                    abb_removal.append(abb_)
                
        value_filtered = "-".join(["" if v in abb_removal else v for v in value_split]).rstrip("-").lstrip("-")
        
        # dash-words detection
        dash_removal = []
        for dash in dict_dash_values_u:
            dash0 = dash.replace("-", "")
            dash1 = dash.replace("-", " ")
            if (dash in value_filtered) and (dash not in dash_net) and len(dash)>=4:
                dash_net.append(dash)
                dash_removal.append(dash)
            elif (dash0 in value_filtered) and (dash not in dash_net) and len(dash)>=4:
                dash_net.append(dash)
                dash_removal.append(dash0)
            elif (dash1 in value_filtered) and (dash not in dash_net) and len(dash)>=4:
                dash_net.append(dash)
                dash_removal.append(dash1)
        
        for dash_rm in dash_removal:
            value_filtered = value_filtered.replace(dash_rm, "")
        
        value_net.extend([v for v in value_filtered.split("-") if len(v)>=2])
    
        # manual correction
        value_sum = value_net + abb_net + dash_net
        for i, v in enumerate(value_sum):
            for mkey in dict_man_keys:
                mkey_ = mkey.lower()
                v_re = v.replace(mkey_, keywords_dict_man[mkey])
                value_sum[i] = v_re
        
        values_summary.extend(list(set(value_sum)))
    
    values_freq.append(values_summary)

In [128]:
keywords_dict_kw = dict(zip(dict_keys, values_freq))
with open("keywords_dict_kw.json", "w") as j:
    json.dump(keywords_dict_kw, j, ensure_ascii=False, indent=2) 

#### 3.12.(3) sort keywords with counts

In [129]:
with open("keywords_dict_kw.json", "r") as j:
    keywords_dict_kw = json.load(j)  
dict_kw_values = list(keywords_dict_kw.values())

values_freq = list(itertools.chain.from_iterable(dict_kw_values))
values_freq = np.unique(values_freq, return_counts=True)    

df_words = pd.DataFrame({"keywords": values_freq[0],
                         "counts": values_freq[1]
                        }).sort_values("counts", ascending=False).reset_index(drop=True)
df_words.head(20)

Unnamed: 0,keywords,counts
0,photovoltaic,1829
1,system,1233
2,power,898
3,model,864
4,solar,848
5,energy,755
6,modeling,483
7,control,454
8,and,405
9,network,329


#### 3.12.(4) apply 'stopwords'
* remove meaningless words by 'stop words'

In [130]:
stop_words = ["and", "of", "the", "to", "on", "in", "based", "base"]

for sword in stop_words:
    idx = df_words[df_words["keywords"] == sword ].index
    df_words.drop(idx, axis=0, inplace=True)
    
for i in range(ord("a"), ord("z")+1):
    idx = df_words[df_words["keywords"] == chr(i)].index
    df_words.drop(idx, axis=0, inplace=True)

df_words.head(20)

Unnamed: 0,keywords,counts
0,photovoltaic,1829
1,system,1233
2,power,898
3,model,864
4,solar,848
5,energy,755
6,modeling,483
7,control,454
9,network,329
11,converter,304


#### 3.12.(3) check ranks of some important words

In [131]:
np.where(df_words["keywords"]=="shadow")

(array([571]),)

In [132]:
np.where(df_words["keywords"]=="building")

(array([37]),)

In [133]:
np.where(df_words["keywords"]=="based")

(array([], dtype=int64),)

In [134]:
np.where(df_words["keywords"]=="building-integrated")

(array([161]),)

In [135]:
words_single_1000 = df_words["keywords"].iloc[:1000].tolist()
print(words_single_1000[:35])
# print(words_single_1200[-20:])

['photovoltaic', 'system', 'power', 'model', 'solar', 'energy', 'modeling', 'control', 'network', 'converter', 'analysis', 'hybrid', 'algorithm', 'gen-eration', 'cell', 'inverter', 'prediction', 'thermal', 'optimization', 'battery', 'modelling', 'load', 'simulation', 'method', 'voltage', 'module', 'renewable', 'distributed', 'grid', 'wind', 'dynamic', 'forecasting', 'distribution', 'management', 'micro-grid']


In [136]:
np.where(df_words["counts"]>30)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

#### 3.12.(4) extract top N words
* extract words in top 1000
* save the words in `keywords_dict_freq.json`

In [137]:
with open("keywords_dict_kw.json", "r") as j:
    keywords_dict_kw = json.load(j)  
dict_kw_values = list(keywords_dict_kw.values())

In [138]:
words_manual = []
words_select = words_single_1000 + words_manual

freq_values = []
for values in dict_kw_values :
    values_add = []
    for w in words_select:
        values_add.append([w for v in values if w in v.replace("-", " ").split(" ")])
    freq_values.append(list(set(itertools.chain.from_iterable(values_add))))

print(len(freq_values))
keywords_dict_freq = dict(zip(list(keywords_dict_kw.keys()), freq_values))
with open("keywords_dict_freq.json", "w") as j:
    json.dump(keywords_dict_freq, j, ensure_ascii=False, indent=2)      

19457


In [139]:
keywords_dict_freq["artificial neural network"]

['artificial', 'network', 'neural']

## 4. Keywords Summing up 
* `dict_u` + `dict_kw` + `dict_kn`

In [140]:
with open("keywords_dict_freq.json", "r") as j:
    keywords_dict_freq = json.load(j)   
dict_freq_values = list(keywords_dict_freq.values())

### 4.1. gathering cleansed keywords

In [141]:
df_dict_sum = pd.DataFrame({"0_keywords": dict_keys,
                            "1_unique": dict_values,
                            "2_freq": dict_freq_values,
                            "3_knowledge": dict_kn_values
                           })
df_dict_sum.to_csv("keywords_summary.csv", index=False)
df_dict_sum.head(15)

Unnamed: 0,0_keywords,1_unique,2_freq,3_knowledge
0,"""Economic benefits""",[economic benefit],[economic],[]
1,"""Finance lease""",[finance lease],[],[]
2,"""Household distributed photovoltaic""",[household distributed photovoltaic],"[photovoltaic, household, distributed]",[]
3,"""LV networks",[lv network],[network],[]
4,"""Photovoltaic loan""",[photovoltaic loan],[photovoltaic],[]
5,% free PSA (% FPSA),[% free psa],[free],[]
6,"(EPV,Qd) operating points",[operating point],"[operating, point]",[]
7,(PV) Photovoltaic panel,[photovoltaic panel],"[photovoltaic, panel]",[]
8,(V-P) voltage power,[voltage power],"[voltage, power]",[]
9,-Artificial-Neural-Network,[artificial neural network],"[artificial, network, neural]","[neural network, machine learning, classificat..."


### 4.2. cleaning redundancy

In [142]:
dict_freq_values[10]

['artificial', 'algorithm', 'network', 'neural']

In [143]:
dict_kn_values[10]

['method',
 'neural network',
 'machine learning',
 'algorithm',
 'classification',
 'regression',
 'artificial neural network']

In [144]:
kn_value = ['artificial neural network']
freq_value = ['network', 'artificial']

for kn in kn_value:
    print(len(set(kn.split(" ")) - set(freq_value)))

1


In [145]:
freq_values_refine = []
for i, (freq_value, kn_value) in enumerate(zip(dict_freq_values, dict_kn_values)):
    value_rm = []
    for kn in kn_value:
        if len(set(kn.split(" ")) - set(freq_value)) == 0:
            value_rm.extend(kn.split(" "))
        if len(set(kn.split("-")) - set(freq_value)) == 0:
            value_rm.extend(kn.split(" "))
    
    freq_values_refine.append(list(set(freq_value)-set(value_rm)))

df_dict_sum = pd.DataFrame({"0_keywords": dict_keys,
                            "1_unique": dict_values,
                            "2_freq1000": freq_values_refine,
                            "3_knowledge": dict_kn_values
                           })
df_dict_sum.to_csv("keywords_summary.csv", index=False)
df_dict_sum.head(15)

Unnamed: 0,0_keywords,1_unique,2_freq1000,3_knowledge
0,"""Economic benefits""",[economic benefit],[economic],[]
1,"""Finance lease""",[finance lease],[],[]
2,"""Household distributed photovoltaic""",[household distributed photovoltaic],"[photovoltaic, household, distributed]",[]
3,"""LV networks",[lv network],[network],[]
4,"""Photovoltaic loan""",[photovoltaic loan],[photovoltaic],[]
5,% free PSA (% FPSA),[% free psa],[free],[]
6,"(EPV,Qd) operating points",[operating point],"[operating, point]",[]
7,(PV) Photovoltaic panel,[photovoltaic panel],"[photovoltaic, panel]",[]
8,(V-P) voltage power,[voltage power],"[voltage, power]",[]
9,-Artificial-Neural-Network,[artificial neural network],[],"[neural network, machine learning, classificat..."


### 4.2. create summation of keywords

In [146]:
df_dict_sum["2_or_3"] = df_dict_sum["2_freq1000"] + df_dict_sum["3_knowledge"]
df_dict_sum["2_or_3"].apply(np.unique)

# df_dict_sum["1_or_2_or_3"] = df_dict_sum["1_unique"] + df_dict_sum["2_or_3"]
# df_dict_sum["1_or_2_or_3"].apply(np.unique)
df_dict_sum.head(20)

Unnamed: 0,0_keywords,1_unique,2_freq1000,3_knowledge,2_or_3
0,"""Economic benefits""",[economic benefit],[economic],[],[economic]
1,"""Finance lease""",[finance lease],[],[],[]
2,"""Household distributed photovoltaic""",[household distributed photovoltaic],"[photovoltaic, household, distributed]",[],"[photovoltaic, household, distributed]"
3,"""LV networks",[lv network],[network],[],[network]
4,"""Photovoltaic loan""",[photovoltaic loan],[photovoltaic],[],[photovoltaic]
5,% free PSA (% FPSA),[% free psa],[free],[],[free]
6,"(EPV,Qd) operating points",[operating point],"[operating, point]",[],"[operating, point]"
7,(PV) Photovoltaic panel,[photovoltaic panel],"[photovoltaic, panel]",[],"[photovoltaic, panel]"
8,(V-P) voltage power,[voltage power],"[voltage, power]",[],"[voltage, power]"
9,-Artificial-Neural-Network,[artificial neural network],[],"[neural network, machine learning, classificat...","[neural network, machine learning, classificat..."


### 4.3. save as json file
* save as `keywords_dict_final.json`

In [147]:
# keywords_dict_final = dict(zip(dict_keys, df_dict_sum["1_or_2_or_3"].values))
keywords_dict_final = dict(zip(dict_keys, df_dict_sum["2_or_3"].values))
with open("keywords_dict_final.json", "w") as j:
    json.dump(keywords_dict_final, j, ensure_ascii=False, indent=2) 

## 5. Create WOS-format file

In [148]:
WOSfilename = "BIPV_ML_all.txt"
outfilename = "BIPV_ML_nltk.txt"
kwfilename = "keywords_dict_final.json"

manualfilename = "keywords_manual.json"
with open("keywords_manual.json", "r") as j:
    keywords_manual = json.load(j)   
manual_keys = list(keywords_manual.keys())

### 5.1 combine with other data
### 5.2. extract keywords from abstract, too

In [149]:
count = 0
# keywords_out_dict = {}

with open(WOSfilename, "r") as infile:
    with open(outfilename, "w") as outfile:
        line = " "
        
        while line != "EF":
            line = infile.readline()
            if line[:3] == "DE ":
                line_DE = line
                line_ID = infile.readline()
                line_AB = infile.readline()
                
                count += 1
            
                # 1. Read from dictionary (keywords_dict_final.json)
                kws_in = line_DE[3:].rstrip("\n").split("; ")
                kws_out = []
                for kw_in in kws_in:
                    try:
                        keyword = keywords_dict_final[kw_in]
                        keyword = [re.sub(r'\([^)]*\)', "", keyword_).rstrip(' ').lstrip(' ').lower() for keyword_ in keyword]
                        kws_out += [k for k in keyword if len(k)>1]
                    except:
                        pass
#                         print(count, kw_in)
                
                # 2. Add keywords in relative dictionary
                kws_out_ = deepcopy(kws_out)
                for kw_out in kws_out:
                    for manual_key in manual_keys:
                        if manual_key.lower() in kw_out:
                            manual_value = keywords_manual[manual_key]
                            # from knowledge
                            kn_values = dfs(keywords_knowledge_init, manual_value)
                            kws_out_ += [k for k in kn_values if len(k)>1]
                
                # 3. Keyword mining from abstract
                for manual_key in manual_keys:
                    if manual_key.lower() in line_AB:
                        manual_value = keywords_manual[manual_key]
                        # from knowledge
                        kn_values = dfs(keywords_knowledge_init, manual_value)
                        kws_out_ += [k for k in kn_values if len(k)>1]
                
                kws_out_ = np.unique(kws_out_)
                kws_out = deepcopy(kws_out_)
                for kw_out in kws_out_:
                    for manual_key in manual_keys:
                        if manual_key.lower() in kw_out:
                            kws_out = [k for k in list(set(kws_out) | set(keywords_manual[manual_key])) if len(k)>1]
                
                # 4. cleaning bad keywords
                if 'none' in kws_out:
                    list(kws_out).remove("none")                
                
                # 5. write on outfile
                DE_out = "DE " + "; ".join(kws_out) + "\n"
                outfile.write(DE_out)
                outfile.write(line_ID)
                outfile.write(line_AB)
            else:
                outfile.write(line)

                
# with open("keywords_nltk_out.json", "w") as j:
#     json.dump(keywords_out, j, sort_keys=True, indent=2)
