In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
import json, re, nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from nltk.stem.wordnet import WordNetLemmatizer
Lem = WordNetLemmatizer()

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("talk")

In [3]:
scopus = pd.read_csv("scopus.csv")
keyword_org_name = "keywords.txt"
keyword_abb_name = "keyword_abb.json"
keyword_dash_name = "keywords_dash.txt"
keyword_dict_name = "keywords_dict.json"
keyword_single_name = "keywords_single.txt"
keyword_plural_name = "keywords_plural.json"
unicode_name = "unicode_json"

In [4]:
# unicode dictionary
with open('unicode_dict.json') as j:
    unicode_dict = json.load(j)

In [5]:
keywords = []
line = ''

with open("BIPV_ML_all.txt") as f:
    while line != "EF":
        line = f.readline()
        if line[:3] == "DE ":
            keywords_new = line[3:].rstrip('\n').split(";")
            for keyword_new in keywords_new:
                keywords += [keyword_new.rstrip(' ').lstrip(' ')]

keywords = sorted(keywords)
with open(keyword_org_name, "w", encoding="utf-8-sig") as f:
    for keyword in keywords:
        f.write(f"{keyword}\n")

In [66]:
# retreive keywords
keywords_raw_ = np.genfromtxt("keywords.txt", dtype="str", delimiter="\n", encoding='utf-8-sig') #utf-8-sig for \ufeff removal

# get unique ones
keywords_raw_ = np.unique(keywords_raw_)

# abbriviations detection (key candidates)
keywords_abb = {}
keywords_abb_key_cand = []
for keyword_raw_ in keywords_raw_:
    kwr = keyword_raw_.split()
    for kw in kwr:
        kw = kw.lstrip("(").rstrip(")")
        if (kw == kw.upper()) \
           and (kw.lstrip("(").rstrip(")") not in keywords_abb_key_cand) \
           and (ord(kw[0]) >= ord('A') and ord(kw[0]) <= ord('Z')) \
           and (len(kw) > 1):
            keywords_abb_key_cand.append(kw.lstrip("(").rstrip(")").rstrip(":").rstrip(","))

# convert to lower cases
keywords_raw = [kw.lower() for kw in keywords_raw_]

# duplicate for values
keywords_value = deepcopy(keywords_raw)

### keywords cleaning
keywords_dash = [] # additional dictionary, for words containing '-'

# remove staring characters
keywords_value = [kw.lstrip('-') for kw in keywords_value]
keywords_value = [kw.rstrip('-') for kw in keywords_value]
keywords_value = [kw.lstrip(':') for kw in keywords_value]
keywords_value = [kw.rstrip(':') for kw in keywords_value]

# remove brakets "()"
for kc in keywords_abb_key_cand:
    kc_ = kc.lower()
    for kw in keywords_value:
        if kc_ in kw.replace('(', "").replace(')', "").split():
            kc_value = re.sub(r'\([^)]*\)', "", kw).rstrip(' ').lstrip(' ')
            if (kc_ not in kc_value) and \
               (kc_value[-1] != ")") and \
               (len(kc_value.split()) > 1) and \
               ((kc not in list(keywords_abb.keys())) or (kc in list(keywords_abb.keys()) and len(kc_value) < len(keywords_abb[kc]) and len(kc_value.split()) > 1)):
                kc_value = kc_value.replace("  ", " ")
                kc_value = ' '.join(kc_value.split(' ')[:-1] + [Lem.lemmatize(kc_value.split(' ')[-1])])
                keywords_abb.update({kc:kc_value})

keywords_value = [re.sub(r'\([^)]*\)', "", kw).rstrip(' ').lstrip(' ').lstrip(')').lstrip('(') for kw in keywords_value]

# remove unicodes –
unicode_keys = list(unicode_dict.keys())
for ukey in unicode_keys:
        keywords_value = [kw.replace(ukey, unicode_dict[ukey]) for kw in keywords_value]

# remove mathematics
keywords_value = [kw.replace("\\infty", "infinity") for kw in keywords_value]
keywords_value = [kw.replace("\\mathrm{", "") for kw in keywords_value]
keywords_value = [kw.replace("{", "") for kw in keywords_value]
keywords_value = [kw.replace("}", "") for kw in keywords_value]

# remove '"'
keywords_value = [kw.replace('"', "") for kw in keywords_value]

# reduce needless blanks.
keywords_value = [re.sub("\s+", " ", kw) for kw in keywords_value]
keywords_value = [kw.replace(" -", "-") for kw in keywords_value]
keywords_value = [kw.replace("- ", "-") for kw in keywords_value]
keywords_value = [kw.replace(" //", "//") for kw in keywords_value]
keywords_value = [kw.replace("// ", "//") for kw in keywords_value]

# create single-word keyword
keywords_single = []
for kw in keywords_value:
    if len(kw.split()) == 1 and len(kw) <= 4:
        keywords_single.append(kw)

# Convert plural to singular
Lem = WordNetLemmatizer()
keywords_plural = {}
keywords_value_ = []
for i, kw in enumerate(keywords_value):
    if (kw not in keywords_single) and (len(kw.split(' ')) > 1):
        kw_ = ' '.join(kw.split(' ')[:-1] + [Lem.lemmatize(kw.split(' ')[-1])])
        keywords_value_.append(kw_)
    else:
        keywords_value_.append(kw)
        
    kw_ = kw.split(' ')
    for kw__ in kw_:
        if kw__ not in keywords_single and Lem.lemmatize(kw__) != kw__:
            keywords_plural.update({kw__: Lem.lemmatize(kw__)})

keywords_value = deepcopy(keywords_value_)
        
    
# keywords_value = [' '.join(kw.split(' ')[:-1] + [Lem.lemmatize(kw.split(' ')[-1])] if len(kw.split(' ')) > 1 and not in keywords_single else kw for kw in keywords_value]

keywords_value = [kw.lstrip(' ').rstrip(' ') for kw in keywords_value]
    
# create dash dictionary
for kw in keywords_value:
    kw_ = kw.split(" ")
    for kw__ in kw_:
        if ('-' in kw__) or ('//' in kw__):
            keywords_dash.append(kw__)
    
keywords_dash = np.unique(keywords_dash)

# create dictionary
keywords_dict = dict(zip(keywords_raw, keywords_value))
with open(keyword_dict_name, "w") as j:
    json.dump(keywords_dict, j, ensure_ascii=False, indent=2)
    
# create plural-singular dictionary
plural_sorted = dict(sorted(keywords_plural.items()))
with open(keyword_plural_name, "w") as j:
    json.dump(plural_sorted, j, ensure_ascii=False, indent=2)

# create abbreviation dictionary
abb_sorted = dict(sorted(keywords_abb.items()))
with open(keyword_abb_name, "w") as j:
    json.dump(abb_sorted, j, ensure_ascii=False, indent=2)

# create "dash" keywords list
with open(keyword_dash_name, "w", encoding="utf-8-sig") as f:
    for keyword in keywords_dash:
        f.write(f"{keyword}\n")

# create "dash" keywords list
with open(keyword_single_name, "w", encoding="utf-8-sig") as f:
    for keyword in keywords_single:
        f.write(f"{keyword}\n")

* find words matching with "keywords_dash" with same order.

In [131]:
keywords_value_sort = np.unique(keywords_value, return_counts=True)

df_keywords_value = pd.DataFrame({"keyword": keywords_value_sort[0],
                                         "counts": keywords_value_sort[1],
                                        }).sort_values("keyword")
df_keywords_value.to_csv("keywords_value.csv", index=False)

In [154]:
A = ["a", "b", "c"]
B = "def b ebbzf c"

tmp = [B.index(a) for a in A]
tmp1 = sorted(tmp)
print(tmp, tmp1)

ValueError: substring not found

In [175]:
df_keywords_value = pd.read_csv("keywords_value.csv")

df_tmp = df_keywords_value.iloc[30:60].reset_index(drop=True)
df_tmp["relwords"] = np.nan

for i in range(30):
    words0 = df_tmp["keyword"].iloc[i]
    words1 = words0.split(" ")
    words2 = words0.split("-")
    
    for kws in keywords_dash:
        kws_ = kws.replace("-", "")
        if kws_ in words1:
            print("try01", [df_tmp["keyword"].iloc[i]], i, kws, tmp)

        
        
        kws_ = kws.split('-')
        try:
            tmp = [words1.index(kw_) for kw_ in kws_]
            if tmp == sorted(tmp) and (len(np.unique(tmp)) == len(tmp)):
                print("try1", [df_tmp["keyword"].iloc[i]], i, kws, tmp)
        except ValueError:
            try:
                tmp = [words2.index(kw_) for kw_ in kws_]
                if tmp == sorted(tmp) and (len(np.unique(tmp)) == len(tmp)):
                    print("try2", [df_tmp["keyword"].iloc[i]], i, kws, tmp)
            except ValueError:
                pass

        

try01 ['36-cell pv module'] 6 p-v [1, 1]
try1 ['36-cell pv module'] 6 pv-module [1, 2]
try01 ['3d'] 7 3-d [1, 2]
try02 ['3d'] 7 3-d [1, 2]
try01 ['3d building model'] 8 3-d [1, 2]
try01 ['3d city model'] 9 3-d [1, 2]
try01 ['3d density model'] 10 3-d [1, 2]
try01 ['3d experiment'] 11 3-d [1, 2]
try01 ['3d finite element'] 12 3-d [1, 2]
try1 ['3d finite element'] 12 finite-element [1, 2]
try01 ['3d finite-volume modeling'] 13 3-d [1, 2]
try01 ['3d gi'] 14 3-d [1, 2]
try01 ['3d model'] 15 3-d [1, 2]
try01 ['3d numerical model'] 16 3-d [1, 2]
try01 ['3d radiative transfer'] 17 3-d [1, 2]
try01 ['3d solar cell modeling'] 18 3-d [1, 2]
try01 ['3d solar city'] 19 3-d [1, 2]
try01 ['3d solar radiation model'] 20 3-d [1, 2]
try01 ['3d urban model'] 21 3-d [1, 2]
try02 ['3d-tlm'] 22 3-d [1, 2]
try2 ['3d-tlm'] 22 3d-tlm [0, 1]
try2 ['3l-anpc'] 23 3l-anpc [0, 1]
try2 ['4-terminal'] 24 4-terminal [0, 1]
try1 ['5-level single phase converter'] 27 single-phase [1, 2]


In [115]:
keywords_value_single = []
p = re.compile("[a-zA-Z]")

abbkeys_ = list(keywords_abb.keys())
abbkeys = [key.lower() for key in abbkeys]
pluralkeys = list(keywords_plural.keys())

for keyword_value in keywords_value:
    kws = keyword_value.replace(',', "").split(' ')
    for kw in kws:
        if p.match(kw) and (kw not in abbkeys) and (kw not in keywords_dash):
            keywords_value_single.append(kw)

keywords_value_single = np.unique(keywords_value_single, return_counts=True)

df_keywords_value_single = pd.DataFrame({"keyword": keywords_value_single[0],
                                         "counts": keywords_value_single[1],
                                         "relwords": [keywords_plural[kw] if kw in pluralkeys else kw for kw in keywords_value_single[0]]
                                        })
df_keywords_value_single[df_keywords_value_single["counts"]>5].to_csv("keywords_relwords.csv")

* manual operation on 

In [9]:
keywords_noun_dict = {}
keywords_noun_ = list(keywords_abb.values())