# Keywords cleanup

In [3]:
import pandas as pd

# Read data-02.json (one JSON record per line) into the working frame `pdf`.
pdf = pd.read_json(path_or_buf="data-02.json", lines=True, orient="records")

In [None]:
from techminer import DataFrame

# NOTE(review): this cell has no execution count and `rdf` is not used by any
# later cell shown here; it also reads "step-04.json" rather than the
# "data-02.json" file loaded into `pdf` -- confirm whether it is still needed.
#
# The wrapper is built with the imported `DataFrame` name. The previous call
# used `RecordsDataFrame`, which is never imported and raised NameError.
rdf = DataFrame(pd.read_json("step-04.json", orient="records", lines=True))

## Keywords with equal number of words

In [4]:
from techminer import Thesaurus, text_clustering

#
# Search of keywords with the same root
#
th = text_clustering(pdf.keywords, sep=';', transformer=lambda x: x.lower())
with open('thesaurus-1.json', 'w') as f:
    f.write(th.__repr__())
!head -n 30 thesaurus-1.json

{
  "algorithm": [
    "Algorithms",
    "algorithm",
    "algorithms"
  ],
  "ann": [
    "ANN",
    "ann"
  ],
  "anomaly detection": [
    "Anomaly detection",
    "anomaly detection"
  ],
  "arima modeling": [
    "ARIMA Model",
    "ARIMA model",
    "ARIMA modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural network": [
    "Artificial Neural Network",


In [6]:
#
# Number of distinct keyword strings before cleaning
#
# Each non-None entry is split on ";" and every piece is stripped of
# surrounding whitespace before being counted.
#
unique_raw = {
    token.strip()
    for entry in pdf.keywords
    if entry is not None
    for token in entry.split(";")
}
print(len(unique_raw))

1210


In [7]:
#
# Read the thesaurus edited by the analyst and apply it to the raw keywords
#
import json

##
## cleaning
##
from techminer import Thesaurus

with open("thesaurus-1-edited.json", "r") as f:
    # json.load parses the open file directly; no need to re-join the lines.
    dictionary = json.load(f)


th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)
pdf["keywords_cleaned"] = pdf.keywords.map(lambda x: th.apply(x, sep=";"))

# Remove repeated keywords inside each record.
# * sorted(): joining a bare set yields a different order on every kernel
#   restart (string hashing is randomised), which made the saved file
#   non-reproducible.
# * isinstance guard: missing values (None/NaN) have no .split and are passed
#   through unchanged instead of raising AttributeError.
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: ";".join(sorted({w.strip() for w in x.split(";")}))
    if isinstance(x, str)
    else x
)

# Records left without any keyword are stored as None rather than "".
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(lambda x: x if x != "" else None)

In [9]:
#
# Number of unique keyword strings after applying the first thesaurus
#
unique_after_thesaurus_1 = set()
for entry in pdf.keywords_cleaned:
    if entry is None:
        continue
    for token in entry.split(";"):
        unique_after_thesaurus_1.add(token.strip())
print(len(unique_after_thesaurus_1))

1009


## Substrings clustering

In [11]:
#
# Substrings in keywords.
# This step requires several minutes to run
#
from techminer import text_nesting

tn = text_nesting(pdf.keywords_cleaned, sep=';', max_distance=1, transformer=lambda x: x.lower())
with open('thesaurus-2.json', 'w') as f:
    f.write(tn.__repr__())
!head -n 60 thesaurus-2.json

{
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "Boruta algorithm",
    "Classification algorithm",
    "Clustering algorithms",
    "Immune algorithms",
    "Learning algorithms",
    "Learning-based algorithms",
    "Levenberg-Marquardt algorithm",
    "NARX algorithm",
    "State-of-the-art algorithms",
    "algorithms",
    "genetic algorithms",
    "hybrid algorithms"
  ],
  "arima": [
    "ARIMA",
    "arima modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial Intelligence (AI)",
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural networks": [
    "Artificial Neural Network",
    "Arti

In [12]:
# Preview the first 60 lines of thesaurus-2-edited.json.
# NOTE(review): the saved output of this cell lists the key "(2D) 2 PCA" twice;
# Python's json parser keeps only the last value of a repeated key -- confirm
# the duplicate entry in the edited file is intentional.
!head -n 60 thesaurus-2-edited.json

{
  "(2D) 2 PCA": [
    "(2D) 2 PCA",
    "(2D) <sup>2</sup> PCA"
  ],
  "Empirical research": [
    "Empirical research",
    "Empirical studies"
  ],
  "(2D) 2 PCA": [
    "(2D) 2 PCA",
    "(2D) <sup>2</sup> PCA"
  ],
  "Learning-based algorithms": [
    "Learning-based algorithms",
    "Learning-based approach",
    "Learning-based methods"
  ],
  "Classifiers": [
    "Classification Methods",
    "Classification algorithm",
    "Classification methods"
  ],
  "Forex": [
    "Forex (FX)",
    "Forex markets"
  ],
  "Elman neural network": [
    "Elman neural network",
    "Elman recurrent neural network",
    "Elman network"
  ],
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "State-of-the-art 

In [13]:
# Parse thesaurus-2-edited.json and wrap the mapping in a case-sensitive,
# full-match Thesaurus (no regular expressions).
with open("thesaurus-2-edited.json", "r") as thesaurus_file:
    dictionary = json.load(thesaurus_file)
th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

In [14]:
# Apply the second thesaurus to the already-cleaned keywords.
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: th.apply(x, sep=";")
)

# Remove repeated keywords inside each record.
# * sorted(): joining a bare set yields a different order on every kernel
#   restart (string hashing is randomised), which made the saved file
#   non-reproducible.
# * isinstance guard: the column can hold missing values (None/NaN), which
#   have no .split; they are passed through unchanged instead of raising
#   AttributeError.
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: ";".join(sorted({w.strip() for w in x.split(";")}))
    if isinstance(x, str)
    else x
)

# Records left without any keyword are stored as None rather than "".
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: x if x != "" else None
)

In [16]:
#
# Number of unique keywords after applying the second thesaurus
#
unique_after_thesaurus_2 = set()
for entry in pdf.keywords_cleaned:
    if entry is None:
        continue
    unique_after_thesaurus_2.update(token.strip() for token in entry.split(";"))
print(len(unique_after_thesaurus_2))

765


## Deletion based on keywords or other text

In [17]:
from techminer.keywords import Keywords

# Terms used by the following cells to decide which records are dropped.
excluded_terms = ["Vacuum", "market data"]

kyw = Keywords()
kyw.add_keywords(excluded_terms)
kyw

[
  "Vacuum",
  "market data"
]
ignore_case=True, full_match=False, use_re=False

In [18]:
# Membership test using a lower-case spelling of the stored "Vacuum" entry.
"vacuum" in kyw

True

In [19]:
def _keeps_record(keywords):
    """Return True when `kyw.common` reports nothing for this record's keywords."""
    return not kyw.common(keywords, sep=";")


# Boolean mask over the records: True marks a record that is kept.
idx = pdf.keywords_cleaned.map(_keeps_record)

In [20]:
# Show the first 20 entries of the mask.
idx.head(20)

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
Name: keywords_cleaned, dtype: bool

In [23]:
# Keep only the records flagged True in `idx`, reporting the size change.
records_before = len(pdf)
print("Records before = ", records_before)
pdf = pdf[idx]
records_after = len(pdf)
print("Records after = ", records_after)

Records before =  145
Records after =  144


In [24]:
# Write the filtered records to data-03.json, one JSON record per line.
pdf.to_json(path_or_buf="data-03.json", lines=True, orient="records")