# Step 5: Keywords cleanup

In [1]:
import pandas as pd
from techminer import RecordsDataFrame

# Load the input records (JSON Lines: one record per line; presumably the
# output of the previous step) and wrap them in a techminer RecordsDataFrame.
rdf = RecordsDataFrame(
    pd.read_json(
        "step-04.json",
        orient="records",
        lines=True,
    )
)

## Keywords with equal number of words

In [2]:
from techminer import Thesaurus, text_clustering

#
# Search of keywords with the same root
#
th = text_clustering(rdf['keywords'], sep=';', transformer=lambda x: x.lower())
with open('thesaurus-1.json', 'w') as f:
    f.write(th.__repr__())
!head -n 30 thesaurus-1.json

{
  "algorithm": [
    "Algorithms",
    "algorithm",
    "algorithms"
  ],
  "ann": [
    "ANN",
    "ann"
  ],
  "anomaly detection": [
    "Anomaly detection",
    "anomaly detection"
  ],
  "arima modeling": [
    "ARIMA Model",
    "ARIMA model",
    "ARIMA modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural network": [
    "Artificial Neural Network",


In [3]:
#
# Number of distinct keyword strings before cleanup
#
print(
    len(
        {
            keyword.strip()
            for keywords in rdf["keywords"]
            if keywords is not None  # skip records that have no keywords
            for keyword in keywords.split(";")
        }
    )
)

1211


In [4]:
#
# Read the thesaurus edited by the analyst
#
import json

with open("thesaurus-1-edited.json", "r") as f:
    # json.load parses the file object directly; no need to join lines by hand.
    dictionary = json.load(f)

#
# Cleanup
#
from techminer import Thesaurus

th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

# 1. Apply the thesaurus to each record's ';'-separated keywords. Records
#    without keywords (None) are passed through unchanged rather than relying
#    on how th.apply / str methods treat None.
rdf["keywords (cleaned)"] = rdf["keywords"].map(
    lambda x: None if x is None else th.apply(x, sep=";")
)
# 2. Strip whitespace and drop duplicates created by the thesaurus mapping.
#    dict.fromkeys keeps first-occurrence order, so the result is the same on
#    every run; joining a set() of strings yields an order that changes between
#    kernel sessions (string hash randomisation).
rdf["keywords (cleaned)"] = rdf["keywords (cleaned)"].map(
    lambda x: None
    if x is None
    else ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)
# 3. Represent "no keywords" uniformly as None instead of an empty string.
rdf["keywords (cleaned)"] = rdf["keywords (cleaned)"].map(
    lambda x: x if x != "" else None
)

In [5]:
#
# Number of unique keyword strings after the first cleanup pass
#
print(
    len(
        {
            keyword.strip()
            for cleaned in rdf["keywords (cleaned)"]
            if cleaned is not None  # records left without keywords hold None
            for keyword in cleaned.split(";")
        }
    )
)

1010


## Substring clustering

In [6]:
#
# Substrings in keywords
#
from techminer import text_nesting
tn = text_nesting(rdf['keywords (cleaned)'], sep=';', max_distance=1, transformer=lambda x: x.lower())
with open('thesaurus-2.json', 'w') as f:
    f.write(tn.__repr__())
!head -n 60 thesaurus-2.json

{
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "Boruta algorithm",
    "Classification algorithm",
    "Clustering algorithms",
    "Immune algorithms",
    "Learning algorithms",
    "Learning-based algorithms",
    "Levenberg-Marquardt algorithm",
    "NARX algorithm",
    "State-of-the-art algorithms",
    "algorithms",
    "genetic algorithms",
    "hybrid algorithms"
  ],
  "arima": [
    "ARIMA",
    "arima modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial Intelligence (AI)",
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural networks": [
    "Artificial Neural Network",
    "Arti

In [7]:
# Preview the analyst-edited substring thesaurus that the next cell loads.
# NOTE(review): the preview shows the key "(2D) 2 PCA" twice; Python's json
# module keeps only the last occurrence of a repeated key. Both entries are
# identical here, so nothing is lost, but the duplicate should be removed.
!head -n 60 thesaurus-2-edited.json

{
  "(2D) 2 PCA": [
    "(2D) 2 PCA",
    "(2D) <sup>2</sup> PCA"
  ],
  "Empirical research": [
    "Empirical research",
    "Empirical studies"
  ],
  "(2D) 2 PCA": [
    "(2D) 2 PCA",
    "(2D) <sup>2</sup> PCA"
  ],
  "Learning-based algorithms": [
    "Learning-based algorithms",
    "Learning-based approach",
    "Learning-based methods"
  ],
  "Classifiers": [
    "Classification Methods",
    "Classification algorithm",
    "Classification methods"
  ],
  "Forex": [
    "Forex (FX)",
    "Forex markets"
  ],
  "Elman neural network": [
    "Elman neural network",
    "Elman recurrent neural network",
    "Elman network"
  ],
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "State-of-the-art 

In [8]:
# Load the analyst-edited substring thesaurus and build the Thesaurus applied
# in the next cell. `json` and `Thesaurus` are imported in earlier cells.
with open("thesaurus-2-edited.json", "r") as thesaurus_file:
    dictionary = json.load(thesaurus_file)
th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

In [9]:
# Second cleanup pass: apply the substring thesaurus to the already-cleaned
# keywords.
#
# 1. Map every ';'-separated keyword through the thesaurus. The column holds
#    None for records without keywords, so those are passed through unchanged
#    rather than relying on how th.apply / str methods treat None.
rdf["keywords (cleaned)"] = rdf["keywords (cleaned)"].map(
    lambda x: None if x is None else th.apply(x, sep=";")
)
# 2. Strip whitespace and drop duplicates. dict.fromkeys keeps first-occurrence
#    order, so the result is the same on every run; joining a set() of strings
#    yields an order that changes between kernel sessions (string hash
#    randomisation).
rdf["keywords (cleaned)"] = rdf["keywords (cleaned)"].map(
    lambda x: None
    if x is None
    else ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)
# 3. Represent "no keywords" uniformly as None instead of an empty string.
rdf["keywords (cleaned)"] = rdf["keywords (cleaned)"].map(
    lambda x: x if x != "" else None
)

In [10]:
#
# Number of unique keywords after the second cleanup pass
#
print(
    len(
        {
            keyword.strip()
            for cleaned in rdf["keywords (cleaned)"]
            if cleaned is not None  # records left without keywords hold None
            for keyword in cleaned.split(";")
        }
    )
)

765


In [11]:
#
# Review: title and cleaned keywords of the first ten records
#
from techminer import display_records

display_records(
    rdf[["Title", "keywords (cleaned)"]].head(10),
)

-----------------------------------------------
Record index: 0
{
  "Title": "Improving DWT-RNN model via B-spline wavelet multiresolution to forecast a high-frequency time series",
  "keywords (cleaned)": "trends;Nonlinear autoregressive neural network;empirical mode decomposition;Moving average;long short-term memory neural network;Noise reduction;machine learning;time series forecasting;Meta-learning"
}
-----------------------------------------------
Record index: 1
{
  "Title": "Direct marketing campaigns in retail banking with the use of deep learning and random forests",
  "keywords (cleaned)": "Novel applications;Earnings;Tokyo Stock Exchange;costs;Consumer price index;Information science;stock prediction;Newsprint;long short-term memory neural network;Distributed representation;time series forecasting;financial data;Textual information"
}
-----------------------------------------------
Record index: 2
{
  "Title": "Combining time-series and textual data for taxi demand predicti

## Deleting records based on keywords or other text

In [12]:
from techminer import display_records

# Show the first ten records again (title and cleaned keywords) before any
# record is removed.
display_records(
    rdf[["Title", "keywords (cleaned)"]].head(10),
)

-----------------------------------------------
Record index: 0
{
  "Title": "Improving DWT-RNN model via B-spline wavelet multiresolution to forecast a high-frequency time series",
  "keywords (cleaned)": "trends;Nonlinear autoregressive neural network;empirical mode decomposition;Moving average;long short-term memory neural network;Noise reduction;machine learning;time series forecasting;Meta-learning"
}
-----------------------------------------------
Record index: 1
{
  "Title": "Direct marketing campaigns in retail banking with the use of deep learning and random forests",
  "keywords (cleaned)": "Novel applications;Earnings;Tokyo Stock Exchange;costs;Consumer price index;Information science;stock prediction;Newsprint;long short-term memory neural network;Distributed representation;time series forecasting;financial data;Textual information"
}
-----------------------------------------------
Record index: 2
{
  "Title": "Combining time-series and textual data for taxi demand predicti

In [13]:
from techminer.keywords import Keywords

# Keywords used below to build the mask of records to drop.
kyw = Keywords()
kyw.add_keywords(["Vacuum", "market data"])
# Bare expression: the cell output lists the stored keywords and the matching
# options in effect (ignore_case=True, full_match=False, use_re=False).
kyw

[
  "Vacuum",
  "market data"
]
ignore_case=True, full_match=False, use_re=False

In [14]:
"vacuum" in kyw

True

In [15]:
# Boolean mask over the records: True where kyw.common(...) is falsy. These are
# the records kept by the filter below (presumably those sharing no keyword
# with `kyw` — confirm against Keywords.common).
idx = rdf["keywords (cleaned)"].map(
    lambda keywords: not kyw.common(keywords, sep=";")
)

In [16]:
# Inspect the first twenty entries of the mask.
idx.head(20)

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
Name: keywords (cleaned), dtype: bool

In [17]:
# Keep only the records flagged True in `idx` and report the size before and
# after. NOTE: this rebinds `rdf`, so the cell is not idempotent — re-running it
# without recomputing `idx` applies a mask built for the unfiltered frame.
print("Records before = ", len(rdf))
rdf = rdf[idx]
print("Records after = ", len(rdf))

Records before =  145
Records after =  144


In [18]:
# Persist the filtered records as JSON Lines (one record per line), the same
# layout used to read step-04.json at the top of the notebook.
rdf.to_json(
    "step-05.json",
    orient="records",
    lines=True,
)