# Keywords --- Substring clustering

In [1]:
import pandas as pd

# Base URL of the tutorial data folder; file names are appended to it.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

# Load the keyword records; lines=True reads the file as one JSON record per line.
pdf = pd.read_json(
    path_or_buf=url + "keywords-substring-clustering.json",
    orient="records",
    lines=True,
)

## Substring clustering

In [11]:
#
# Substrings in keywords.
# This step requires several minutes to run
#
from techminer import text_nesting

tn = text_nesting(pdf.keywords_cleaned, sep=';', max_distance=1, transformer=lambda x: x.lower())

#
# Creates a thesaurus with candidate substrings as a thesaurus
#
with open('thesaurus-text-nesting-raw.json', 'w') as f:
    f.write(tn.__repr__())

!head -n 30 thesaurus-text-nesting-raw.json

{
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "Boruta algorithm",
    "Classification algorithm",
    "Clustering algorithms",
    "Immune algorithms",
    "Learning algorithms",
    "Learning-based algorithms",
    "Levenberg-Marquardt algorithm",
    "NARX algorithm",
    "State-of-the-art algorithms",
    "algorithms",
    "genetic algorithms",
    "hybrid algorithms"
  ],
  "arima": [
    "ARIMA",
    "arima modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial Intelligence (AI)",
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural networks": [
    "Artificial Neural Network",
    "Arti

**Note**. The previous file MUST be reviewed and edited by the analyst. The edited version is called `thesaurus-text-nesting-cleaned.json.json`, and we load our version from GitHub.

In [4]:
#
# Names used by this cell. Without these imports the cell raises NameError
# when the notebook is run on a fresh kernel.
#
import json

import requests
from techminer import Thesaurus  # NOTE(review): assumed to be exported next to text_nesting -- confirm

#
# Reads the cleaned file from GitHub Repo
#
response = requests.get(url + "thesaurus-text-nesting-cleaned.json.json")
# Fail here on an HTTP error instead of handing an error page to json.loads.
response.raise_for_status()
text = response.text

#
# Loads the file
#
dictionary = json.loads(text)

#
# Create a new thesaurus
#
th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus to the keywords
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: th.apply(x, sep=";")
)

#
# Remove extra blanks around keywords and drop duplicates.
# dict.fromkeys keeps the first occurrence of each keyword in its original
# position; a set would yield a different order on every run because string
# hashing is randomized per process.
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)

#
# Replace empty strings by None
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: x if x != "" else None
)

In [16]:
#
# Count the distinct keywords across all records.
# Records whose keyword field is None are skipped.
#
print(
    len(
        {
            keyword.strip()
            for keywords in pdf.keywords_cleaned
            if keywords is not None
            for keyword in keywords.split(";")
        }
    )
)

765


## Deletion based on keywords or other text

In [17]:
from techminer.keywords import Keywords

# Start from an empty Keywords collection and register the terms to look for.
kyw = Keywords()
kyw.add_keywords(
    [
        "Vacuum",
        "market data",
    ]
)

# Bare last expression: displays the collection and its matching options.
kyw

[
  "Vacuum",
  "market data"
]
ignore_case=True, full_match=False, use_re=False

In [18]:
"vacuum" in kyw

True

In [19]:
# Boolean mask: True for records where kyw.common(...) is falsy.
# NOTE(review): presumably kyw.common reports whether any registered keyword
# occurs in the ';'-separated string -- confirm against techminer.
idx = pdf.keywords_cleaned.map(
    lambda keywords: not kyw.common(keywords, sep=";")
)

In [20]:
# Preview the first 20 entries of the mask.
idx.head(20)

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
Name: keywords_cleaned, dtype: bool

In [23]:
# Keep only the rows where the mask `idx` is True, reporting the record count
# before and after the selection.
print("Records before = ", len(pdf))
pdf = pdf.loc[idx]
print("Records after = ", len(pdf))

Records before =  145
Records after =  144


In [24]:
# Save the filtered records as JSON Lines (one record per line).
pdf.to_json(
    path_or_buf="manual-review.json",
    orient="records",
    lines=True,
)