# Keywords --- Text nesting

In [1]:
#
# Load the tutorial dataset (JSON Lines: one record per line)
#
import pandas as pd

# Base address of the tutorial data files; reused by later cells.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

pdf = pd.read_json(
    url + "keywords-text-nesting.json",
    orient="records",
    lines=True,
)

In [2]:
#
# Count the distinct keywords over all records
# (records whose keywords are None are skipped)
#
print(
    len(
        {
            keyword.strip()
            for cell in pdf.keywords_cleaned
            if cell is not None
            for keyword in cell.split(";")
        }
    )
)

1009


In [3]:
#
# Substrings in keywords.
# This step requires several minutes to run
#
from techminer import text_nesting

tn = text_nesting(pdf.keywords_cleaned, sep=';', max_distance=1, transformer=lambda x: x.lower())

#
# Creates a thesaurus with candidate substrings as a thesaurus
#
with open('thesaurus-text-nesting-raw.json', 'w') as f:
    f.write(tn.__repr__())

!head -n 30 thesaurus-text-nesting-raw.json

{
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "Boruta algorithm",
    "Classification algorithm",
    "Clustering algorithms",
    "Immune algorithms",
    "Learning algorithms",
    "Learning-based algorithms",
    "Levenberg-Marquardt algorithm",
    "NARX algorithm",
    "State-of-the-art algorithms",
    "algorithms",
    "genetic algorithms",
    "hybrid algorithms"
  ],
  "arima": [


**Note**. The previous file MUST be reviewed and edited by the analyst. The edited version is called `thesaurus-text-nesting-edited.json`, and here we load our version from GitHub.

In [4]:
#
# Reads the cleaned file from GitHub Repo
#
import json

import requests

response = requests.get(url + "thesaurus-text-nesting-edited.json", timeout=30)
# Stop here with a clear HTTP error when the download failed, instead of
# handing an error page to the JSON parser below.
response.raise_for_status()
text = response.text

#
# Loads the file
#
dictionary = json.loads(text)

#
# Create a new thesaurus
#
from techminer import Thesaurus

th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus to the keywords
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: th.apply(x, sep=";")
)

#
# Remove extra blanks around keywords and drop repeated keywords.
# dict.fromkeys keeps the first occurrence of each keyword, so the joined
# string is the same on every run (iteration order of a set of strings
# can change between interpreter sessions).
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)

#
# Replace empty strings by None
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(
    lambda x: x if x != "" else None
)

In [5]:
#
# Count the distinct keywords again, now that the thesaurus was applied
# (records whose keywords are None are skipped)
#
print(
    len(
        {
            keyword.strip()
            for cell in pdf.keywords_cleaned
            if cell is not None
            for keyword in cell.split(";")
        }
    )
)

678


In [6]:
#
# Save the records as JSON Lines (one record per line)
#
pdf.to_json(
    "keywords-record-deletion.json",
    orient="records",
    lines=True,
)