In [1]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

# Keywords --- Text nesting

In [2]:
#
# Data loading: one JSON record per line, read straight from the GitHub repo
#
import pandas as pd

source_url = (
    "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"
    + "keywords-text-nesting.json"
)

df = pd.read_json(source_url, orient="records", lines=True)

In [3]:
#
# Number of unique keywords: split every non-null record on ";",
# strip the blanks around each keyword and count the distinct values
#
len(
    {
        keyword.strip()
        for record in df.keywords_cleaned
        if record is not None
        for keyword in record.split(";")
    }
)

1009

In [4]:
#
# Substrings in keywords.
# This step requires several minutes to run
#
from techminer import text_nesting

tn = text_nesting(df.keywords_cleaned, sep=';', max_distance=1, transformer=lambda x: x.lower())

#
# Creates a thesaurus with candidate substrings as a thesaurus
#
with open('thesaurus-text-nesting-raw.json', 'w') as f:
    f.write(tn.__repr__())

!head -n 30 thesaurus-text-nesting-raw.json

{
  "adaboost algorithm": [
    "AdaBoost algorithm",
    "validating AdaBoost algorithm"
  ],
  "adaptive noise": [
    "Adaptive noise",
    "adaptive noise reducer"
  ],
  "algorithmic trading": [
    "Algorithmic trading",
    "Algorithmic trading models"
  ],
  "algorithms": [
    "Algorithmic approach",
    "Analysis algorithms",
    "Boruta algorithm",
    "Classification algorithm",
    "Clustering algorithms",
    "Immune algorithms",
    "Learning algorithms",
    "Learning-based algorithms",
    "Levenberg-Marquardt algorithm",
    "NARX algorithm",
    "State-of-the-art algorithms",
    "algorithms",
    "genetic algorithms",
    "hybrid algorithms"
  ],
  "arima": [


**Note**. The previous file MUST be reviewed and edited by the analyst. The edited version is called `thesaurus-text-nesting-edited.json`, and we load our version from GitHub.

In [5]:
#
# Reads the cleaned file from GitHub Repo
#
import json

import requests

# Base URL of the tutorial data folder. It is defined here so the cell runs on
# a fresh kernel instead of relying on a leftover `url` variable.
# NOTE(review): assumes the edited thesaurus lives in the same folder as the
# tutorial keywords file -- confirm.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

response = requests.get(url + "thesaurus-text-nesting-edited.json", timeout=60)
# Fail early on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
text = response.text

#
# Loads the file
#
dictionary = json.loads(text)

#
# Create a new thesaurus
#
from techminer import Thesaurus

th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus to the keywords (records without keywords stay None)
#
df["keywords_cleaned"] = df.keywords_cleaned.map(
    lambda x: th.apply(x, sep=";") if x is not None else None
)

#
# Remove extra blanks around keywords and drop duplicated keywords.
# dict.fromkeys keeps the first occurrence of each keyword, so the order
# inside a record is the same on every run (a set gives an arbitrary order).
#
df["keywords_cleaned"] = df.keywords_cleaned.map(
    lambda x: ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
    if x is not None
    else None
)

#
# Replace empty strings by None
#
df["keywords_cleaned"] = df.keywords_cleaned.map(
    lambda x: x if x != "" else None
)

In [6]:
#
# Number of unique keywords after applying the thesaurus
#
unique_keywords = set()
for record in df.keywords_cleaned:
    # Records without keywords are stored as None and contribute nothing.
    if record is None:
        continue
    unique_keywords.update(keyword.strip() for keyword in record.split(";"))

len(unique_keywords)

678

In [7]:
# Disabled export: when uncommented, writes the processed records to a
# JSON-lines file (one record per line).
### df.to_json("keywords-record-deletion.json", orient="records", lines=True)