# Keywords --- Text clustering 

In [1]:
import pandas as pd

# Base address of the tutorial data files; later cells reuse it.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

# The data set is stored with one JSON record per line.
pdf = pd.read_json(
    url + "keywords-text-clustering.json",
    lines=True,
    orient="records",
)

## Keywords with equal number of words

In [2]:
from techminer import Thesaurus, text_clustering

#
# Search of keywords with the same root.
#
th = text_clustering(pdf.keywords, sep=';', transformer=lambda x: x.lower())

#
# Save the data for manual review with a text editor.
#
with open('thesaurus-raw.json', 'w') as f:
    f.write(th.__repr__())
    
#
# Head of the file
#
!head -n 35 thesaurus-raw.json

{
  "algorithm": [
    "Algorithms",
    "algorithm",
    "algorithms"
  ],
  "ann": [
    "ANN",
    "ann"
  ],
  "anomaly detection": [
    "Anomaly detection",
    "anomaly detection"
  ],
  "arima modeling": [
    "ARIMA Model",
    "ARIMA model",
    "ARIMA modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural network": [
    "Artificial Neural Network",
    "Artificial Neural Networks",
    "Artificial neural network",
    "Artificial neural networks",
    "artificial neural network",
    "artificial neural networks"


In [3]:
#
# Number of distinct keyword strings in the original column
# (entries are split on ";" and surrounding blanks are removed).
#
print(
    len(
        {
            token.strip()
            for entry in pdf.keywords
            if entry is not None
            for token in entry.split(";")
        }
    )
)

1210


**Note**. The previous file MUST be reviewed and edited by the analyst. The edited version is called `thesaurus-edited-text-clustering.json`; here we load our own edited version from GitHub.

In [4]:
#
# Reads the edited thesaurus file from the GitHub repo.
#
import requests

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url + "thesaurus-edited-text-clustering.json", timeout=30)

# Stop here on an HTTP error (e.g. 404) instead of handing an error page
# to the JSON parser in a later cell.
response.raise_for_status()

text = response.text

In [5]:
#
# Parse the downloaded text as JSON
#
import json

dictionary = json.loads(text)

In [6]:
#
# Cleaning
#
from techminer import Thesaurus


def _dedupe_keywords(keywords, sep=";"):
    """Strip blanks around each keyword and drop repeated keywords.

    The first occurrence of every keyword is kept, so the result is the same
    on every run. Joining a ``set`` instead would give an arbitrary order
    that can change between kernel sessions.

    Parameters
    ----------
    keywords : str or None
        Keywords separated by ``sep``.
    sep : str
        Separator between keywords.

    Returns
    -------
    str or None
        The cleaned keywords joined by ``sep``; ``None`` is returned unchanged.
    """
    if keywords is None:
        return None
    return sep.join(dict.fromkeys(w.strip() for w in keywords.split(sep)))


#
# Creates a thesaurus
#
th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus to keywords
#
pdf["keywords_cleaned"] = pdf.keywords.map(lambda x: th.apply(x, sep=";"))

#
# Remove extra blanks around keywords and drop duplicated keywords
# (keeping the order of first appearance)
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(_dedupe_keywords)

#
# Replace empty strings by None
#
pdf["keywords_cleaned"] = pdf.keywords_cleaned.map(lambda x: x if x != "" else None)

In [7]:
#
# Number of unique keyword strings after cleaning
#
print(
    len(
        {
            keyword.strip()
            for cleaned in pdf.keywords_cleaned
            if cleaned is not None
            for keyword in cleaned.split(";")
        }
    )
)

1009


In [8]:
### pdf.to_json("keywords-substring-clustering.json", orient="records", lines=True)