In [1]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

# Keywords --- Text clustering 

In [2]:
import pandas as pd

# Base address of the tutorial data files; later cells reuse `url` as well.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

# The file is read as line-delimited JSON, one record per line.
df = pd.read_json(
    url + "keywords-text-clustering.json",
    orient="records",
    lines=True,
)

In [3]:
# Display the column labels of the loaded table.
df.columns

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume',
       'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
       'DOI', 'Affiliations', 'Document Type', 'Access Type', 'Source', 'EID',
       'Abstract', 'Author Keywords', 'Index Keywords', 'References',
       'Keywords', 'CONF', 'fingerprint'],
      dtype='object')

## Keywords with equal number of words

In [4]:
from techminer import Thesaurus, text_clustering

#
# Search of keywords with the same root.
#
th = text_clustering(df.Keywords, sep=';', transformer=lambda x: x.lower())

#
# Save the data for manual review with a text editor.
#
with open('thesaurus-text-clustering-raw.json', 'w') as f:
    f.write(th.__repr__())
    
#
# Head of the file
#
!head -n 35 thesaurus-text-clustering-raw.json

{
  "algorithm": [
    "Algorithms",
    "algorithm",
    "algorithms"
  ],
  "ann": [
    "ANN",
    "ann"
  ],
  "anomaly detection": [
    "Anomaly detection",
    "anomaly detection"
  ],
  "arima modeling": [
    "ARIMA Model",
    "ARIMA model",
    "ARIMA modeling"
  ],
  "article": [
    "ARTICLE",
    "Article",
    "article"
  ],
  "artificial intelligence": [
    "Artificial intelligence",
    "artificial intelligence"
  ],
  "artificial neural network": [
    "Artificial Neural Network",
    "Artificial Neural Networks",
    "Artificial neural network",
    "Artificial neural networks",
    "artificial neural network",
    "artificial neural networks"


In [5]:
#
# Number of distinct keyword strings in the raw column
# (blanks trimmed, records without keywords skipped).
#
len(
    {
        keyword.strip()
        for keywords in df.Keywords
        if keywords is not None
        for keyword in keywords.split(";")
    }
)

1209

**Note**. The previous file MUST be reviewed and edited by the analyst. The edited version is named `thesaurus-text-clustering-edited.json`; in this tutorial we load our already-edited copy from GitHub.

In [6]:
#
# Reads the file from GitHub Repo.
#
import requests

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the JSON parser in the next cell.
response = requests.get(url + "thesaurus-text-clustering-edited.json", timeout=30)
response.raise_for_status()
text = response.text

In [7]:
#
# Parse the downloaded JSON text into a Python object
# (used below to build the thesaurus).
#
import json

dictionary = json.loads(text)

In [8]:
#
# Cleaning
#
from techminer import Thesaurus

#
# Creates a thesaurus
#
th = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus to keywords
#
df["keywords_cleaned"] = df.Keywords.map(lambda x: th.apply(x, sep=";"))

#
# Trim blanks around each keyword and drop repeated keywords.
# dict.fromkeys() keeps the first occurrence of each keyword in its original
# position, so the resulting string is the same on every run (iterating a set
# of strings gives an order that changes between Python processes).
#
df["keywords_cleaned"] = df.keywords_cleaned.map(
    lambda x: ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)

#
# Replace empty strings by None
#
df["keywords_cleaned"] = df.keywords_cleaned.map(lambda x: x if x != "" else None)

In [9]:
#
# Number of unique keyword strings after cleaning
# (records whose cleaned value is None are skipped).
#
len(
    {
        keyword.strip()
        for keywords in df.keywords_cleaned
        if keywords is not None
        for keyword in keywords.split(";")
    }
)

1008

In [10]:
# Export of the table as line-delimited JSON; currently disabled
# (uncomment the line below to write the file).
### df.to_json("keywords-text-nesting.json", orient="records", lines=True)