In [None]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

# Keyword completation

## Data loading

In this step, the records for the study are selected. The previous file is loaded with:

In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"
df = pd.read_json(url + "keyword-completation.json", orient="records", lines=True)

`NaN` values are changed by `None`.

In [2]:
df = df.applymap(lambda x: None if pd.isna(x) is True else x)

## Keywords completation

This step aims to create a column (field) in the dataframe containing key terms for document selection. The columns `'Author Keywords'` and `'Index Keywords'` are joined in a new column is called `'Keywords'`.

In [3]:
from techminer import DataFrame

df = DataFrame(df).keywords_fusion()

In [4]:
df.Keywords

0      Component trends;Empirical mode decomposition;...
1      Consumer price index;Costs;Distributed represe...
2      Algorithms;And financial time series predictio...
3      Artificial intelligence;Auto-regressive exogen...
4      Commerce;Deep learning;Electronic trading;Fina...
                             ...                        
147                                                 None
148                                                 None
149                                                 None
150                                                 None
151                                                 None
Name: Keywords, Length: 152, dtype: object

However, there are records without `'Author Keywords'` and `'Index Keywords'`.

In [5]:
len(df[df.Keywords.map(lambda x: x is None)])

8

In [6]:
#
# Verification:
#
df.Keywords[
    (df["Author Keywords"].map(lambda x: x is None))
    & (df["Index Keywords"].map(lambda x: x is None))
]

144    None
145    None
146    None
147    None
148    None
149    None
150    None
151    None
Name: Keywords, dtype: object

In [7]:
df = df.Keywords_completation()

AttributeError: 'DataFrame' object has no attribute 'Keywords_completation'

In [None]:
#
# Verify the number of rows without keywords
#
len(df[df.keywords.map(lambda x: x is None)])

In the following code, a `Keywords` object is created. The content of column `keywords` is added to the object.

In [None]:
from techminer.keywords import Keywords

kyw = Keywords()
kyw.add_keywords(df.Keywords, sep=";")
kyw.keywords[0:20]

In [None]:
#
# Number of records without abstract
#
len(df[df.Abstract.map(lambda x: x is None)])

In [None]:
#
# Number of rows without title
#
len(df[df["Title"].map(lambda x: x is None)])

In [None]:
#
# Remove copyright character from abstract
#
pdf["Abstract"] = pdf.Abstract.map(
    lambda x: x[0 : x.find("\u00a9")]
    if isinstance(x, str) and x.find("\u00a9") != -1
    else x
)

In [None]:
#
# We combine title and abstract in a variable
#
title_abstract = pdf["Title"] + " " + pdf["Abstract"]

In [None]:
#
# Extracts previous recorded keywords using the Keywords object.
#
keywords_in_title_and_abstract = title_abstract.map(
    lambda x: kyw.extract_from_text(x, sep=";")
)

In [None]:
#
# Adds the new keywords only to rows without keywords
#
idx = pdf.keywords.map(lambda x: x is None)
pdf.loc[idx, "keywords"] = keywords_in_title_and_abstract[idx]

In [None]:
#
# Verify the number of rows without keywords
#
len(pdf[pdf.keywords.map(lambda x: x is None)])