In [1]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

# Keyword completion

## Data loading

In this step, the records for the study are selected. The file from the previous step is loaded with:

In [2]:
import pandas as pd

# Folder of the techminer repository that holds the tutorial data files.
url = "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"

# The file is read as line-delimited JSON (one record per line).
df = pd.read_json(
    f"{url}keyword-completation.json",
    orient="records",
    lines=True,
)

`NaN` values are replaced with `None`.

In [3]:
def _nan_to_none(value):
    """Return None for a scalar missing value; return any other value unchanged."""
    # `is True` is deliberate: for list-like cells pd.isna returns an array,
    # which must not be evaluated as a boolean.
    if pd.isna(value) is True:
        return None
    return value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method the installed pandas provides.
_elementwise = df.map if hasattr(df, "map") else df.applymap
df = _elementwise(_nan_to_none)

## Keywords completion

This step aims to create a column (field) in the dataframe containing key terms for document selection. The columns `'Author Keywords'` and `'Index Keywords'` are joined into a new column called `'Keywords'`.

In [4]:
def _join_keywords(*cells):
    """Join the non-missing keyword strings with ';'; return None when all are missing."""
    present = [cell for cell in cells if isinstance(cell, str)]
    return ";".join(present) if present else None


# Plain `+` concatenation gives no usable result for rows where either column
# is missing, so the two columns are combined row by row, skipping missing cells.
df = df.assign(
    Keywords=[
        _join_keywords(author, index)
        for author, index in zip(df["Author Keywords"], df["Index Keywords"])
    ]
)

In [8]:
# Inspect the author-supplied keywords; missing entries show up as None.
df.loc[:, "Author Keywords"]

0      Component trends; Empirical mode decomposition...
1                                                   None
2      And financial time series prediction; Dynamic ...
3      Artificial intelligence; Deep learning; Financ...
4                                                   None
                             ...                        
147                                                 None
148                                                 None
149                                                 None
150                                                 None
151                                                 None
Name: Author Keywords, Length: 152, dtype: object

In [None]:
# Trim the blanks around each ';'-separated keyword.
# Non-string cells (records without keywords) are passed through unchanged.
df["Keywords"] = df["Keywords"].map(
    lambda x: ";".join([e.strip() for e in x.split(";")]) if isinstance(x, str) else x
)

In [5]:
# Demonstration: a missing value (None) cannot be concatenated with a string.
# The error is caught and printed so that the cell does not abort a "Run All".
try:
    None + ";"
except TypeError as exc:
    none_concat_error = f"TypeError: {exc}"
    print(none_concat_error)

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [3]:
def _split_keywords(cell):
    """Split a ';'-separated keyword string into a list; return [] for a missing cell."""
    return cell.split(";") if isinstance(cell, str) else []


# `pdf` is derived from the records loaded in `df`; the new column is named
# `keywords` (lowercase) everywhere, which is the name the following cells read.
pdf = df.assign(
    keywords=df["Author Keywords"].map(_split_keywords)
    + df["Index Keywords"].map(_split_keywords)
)

#  remove blank spaces surrounding keywords
pdf["keywords"] = pdf["keywords"].map(lambda terms: [term.strip() for term in terms])

#  join keywords in a new string
pdf["keywords"] = pdf["keywords"].map(";".join)

#  convert empty keyword strings to None
pdf["keywords"] = pdf["keywords"].map(lambda joined: None if joined == "" else joined)

In [4]:
# Preview the first combined keyword strings.
pdf["keywords"].head()

0    Component trends;Empirical mode decomposition;...
1    Earnings;Financial data processing;Information...
2    And financial time series prediction;Dynamic n...
3    Artificial intelligence;Deep learning;Financia...
4    Commerce;Deep learning;Electronic trading;Fina...
Name: keywords, dtype: object

However, some records have neither `'Author Keywords'` nor `'Index Keywords'`.

In [5]:
# Number of records whose `keywords` value is None.
int(pdf["keywords"].map(lambda kw: kw is None).sum())

8

In [6]:
# Verification: list `keywords` for the records in which both source columns are None.

no_author_keywords = pdf["Author Keywords"].map(lambda kw: kw is None)
no_index_keywords = pdf["Index Keywords"].map(lambda kw: kw is None)
pdf.loc[no_author_keywords & no_index_keywords, "keywords"]

144    None
145    None
146    None
147    None
148    None
149    None
150    None
151    None
Name: keywords, dtype: object

In the following code, a `Keywords` object is created. The content of column `keywords` is added to the object.

In [7]:
from techminer.keywords import Keywords

# Create the Keywords object and feed it the `keywords` column,
# whose cells hold terms separated by ';'.
kyw = Keywords()
kyw.add_keywords(pdf["keywords"], sep=";")

# Show the first 20 entries stored in the object.
kyw.keywords[:20]

['(2D) 2 PCA',
 '(2D) <sup>2</sup> PCA',
 'AMAPE',
 'ANN',
 'ARIMA',
 'ARIMA Model',
 'ARIMA model',
 'ARIMA modeling',
 'Absolute values',
 'Abstract representation',
 'Accounts receivable',
 'Accuracy Improvement',
 'Accuracy of classifications',
 'Accurate prediction',
 'Activation layer',
 'AdaBoost algorithm',
 'Adam Optimizer',
 'Adaptive boosting',
 'Adaptive gradient algorithm',
 'Adaptive noise']

In [8]:
#
# How many records have no abstract (value is None)
#
int(pdf["Abstract"].map(lambda text: text is None).sum())

0

In [9]:
#
# How many records have no title (value is None)
#
int(pdf["Title"].map(lambda text: text is None).sum())

0

In [10]:
def _cut_at_copyright(text):
    """Return `text` up to (not including) the first copyright sign; non-strings pass through."""
    if not isinstance(text, str):
        return text
    # partition() returns the whole string as the head when the sign is absent.
    return text.partition("\u00a9")[0]


#
# Drop the copyright sign and everything after it from each abstract
#
pdf["Abstract"] = pdf["Abstract"].map(_cut_at_copyright)

In [11]:
#
# Title and abstract are concatenated, separated by one blank, into a single text
#
title_abstract = pdf["Title"].add(" ").add(pdf["Abstract"])

In [12]:
def _extract_recorded_keywords(text):
    """Call the Keywords object's extract_from_text on `text` with ';' as separator."""
    return kyw.extract_from_text(text, sep=";")


#
# Use the Keywords object to extract the previously recorded keywords from each text.
#
keywords_in_title_and_abstract = title_abstract.map(_extract_recorded_keywords)

In [13]:
#
# Fill in the extracted keywords, but only for the rows that have none
#
idx = pdf["keywords"].map(lambda kw: kw is None)
pdf.loc[idx, "keywords"] = keywords_in_title_and_abstract.loc[idx]

In [14]:
#
# Check how many rows are still without keywords
#
int(pdf["keywords"].map(lambda kw: kw is None).sum())

0