In [1]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

In [2]:
from techminer import DataFrame

In [3]:
import pandas as pd

# Download the IEEE Latin America CSV from the techminer repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/"
    "ieee-latin-america.csv"
)

# Map every missing cell to None; any other cell is kept as it is.
df = df.applymap(lambda cell: cell if pd.isna(cell) is not True else None)

# Summary of columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Authors                    3553 non-null   object 
 1   Author(s) ID               3553 non-null   object 
 2   Title                      3553 non-null   object 
 3   Year                       3553 non-null   int64  
 4   Source title               3553 non-null   object 
 5   Volume                     3553 non-null   int64  
 6   Issue                      3553 non-null   int64  
 7   Art. No.                   3067 non-null   float64
 8   Page start                 3525 non-null   float64
 9   Page end                   3498 non-null   float64
 10  Page count                 11 non-null     float64
 11  Cited by                   2452 non-null   float64
 12  DOI                        3538 non-null   object 
 13  Link                       3553 non-null   objec

In [4]:
#
# Number of records (rows) in the dataframe
#
df.shape[0]

3553

In [5]:
#
# Keywords merging: run techminer's keywords_fusion() and preview
# the first rows of the Keywords column.
#
df = DataFrame(df).keywords_fusion()
df["Keywords"].head()

0    3D modeling;Bottleneck identifications;Computa...
1    Combinated Models;Combinatorial Network of Dyn...
2    Accident prevention strategies;Critical system...
3    Distribution system;Distribution systems;Elect...
4    Assistive Technology;Contact surface;Digital s...
Name: Keywords, dtype: object

In [6]:
#
# Number of records whose Keywords value is None
#
sum(1 for kw in df.Keywords if kw is None)

25

In [7]:
#
# Titles of the documents whose Keywords value is None
#
df.loc[df.Keywords.map(lambda kw: kw is None), "Title"]

84      Guest Editorial Special Issue on Embedded Systems
99         Editorial IEEE Latin America Transactions 2020
310     Guest Editorial Special Issue on Embedded Systems
374        Editorial IEEE Latin America transactions 2019
480                        Editorial to the regular issue
702     The SimuroSot Strategy Development Kit: A high...
738     Erratum: Towards a methodology of business pro...
772                        Editorial to the regular issue
1144                       Editorial to the regular issue
2767    IEEE Latin America transactions volume 10 Issu...
3107    7th International Information and Telecommunic...
3112    13th Conference on Software Engineering and Da...
3192    12th Conference of Software Engineering and Da...
3210    Iberoamerican Workshop on Requirements Enginee...
3211    4th International Workshop on Practical Applic...
3287    6th Conference on Telematics Engineering, JITE...
3310    Congress of Electronics, Robotics and Automoti...
3312    11th C

In [8]:
#
# Drop the records whose Keywords value is None
#
df = df.loc[df.Keywords.map(lambda kw: kw is not None)]

In [9]:
#
# Number of records kept after the filter
#
df.shape[0]

3528

In [10]:
#
# Text clustering of keywords.
#   Keyword strings with the same number of words
#
from techminer import Thesaurus, text_clustering

# Keywords are separated by ";" and lower-cased before clustering.
thesaurus = text_clustering(
    df.Keywords,
    sep=";",
    transformer=lambda text: text.lower(),
)

# Dump the textual representation of the thesaurus to the raw file.
with open("thesaurus-text-clustering-raw.json", "w") as f:
    f.write(repr(thesaurus))

In [None]:
import json

#
#   Load the cleaned dictionary.
#   NOTE(review): presumably an edited copy of
#   thesaurus-text-clustering-raw.json written by the previous cell -- confirm.
#
with open("thesaurus-text-clustering-cleaned.json", "r") as f:
    dictionary = json.load(f)

#
#   Build the thesaurus from the cleaned dictionary
#
thesaurus = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
#   Clean the Keywords
#
thesaurus.compile()
df["Keywords"] = df.Keywords.map(lambda x: thesaurus.apply_as_dict(x, sep=";"))

#
#   Strip blanks around each keyword and drop repeated keywords.
#   dict.fromkeys keeps the first-seen order, so the resulting string is the
#   same on every run; joining a set() yields an order that can change
#   between kernel sessions.
#
df["Keywords"] = df.Keywords.map(
    lambda x: ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)

#
#   Replace empty strings by None
#
df["Keywords"] = df.Keywords.map(lambda x: x if x != "" else None)

#
#   Number of unique keyword strings (records set to None are skipped)
#
len({w.strip() for x in df.Keywords if x is not None for w in x.split(";")})

In [None]:
#
# Text nesting
#
from techminer import text_nesting

thesaurus = text_nesting(df.Keywords, sep=';', max_distance=1, transformer=lambda x: x.lower())

#
# Save the thesaurus of candidate substrings to the raw file.
# The object returned by text_nesting() is bound to `thesaurus`; no variable
# named `tn` exists, so writing `tn` raised a NameError.
#
with open('thesaurus-text-nesting-raw.json', 'w') as f:
    f.write(repr(thesaurus))
    


In [None]:
#
#   Load the cleaned dictionary.
#   NOTE(review): presumably an edited copy of
#   thesaurus-text-nesting-raw.json written by the previous cell -- confirm.
#
with open("thesaurus-text-nesting-cleaned.json", "r") as f:
    dictionary = json.load(f)

#
#   Build the thesaurus from the cleaned dictionary
#
thesaurus = Thesaurus(dictionary, ignore_case=False, full_match=True, use_re=False)

#
# Apply the thesaurus
#
df["Keywords"] = df.Keywords.map(lambda x: thesaurus.apply(x, sep=";"))

#
# Strip blanks around each keyword and drop repeated keywords.
#   * Records may already hold None (an earlier step replaces empty strings
#     by None), so None is passed through instead of calling .split on it.
#   * dict.fromkeys keeps the first-seen order, which makes the result
#     identical on every run; joining a set() does not guarantee that.
#
df["Keywords"] = df.Keywords.map(
    lambda x: None
    if x is None
    else ";".join(dict.fromkeys(w.strip() for w in x.split(";")))
)

#
# Replace empty strings by None
#
df["Keywords"] = df.Keywords.map(lambda x: x if x != "" else None)

#
# Number of unique keywords (records set to None are skipped)
#
len({w.strip() for x in df.Keywords if x is not None for w in x.split(";")})

In [None]:
#
# Add a column that identifies each record to the dataframe.
# The frame is wrapped in techminer's DataFrame before calling generate_ID().
#
from techminer import DataFrame

df = DataFrame(df).generate_ID()

In [None]:
#
# Number of terms, as reported by count_report()
#
df.count_report()

In [None]:
#
# Author disambiguation
#
df = DataFrame(df).disambiguate_authors()

In [None]:
#
# Top 10 most frequent authors:
# Authors column of the first 10 rows of documents_by_term
#
df.documents_by_term("Authors").head(10)["Authors"]

In [None]:
#
# Top 10 most cited authors:
# Authors column of the first 10 rows of citations_by_term
#
df.citations_by_term("Authors").head(10)["Authors"]

In [None]:
#
# Top 10 most frequent keywords (first 10 rows of documents_by_term)
#
df.documents_by_term("Keywords", sep=";").iloc[:10]

In [None]:
#
# Top 10 most cited keywords (first 10 rows of citations_by_term)
#
df.citations_by_term("Keywords", sep=";").iloc[:10]

In [None]:
#
# Documents by year
#
# `plt` is needed for the colormap below; it is imported here so the cell
# does not raise a NameError on a fresh kernel.
import matplotlib.pyplot as plt
from techminer import Plot, heatmap

Plot(df.documents_by_year()).bar(cmap=plt.cm.Blues)

In [None]:
#
# Cumulative number of documents by year, drawn with Plot(...).barh().
# Relies on `Plot` having been imported in an earlier cell.
#
Plot(df.documents_by_year(cumulative=True)).barh()

In [None]:
#
# Worldmap with the number of documents by country
#
# `plt` is used for the figure below; it is imported here so the cell does
# not raise a NameError on a fresh kernel. `Plot` comes from an earlier cell.
import matplotlib.pyplot as plt

plt.figure(figsize=(12,5))
Plot(df.documents_by_term("Country", sep=";")).worldmap()
plt.show()