In [1]:
# Install latest version from GitHub
!pip install -q -U git+https://github.com/jdvelasq/techminer

In [2]:
import pandas as pd

from techminer import DataFrame

#
# Load the cleaned tutorial dataset (line-delimited JSON records) from the
# techminer repository and wrap it in a techminer DataFrame.
#
df = DataFrame(
    pd.read_json(
        "https://raw.githubusercontent.com/jdvelasq/techminer/master/data/tutorial/"
        "cleaned-data.json",
        lines=True,
        orient="records",
    )
)

# Direct queries over data

Some queries can be made directly on the dataframe, without resorting to special functions or additional code.

In [3]:
#
# Number of records (rows) in the dataframe
#
len(df)

145

In [4]:
#
# Column labels available in the dataframe
#
df.columns

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume',
       'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
       'DOI', 'Affiliations', 'Document Type', 'Access Type', 'Source', 'EID',
       'Abstract', 'Author Keywords', 'Index Keywords', 'References',
       'keywords', 'CONF', 'fingerprint', 'keywords_cleaned', 'ID',
       'top_10_Authors_freq', 'top_10_keywords_freq',
       'top_10_Source_title_freq', 'top_10_Authors_cited_by',
       'top_10_keywords_cited_by', 'top_10_Source_title_cited_by'],
      dtype='object')

In [5]:
#
# Data coverage: one row per column with its number of items and that
# number as a percentage of the records (e.g. 98 of 145 -> 67.59%).
#
df.coverage()

Unnamed: 0,Column,Number of items,Coverage (%)
0,Authors,145,100.00%
1,Author(s) ID,145,100.00%
2,Title,145,100.00%
3,Year,145,100.00%
4,Source title,145,100.00%
5,Volume,98,67.59%
6,Issue,27,18.62%
7,Art. No.,49,33.79%
8,Page start,120,82.76%
9,Page end,120,82.76%


In [6]:
#
# Number of terms per column (summary table)
#
df.count_report()

Unnamed: 0,Column,Number of items
0,Authors,434
1,Author(s) ID,434
2,Source title,103
3,Author Keywords,407
4,Index Keywords,884


In [7]:
#
# Number of terms for an individual column (here 'Author Keywords')
#
df.count_terms('Author Keywords')

407

In [8]:
#
# Top N most cited documents (first 10 rows)
#
# NOTE(review): in the recorded run this cell raised
# "AttributeError: module 'numpy' has no attribute 'isna'" and the cells
# below were not executed. The failing call is not in this notebook, so it
# presumably comes from the installed techminer version -- TODO confirm and
# re-run.
#
df.most_cited_documents().head(10)

AttributeError: module 'numpy' has no attribute 'isna'

In [None]:
#
# Alternative way to list the most cited documents: call citations_by_term
# on the 'Title' column and keep the first 10 rows.
#
df.citations_by_term('Title').head(10)

In [None]:
#
# Most cited authors (first rows only; head() is called without an argument,
# which is 5 rows if the result is a pandas object -- TODO confirm)
#
df.most_cited_authors().head()

## Record extraction by IDs

In [None]:
#
# IDs for the top five documents: the 'ID' column of the first five rows
# returned by citations_by_term('Title').
# NOTE(review): presumably that result is ordered by citations -- confirm.
#
IDs = df.citations_by_term('Title')['ID'].head(5)
IDs

In [None]:
#
# Selects the `Title` and `Authors` columns of the records matching `IDs`
#
df.get_rows_by_IDs(IDs)[['Title', 'Authors']]