In [2]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
import plotly.express as px
import plotly.io as pio
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [3]:
pio.renderers.default = 'iframe'
pio.get_chrome()
utils.pd_set_options(cols=500)

In [4]:
wr.athena.read_sql_query("""
    SELECT 
        COUNT(*) AS number_of_papers,
        COUNT(DISTINCT subfield_index) AS number_of_subfields,
        COUNT(DISTINCT topic_index) AS number_of_topics
    FROM
        unified_works
""", '03_core')

Unnamed: 0,number_of_papers,number_of_subfields,number_of_topics
0,388735,11,302


In [5]:
df_subfield_stat = wr.athena.read_sql_query("""
SELECT 
    subfield_index, 
    subfield_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM unified_works), 2) AS percent_of_papers
FROM
    unified_works
GROUP BY
    subfield_index, subfield_display_name
ORDER BY
    number_of_papers DESC
 """, '03_core')
df_subfield_stat.head()

Unnamed: 0,subfield_index,subfield_display_name,number_of_papers,percent_of_papers
0,0,Artificial Intelligence,148548,38.21
1,1,Computer Vision and Pattern Recognition,63978,16.46
2,2,Information Systems,53891,13.86
3,3,Computer Networks and Communications,42035,10.81
4,4,Computational Theory and Mathematics,34092,8.77


In [6]:
fig = px.histogram(
    df_subfield_stat,
    x='subfield_display_name',
    y='number_of_papers',
    title='Number of papers per Subfield'
)
fig.update_layout(xaxis_title='')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [7]:
df_topic_stat = wr.athena.read_sql_query("""
SELECT 
    topic_index, 
    topic_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM unified_works), 2) AS percent_of_papers
FROM
    unified_works
GROUP BY
    topic_index, topic_display_name
ORDER BY
    number_of_papers DESC
 """, '03_core')
df_topic_stat.head(1000)

Unnamed: 0,topic_index,topic_display_name,number_of_papers,percent_of_papers
0,0,Topic Modeling,18594,4.78
1,1,Natural Language Processing Techniques,14575,3.75
2,2,Computational Drug Discovery Methods,11454,2.95
3,3,Quantum Information and Cryptography,8393,2.16
4,4,Quantum Computing Algorithms and Architecture,7213,1.86
5,5,Blockchain Technology Applications and Security,6448,1.66
6,6,Advanced Neural Network Applications,6081,1.56
7,7,AI in cancer detection,4854,1.25
8,8,Neural Networks and Applications,4523,1.16
9,9,IoT and Edge/Fog Computing,4473,1.15


In [38]:
fig = px.histogram(
    df_topic_stat,
    x='topic_display_name',
    y='number_of_papers',
    title='Number of papers per Topic'
)
fig.update_xaxes(showticklabels=False)
fig.update_layout(xaxis_title='Topics')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [14]:
df_semibalanced_by_topics = wr.athena.read_sql_query("""
WITH
filtered_by_min_topic_count AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_id ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
    WHERE
        openalex_primary_topic_count > 400 -- ~0.1% cutoff rate
),
stratified_sampling AS (
    SELECT
        *
    FROM
        filtered_by_min_topic_count
    WHERE
        topic_row_number < 3000 -- elbow-logic 
)
SELECT 
    COUNT(*) AS number_of_papers,
    COUNT(DISTINCT openalex_primary_topic_subfield_id) AS number_of_subfields,
    COUNT(DISTINCT openalex_primary_topic_id) AS number_of_topics
FROM
    stratified_sampling
""", '02_stg')
df_semibalanced_by_topics

Unnamed: 0,number_of_papers,number_of_subfields,number_of_topics
0,308153,11,211


In [16]:
df_subfield_semibalanced_stat = wr.athena.read_sql_query("""
WITH
filtered_by_min_topic_count AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_id ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
    WHERE
        openalex_primary_topic_count > 400 -- ~0.1% cutoff rate
),
stratified_sampling AS (
    SELECT
        *
    FROM
        filtered_by_min_topic_count
    WHERE
        topic_row_number < 3000 -- elbow-logic 
)
SELECT 
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    stratified_sampling
GROUP BY
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
df_subfield_semibalanced_stat.head()

Unnamed: 0,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,number_of_papers,percent_of_papers
0,1702,Artificial Intelligence,103665,33.64
1,1707,Computer Vision and Pattern Recognition,58171,18.88
2,1710,Information Systems,42779,13.88
3,1705,Computer Networks and Communications,36350,11.8
4,1703,Computational Theory and Mathematics,24741,8.03


In [17]:
fig = px.histogram(
    df_subfield_semibalanced_stat,
    x='openalex_primary_topic_subfield_display_name',
    y='number_of_papers',
    title='Number of papers per Subfield'
)
fig.update_layout(xaxis_title='')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [18]:
df_topic_semibalanced_stat = wr.athena.read_sql_query("""
WITH
filtered_by_min_topic_count AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_id ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
    WHERE
        openalex_primary_topic_count > 400 -- ~0.1% cutoff rate
),
stratified_sampling AS (
    SELECT
        *
    FROM
        filtered_by_min_topic_count
    WHERE
        topic_row_number < 3000 -- elbow-logic 
)
SELECT 
    openalex_primary_topic_id, 
    openalex_primary_topic_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    stratified_sampling
GROUP BY
    openalex_primary_topic_id, 
    openalex_primary_topic_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
df_topic_semibalanced_stat.head()

Unnamed: 0,openalex_primary_topic_id,openalex_primary_topic_display_name,number_of_papers,percent_of_papers
0,10764,Privacy-Preserving Technologies in Data,2999,0.97
1,10270,Blockchain Technology Applications and Security,2999,0.97
2,10036,Advanced Neural Network Applications,2999,0.97
3,10664,Sentiment Analysis and Opinion Mining,2999,0.97
4,10260,Software Engineering Research,2999,0.97


In [19]:
fig = px.histogram(
    df_topic_semibalanced_stat,
    x='openalex_primary_topic_display_name',
    y='number_of_papers',
    title='Number of papers per Topic'
)
fig.update_xaxes(showticklabels=False)
fig.update_layout(xaxis_title='Topics')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [23]:
df_subfield_semibalanced_02_stat = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_id ORDER BY RANDOM()) AS subfield_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 36000 -- elbow-logic 
)
SELECT 
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    stratified_sampling
GROUP BY
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
df_subfield_semibalanced_02_stat.head()

Unnamed: 0,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,number_of_papers,percent_of_papers
0,1705,Computer Networks and Communications,35999,16.05
1,1707,Computer Vision and Pattern Recognition,35999,16.05
2,1702,Artificial Intelligence,35999,16.05
3,1710,Information Systems,35999,16.05
4,1703,Computational Theory and Mathematics,34092,15.2


In [24]:
fig = px.histogram(
    df_subfield_semibalanced_02_stat,
    x='openalex_primary_topic_subfield_display_name',
    y='number_of_papers',
    title='Number of papers per Subfield'
)
fig.update_layout(xaxis_title='')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [25]:
df_topic_semibalanced_02_stat = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_id ORDER BY RANDOM()) AS subfield_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 36000 -- elbow-logic 
)
SELECT 
    openalex_primary_topic_id, 
    openalex_primary_topic_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    stratified_sampling
GROUP BY
    openalex_primary_topic_id, 
    openalex_primary_topic_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
df_topic_semibalanced_02_stat.head()

Unnamed: 0,openalex_primary_topic_id,openalex_primary_topic_display_name,number_of_papers,percent_of_papers
0,10211,Computational Drug Discovery Methods,11454,5.11
1,10028,Topic Modeling,4473,1.99
2,10270,Blockchain Technology Applications and Security,4248,1.89
3,11122,Online Learning and Analytics,3917,1.75
4,10273,IoT and Edge/Fog Computing,3815,1.7


In [22]:
fig = px.histogram(
    df_topic_semibalanced_02_stat,
    x='openalex_primary_topic_display_name',
    y='number_of_papers',
    title='Number of papers per Topic'
)
fig.update_xaxes(showticklabels=False)
fig.update_layout(xaxis_title='Topics')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [50]:
df_topic_semibalanced_03_stat = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_display_name ORDER BY RANDOM()) AS subfield_row_number,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_display_name ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 20000 AND -- elbow-logic 
        topic_row_number < 2500 -- elbow-logic 
),
counted_by_topic AS (
    SELECT
        *,
        COUNT(*) OVER (PARTITION BY openalex_primary_topic_display_name) AS new_topic_count
    FROM
        stratified_sampling
),
filtered_by_min_topic_count AS (
    SELECT
        *
    FROM
        counted_by_topic
    WHERE
        new_topic_count > 400 -- ~0.1% cutoff rate
)
SELECT 
    openalex_primary_topic_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    filtered_by_min_topic_count
GROUP BY
    openalex_primary_topic_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
display(df_topic_semibalanced_03_stat.head())
df_topic_semibalanced_03_stat.tail()

Unnamed: 0,openalex_primary_topic_display_name,number_of_papers,percent_of_papers
0,Computational Drug Discovery Methods,2499,1.8
1,Virtual Reality Applications and Impacts,2499,1.8
2,Online Learning and Analytics,2499,1.8
3,Speech and Audio Processing,2499,1.8
4,Music and Audio Processing,2499,1.8


Unnamed: 0,openalex_primary_topic_display_name,number_of_papers,percent_of_papers
101,Neural Networks Stability and Synchronization,457,0.33
102,Data Mining Algorithms and Applications,450,0.32
103,Interactive and Immersive Displays,442,0.32
104,Economic Growth and Development,404,0.29
105,Domain Adaptation and Few-Shot Learning,401,0.29


In [51]:
fig = px.histogram(
    df_topic_semibalanced_03_stat,
    x='openalex_primary_topic_display_name',
    y='number_of_papers',
    title='Number of papers per Topic'
)
fig.update_xaxes(showticklabels=False)
fig.update_layout(xaxis_title='Topics')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [52]:
df_subfield_semibalanced_03_stat = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_display_name ORDER BY RANDOM()) AS subfield_row_number,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_display_name ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 20000 AND -- elbow-logic 
        topic_row_number < 2500 -- elbow-logic 
),
counted_by_topic AS (
    SELECT
        *,
        COUNT(*) OVER (PARTITION BY openalex_primary_topic_display_name) AS new_topic_count
    FROM
        stratified_sampling
),
filtered_by_min_topic_count AS (
    SELECT
        *
    FROM
        counted_by_topic
    WHERE
        new_topic_count > 400 -- ~0.1% cutoff rate
)
SELECT 
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM stratified_sampling), 2) AS percent_of_papers
FROM
    filtered_by_min_topic_count
GROUP BY
    openalex_primary_topic_subfield_id, 
    openalex_primary_topic_subfield_display_name
ORDER BY
    number_of_papers DESC
 """, '02_stg')
df_subfield_semibalanced_03_stat.head()

Unnamed: 0,openalex_primary_topic_subfield_id,openalex_primary_topic_subfield_display_name,number_of_papers,percent_of_papers
0,1707,Computer Vision and Pattern Recognition,17058,12.27
1,1711,Signal Processing,14926,10.74
2,1705,Computer Networks and Communications,14358,10.33
3,1703,Computational Theory and Mathematics,13586,9.78
4,1710,Information Systems,11588,8.34


In [53]:
fig = px.histogram(
    df_subfield_semibalanced_03_stat,
    x='openalex_primary_topic_subfield_display_name',
    y='number_of_papers',
    title='Number of papers per Subfield'
)
fig.update_layout(xaxis_title='')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [56]:
df_semibalanced_03_stat = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_display_name ORDER BY RANDOM()) AS subfield_row_number,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_display_name ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 20000 AND -- elbow-logic 
        topic_row_number < 2500 -- elbow-logic 
),
counted_by_topic AS (
    SELECT
        *,
        COUNT(*) OVER (PARTITION BY openalex_primary_topic_display_name) AS new_topic_count
    FROM
        stratified_sampling
),
filtered_by_min_topic_count AS (
    SELECT
        *
    FROM
        counted_by_topic
    WHERE
        new_topic_count > 400 -- ~0.1% cutoff rate
)
SELECT 
    COUNT(*) AS number_of_papers,
    COUNT(DISTINCT openalex_primary_topic_subfield_display_name) AS number_of_subfields,
    COUNT(DISTINCT openalex_primary_topic_display_name) AS number_of_topics
FROM
    filtered_by_min_topic_count
 """, '02_stg')
df_semibalanced_03_stat.head()

Unnamed: 0,number_of_papers,number_of_subfields,number_of_topics
0,107724,11,106


In [5]:
df_works_semibalanced = wr.athena.read_sql_query("""
WITH
works_numbered AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_subfield_display_name ORDER BY RANDOM()) AS subfield_row_number,
        ROW_NUMBER() OVER (PARTITION BY openalex_primary_topic_display_name ORDER BY RANDOM()) AS topic_row_number
    FROM
        stg_unified_works_filtered
),
stratified_sampling AS (
    SELECT
        *
    FROM
        works_numbered
    WHERE
        subfield_row_number < 20000 AND -- elbow-logic 
        topic_row_number < 2500 -- elbow-logic 
),
counted_by_topic AS (
    SELECT
        *,
        COUNT(*) OVER (PARTITION BY openalex_primary_topic_display_name) AS new_topic_count
    FROM
        stratified_sampling
),
filtered_by_min_topic_count AS (
    SELECT
        *
    FROM
        counted_by_topic
    WHERE
        new_topic_count > 400 -- ~0.1% cutoff rate
),
----
subfield_counts AS (
    SELECT
        openalex_primary_topic_subfield_display_name,
        COUNT(*) AS openalex_primary_topic_subfield_count
    FROM
        filtered_by_min_topic_count
    GROUP BY
        openalex_primary_topic_subfield_display_name
),
subfield_index AS (
    SELECT
        *,
        (ROW_NUMBER() OVER () -1) AS openalex_primary_topic_subfield_index
    FROM
        subfield_counts
),
topic_counts AS (
    SELECT
        openalex_primary_topic_display_name,
        COUNT(*) AS openalex_primary_topic_count
    FROM
        filtered_by_min_topic_count
    GROUP BY
        openalex_primary_topic_display_name
),
topic_index AS (
    SELECT
        *,
        (ROW_NUMBER() OVER () -1) AS openalex_primary_topic_index
    FROM
        topic_counts
),
reindexed AS (
    SELECT    
        stg_unified_works_filtered.id_semanticscholar,
        stg_unified_works_filtered.id_mag,
        stg_unified_works_filtered.id_doi,
        stg_unified_works_filtered.id_arxiv,
        stg_unified_works_filtered.publication_year,
        stg_unified_works_filtered.publication_date,
        stg_unified_works_filtered.license,
        stg_unified_works_filtered.license_allows_derivative_reuse,
        stg_unified_works_filtered.source_url,
        stg_unified_works_filtered.has_id_mag,
        stg_unified_works_filtered.has_id_doi,
        stg_unified_works_filtered.has_id_mag_or_doi,
        stg_unified_works_filtered.openalex_id_openalex,
        stg_unified_works_filtered.openalex_id_doi,
        stg_unified_works_filtered.openalex_language,
        stg_unified_works_filtered.openalex_primary_topic_id,
        stg_unified_works_filtered.openalex_primary_topic_display_name,
        stg_unified_works_filtered.openalex_primary_topic_count,
        topic_index.openalex_primary_topic_index,
        stg_unified_works_filtered.openalex_primary_topic_subfield_id,
        stg_unified_works_filtered.openalex_primary_topic_subfield_display_name,
        stg_unified_works_filtered.openalex_primary_topic_subfield_count,
        subfield_index.openalex_primary_topic_subfield_index,
        stg_unified_works_filtered.openalex_primary_topic_field_id,
        stg_unified_works_filtered.openalex_primary_topic_field_display_name,
        stg_unified_works_filtered.openalex_primary_topic_domain_id,
        stg_unified_works_filtered.openalex_primary_topic_domain_display_name,
        stg_unified_works_filtered.openalex_joined_on,
        stg_unified_works_filtered.title,
        stg_unified_works_filtered.content_abstract,
        stg_unified_works_filtered.content_text,
        stg_unified_works_filtered.annotations_paragraph,
        stg_unified_works_filtered.annotations_section_header
    FROM
        stg_unified_works_filtered
        
    LEFT JOIN
        topic_index
    ON
        stg_unified_works_filtered.openalex_primary_topic_display_name = topic_index.openalex_primary_topic_display_name
    
    LEFT JOIN
        subfield_index
    ON
        stg_unified_works_filtered.openalex_primary_topic_subfield_display_name = subfield_index.openalex_primary_topic_subfield_display_name
),
reindexed_stratified AS (
    SELECT 
        *,
        NTILE(10) OVER( PARTITION BY openalex_primary_topic_display_name ORDER BY random()) AS bucket_10p
    FROM
        reindexed
),
reindexed_tagged AS (
    SELECT 
        *,
        CASE 
            WHEN bucket_10p = 1 THEN 'test'
            WHEN bucket_10p = 2 THEN 'validation'
            ELSE 'train'
        END AS subset --80-10-10 split
    FROM
        reindexed_stratified
    WHERE
        openalex_language='en' AND
        license_allows_derivative_reuse=1
)
SELECT * FROM reindexed_tagged ORDER BY RANDOM() LIMIT 100
 """, '02_stg')
df_works_semibalanced.head(100)

Unnamed: 0,id_semanticscholar,id_mag,id_doi,id_arxiv,publication_year,publication_date,license,license_allows_derivative_reuse,source_url,has_id_mag,...,openalex_primary_topic_domain_id,openalex_primary_topic_domain_display_name,openalex_joined_on,title,content_abstract,content_text,annotations_paragraph,annotations_section_header,bucket_10p,subset
0,247655188,,10.1109/access.2022.3161470,,2022,,CCBY,1,https://doi.org/10.1109/ACCESS.2022.3161470,0,...,3,Physical Sciences,doi,A Comparison of Promethee and TOPSIS Technique...,The uncertainty in the data is an obstacle in ...,\nI. INTRODUCTION\n\nM ANY complicated problem...,"[{""attributes"":null,""end"":8340,""start"":18},{""a...","[{""attributes"":null,""end"":16,""start"":1},{""attr...",4,train
1,252243320,,10.3390/s22186832,,2022,2022-09-01,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC9505052,0,...,3,Physical Sciences,doi,Efficient Clustering for Continuous Occupancy ...,This paper proposes a novel method for occupan...,\nIntroduction\n\nMapping is one of the fundam...,"[{""attributes"":null,""end"":246,""start"":15},{""at...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},...",5,train
2,253290325,,10.3390/healthcare10112189,,2022,2022-10-31,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC9690420,0,...,3,Physical Sciences,doi,A Multimodal Auxiliary Classification System f...,Histopathological examination is an important ...,\nIntroduction\n\nThe incidence of osteosarcom...,"[{""attributes"":null,""end"":584,""start"":15},{""at...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},...",5,train
3,240763749,,10.31235/osf.io/aj34w,,2020,2020-01-24,CCBY,1,https://doi.org/10.31235/osf.io/aj34w,0,...,3,Physical Sciences,doi,Platform Governance as Reflexive Coordination ...,Digital platforms have become dominant players...,\nBackground\n\nDigital platforms have become ...,"[{""attributes"":null,""end"":1087,""start"":13},{""a...","[{""attributes"":null,""end"":11,""start"":1},{""attr...",10,train
4,18957185,2168768030,10.1186/1687-1499-2013-159,,2013,2013-06-11,CCBY,1,https://doi.org/10.1186/1687-1499-2013-159,1,...,3,Physical Sciences,doi,A new mathematical analysis of the probability...,Cognitive radio (CR) enriches wireless technol...,\nIntroduction\n\nThe rapid increase of wirele...,"[{""attributes"":null,""end"":890,""start"":15},{""at...","[{""attributes"":{""n"":""1""},""end"":13,""start"":1},{...",6,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,114707019,2566913441,10.5755/j01.eie.22.6.17230,,2016,2016-08-12,CCBY,1,https://doi.org/10.5755/J01.EIE.22.6.17230,1,...,3,Physical Sciences,doi,Interconnection Contracts between Service and ...,This paper addresses interconnection contracts...,\nI. INTRODUCTION\n\nThe permanent growth of I...,"[{""attributes"":null,""end"":1041,""start"":18},{""a...","[{""attributes"":null,""end"":16,""start"":1},{""attr...",7,train
96,237592250,,10.1155/2021/2122095,,2021,2021-09-21,CCBY,1,https://pmc.ncbi.nlm.nih.gov/articles/PMC8455217,0,...,3,Physical Sciences,doi,Analysis on Health Information Acquisition of ...,This study aims to explore phenomena and laws ...,\nIntroduction\n\nWith the popularity of the I...,"[{""attributes"":null,""end"":1546,""start"":15},{""a...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},...",8,train
97,273629230,,10.6007/ijarped/v13-i4/22957,,2024,2024-10-25,CCBY,1,https://doi.org/10.6007/ijarped/v13-i4/22957,0,...,3,Physical Sciences,doi,The Problems of the Effectiveness of School-En...,The development of vocational education cannot...,\nIntroduction\n\nTaking an overview of the de...,"[{""attributes"":null,""end"":984,""start"":15},{""at...","[{""attributes"":null,""end"":13,""start"":1},{""attr...",8,train
98,22278945,2154883722,10.5772/5785,cs/0601062,2005,2005-09-01,CCBY,1,https://arxiv.org/abs/cs/0601062,1,...,3,Physical Sciences,doi,Study of Self-Organization Model of Multiple M...,A good organization model of multiple mobile r...,"\nIntroduction\n\nAt present, the research of ...","[{""attributes"":null,""end"":1509,""start"":15},{""a...","[{""attributes"":{""n"":""1.""},""end"":13,""start"":1},...",10,train


In [6]:
wr.athena.read_sql_query("""
    SELECT 
        COUNT(*) AS number_of_papers,
        COUNT(DISTINCT subfield_index) AS number_of_subfields,
        COUNT(DISTINCT topic_index) AS number_of_topics
    FROM
        unified_works_semibalanced
""", '03_core')

Unnamed: 0,number_of_papers,number_of_subfields,number_of_topics
0,108174,11,107


In [14]:
df_topic_semibalanced_final_stat = wr.athena.read_sql_query("""
SELECT 
    topic_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM unified_works_semibalanced), 2) AS percent_of_papers
FROM
    unified_works_semibalanced
GROUP BY
    topic_display_name
ORDER BY
    number_of_papers DESC
 """, '03_core')
display(df_topic_semibalanced_final_stat.head())
df_topic_semibalanced_final_stat.tail()

Unnamed: 0,topic_display_name,number_of_papers,percent_of_papers
0,Virtual Reality Applications and Impacts,2499,2.31
1,Online Learning and Analytics,2499,2.31
2,Speech and Audio Processing,2499,2.31
3,Music and Audio Processing,2499,2.31
4,Computational Drug Discovery Methods,2499,2.31


Unnamed: 0,topic_display_name,number_of_papers,percent_of_papers
102,Data Mining Algorithms and Applications,457,0.42
103,Interactive and Immersive Displays,442,0.41
104,Adversarial Robustness in Machine Learning,432,0.4
105,Spam and Phishing Detection,423,0.39
106,Economic Growth and Development,405,0.37


In [15]:
fig = px.histogram(
    df_topic_semibalanced_final_stat,
    x='topic_display_name',
    y='number_of_papers',
    title='Number of papers per Topic'
)
fig.update_xaxes(showticklabels=False)
fig.update_layout(xaxis_title='Topics')
fig.update_layout(yaxis_title='Number of papers')
fig.show()

In [16]:
df_subfield_semibalanced_final_stat = wr.athena.read_sql_query("""
SELECT 
    subfield_display_name,
    COUNT(*) AS number_of_papers,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) AS c FROM unified_works_semibalanced), 2) AS percent_of_papers
FROM
    unified_works_semibalanced
GROUP BY
    subfield_display_name
ORDER BY
    number_of_papers DESC
 """, '03_core')
df_subfield_semibalanced_final_stat

Unnamed: 0,subfield_display_name,number_of_papers,percent_of_papers
0,Computer Vision and Pattern Recognition,17031,15.74
1,Signal Processing,14926,13.8
2,Computer Networks and Communications,14287,13.21
3,Computational Theory and Mathematics,13748,12.71
4,Information Systems,12436,11.5
5,Artificial Intelligence,9850,9.11
6,Human-Computer Interaction,8467,7.83
7,Computer Science Applications,6971,6.44
8,Computer Graphics and Computer-Aided Design,4059,3.75
9,Hardware and Architecture,3883,3.59


In [17]:
fig = px.histogram(
    df_subfield_semibalanced_final_stat,
    x='subfield_display_name',
    y='number_of_papers',
    title='Number of papers per Subfield'
)
fig.update_layout(xaxis_title='')
fig.update_layout(yaxis_title='Number of papers')
fig.show()