In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import huggingface_hub
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role
from tqdm.auto import tqdm
import fsspec


# Make the project's local modules importable by adding ../01_modules
# (development layout) and/or ./01_modules (production layout) to sys.path.
if '__file__' in globals():
    # Running as a script: resolve relative to the directory of this file.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # __file__ is not defined (e.g. in a notebook kernel): fall back to the
    # current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Only register directories that exist, and skip ones already present so
    # that re-running this cell does not pile duplicate entries onto sys.path.
    if os.path.exists(modules_path) and modules_path not in sys.path:
        sys.path.append(modules_path)


# Python caches imported modules for the lifetime of the kernel, so re-running
# a plain `import` does not pick up edits made to a local module after it was
# first loaded.
# As a workaround, each local module is imported and then explicitly reloaded
# with importlib.reload, so re-running this cell re-executes the module source.
# The reload result is bound to `_` (presumably to keep the module repr out of
# the cell output).
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# Fetch the Hugging Face token via the project's secret helper and use it to
# authenticate this session against the Hub.
hf_secret = utils.get_secret(
    region_name=config.AWS_REGION,
    secret_name='HuggingFaceHub',
)
hf_api_key = hf_secret['secret_token']
huggingface_hub.login(token=hf_api_key)

In [4]:
import boto3
import os
from pathlib import Path
import time

# One-off transfer of the per-subset files from S3 to the Hugging Face Hub
# dataset repo. Disabled by default; flip the condition to True to re-run it.
if False:
    s3_client = boto3.client('s3')
    bucket_name = 'sagemaker-research-methodology-extraction'
    timelogger = utils.TimeLogger()
    destination_dataset = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'
    # Seconds to wait after each uploaded file (presumably to stay below Hub
    # rate limits -- TODO confirm).
    upload_pause_seconds = 10

    # Loop-invariant setup: the scratch directory and the paginator are the
    # same for every subset.
    local_download_path = 'temp_files'
    temp_root = Path(local_download_path).resolve()
    paginator = s3_client.get_paginator('list_objects_v2')

    for subset in ['train', 'test', 'validation']:
        timelogger.log(f'START subset: {subset}')
        s3_folder_prefix = f'01_data/03_core/unified_works_{subset}'
        utils.ensure_path(local_download_path)

        file_entry_pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix)

        for file_entry_page in file_entry_pages:
            # A page without any matching object carries no 'Contents' key.
            for file_entry in file_entry_page.get('Contents', []):
                s3_key = file_entry['Key']
                relative_path = Path(s3_key).relative_to(s3_folder_prefix)
                path_in_repo = f'data/{subset}/{relative_path}.parquet'
                print('path_in_repo', path_in_repo)
                local_file_path = Path(local_download_path) / subset / relative_path
                local_file_path.parent.mkdir(parents=True, exist_ok=True)

                try:
                    s3_client.download_file(bucket_name, s3_key, str(local_file_path))
                    huggingface_hub.upload_file(
                        path_or_fileobj=local_file_path,
                        path_in_repo=path_in_repo,
                        repo_id=destination_dataset,
                        repo_type="dataset",
                    )
                finally:
                    # Always clean up the scratch copy, even when the transfer
                    # fails. Safety guard: only delete files that really live
                    # below the scratch directory. The check is relative to
                    # temp_root instead of a hard-coded absolute home path, so
                    # cleanup also works when the notebook runs elsewhere.
                    inside_temp_root = temp_root in local_file_path.resolve().parents
                    if inside_temp_root and local_file_path.exists():
                        os.remove(local_file_path)
                time.sleep(upload_pause_seconds)
        timelogger.log(f'END subset: {subset}')

In [10]:
from datasets import load_dataset, DatasetDict, Dataset
# Load the 'default' configuration of the Hub dataset. No split is requested,
# so the result holds all available splits; the bare `dataset` expression at
# the end renders its summary as the cell output.
dataset_name = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'
dataset = load_dataset(path=dataset_name, name='default')
dataset

README.md: 0.00B [00:00, ?B/s]

Using the latest cached version of the dataset since SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/sagemaker-user/.cache/huggingface/datasets/SteveAKopias___semantic_scholar_cs_full_text_with_open_alex_topics/default/0.0.0/4277fb6c72ba727c72d7504d852b958b6a97436c (last modified on Tue Sep 16 18:46:48 2025).


Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 310751
    })
    validation: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 38967
    })
    test: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 39017
    })
})

In [12]:
def filter_not_null(example, column_name):
    """Return True when ``example[column_name]`` holds a value other than None.

    Args:
        example: Mapping-like row that can be indexed by column name.
        column_name: Key of the field to inspect.

    Returns:
        bool: False only if the stored value is None; falsy values such as
        ``''`` or ``0`` still count as present.

    Raises:
        KeyError: If a dict ``example`` has no ``column_name`` entry.
    """
    value = example[column_name]
    return value is not None

In [18]:
# Build the (title, subfield_index) view of the dataset and publish it as the
# 'title_subfield' configuration of the Hub repo.
# Both null checks run in one filter pass, so every row is visited once
# instead of twice; the resulting rows are the same as with two chained filters.
dataset_title_subfieldindex = dataset.select_columns(['title', 'subfield_index']).filter(
    lambda x: filter_not_null(x, 'title') and filter_not_null(x, 'subfield_index')
)
# Last expression of the cell: the returned commit info is shown as output.
dataset_title_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='title_subfield',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/aa047a70d9ae6b554398ea952595097237c5ab96', commit_message='Upload dataset', commit_description='', oid='aa047a70d9ae6b554398ea952595097237c5ab96', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [19]:
# Build the (title, topic_index) view of the dataset and publish it as the
# 'title_topic' configuration of the Hub repo.
# Both null checks run in one filter pass, so every row is visited once
# instead of twice; the resulting rows are the same as with two chained filters.
dataset_title_topicindex = dataset.select_columns(['title', 'topic_index']).filter(
    lambda x: filter_not_null(x, 'title') and filter_not_null(x, 'topic_index')
)
# Last expression of the cell: the returned commit info is shown as output.
dataset_title_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='title_topic',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/d587f52ec5833bbd6eeb11d65dd13a6f72d65f27', commit_message='Upload dataset', commit_description='', oid='d587f52ec5833bbd6eeb11d65dd13a6f72d65f27', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [20]:
# Build the two abstract-based views of the dataset and publish them as the
# 'abstract_subfield' and 'abstract_topic' configurations of the Hub repo.
# For each view both null checks run in one filter pass, so every row is
# visited once instead of twice; the resulting rows are unchanged.
dataset_abstract_subfieldindex = dataset.select_columns(['abstract', 'subfield_index']).filter(
    lambda x: filter_not_null(x, 'abstract') and filter_not_null(x, 'subfield_index')
)
dataset_abstract_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='abstract_subfield',
    num_proc=8,
)

dataset_abstract_topicindex = dataset.select_columns(['abstract', 'topic_index']).filter(
    lambda x: filter_not_null(x, 'abstract') and filter_not_null(x, 'topic_index')
)
# Last expression of the cell: the returned commit info is shown as output.
dataset_abstract_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='abstract_topic',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

  [2m2025-09-16T23:17:46.925451Z[0m [33m WARN[0m  [33mStatus Code: 504. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220



Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/6e4e6688498883c6f0b8674a6d481bb9fb74a4b4', commit_message='Upload dataset', commit_description='', oid='6e4e6688498883c6f0b8674a6d481bb9fb74a4b4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [23]:
# Build the (fulltext, subfield_index) view of the dataset and publish it as
# the 'fulltext_subfield' configuration of the Hub repo.
# Both null checks run in one filter pass, so every row is visited once
# instead of twice; the resulting rows are the same as with two chained filters.
dataset_fulltext_subfieldindex = dataset.select_columns(['fulltext', 'subfield_index']).filter(
    lambda x: filter_not_null(x, 'fulltext') and filter_not_null(x, 'subfield_index')
)
# Last expression of the cell: the returned commit info is shown as output.
dataset_fulltext_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='fulltext_subfield',
    num_proc=8,
)

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  524kB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  239MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  241MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.68MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 28.4kB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 30.0kB /  233MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  240MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  206MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  206MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  210MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/8ae1ccebe3b0c5b79be723e3fdc84d9ce341147e', commit_message='Upload dataset', commit_description='', oid='8ae1ccebe3b0c5b79be723e3fdc84d9ce341147e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [13]:
# Build the (fulltext, topic_index) view of the dataset and publish it as the
# 'fulltext_topic' configuration of the Hub repo.
# Both null checks run in one filter pass, so every row is visited once
# instead of twice; the resulting rows are the same as with two chained filters.
dataset_fulltext_topicindex = dataset.select_columns(['fulltext', 'topic_index']).filter(
    lambda x: filter_not_null(x, 'fulltext') and filter_not_null(x, 'topic_index')
)
# NOTE(review): this push uses num_proc=1 while the other configurations use
# num_proc=8 -- kept as in the original; confirm whether that is intentional.
# Last expression of the cell: the returned commit info is shown as output.
dataset_fulltext_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    config_name='fulltext_topic',
    num_proc=1,
)

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  11%|#         | 24.8MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.1MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  18%|#7        | 41.7MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.2MB /  239MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.3MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  17%|#7        | 41.7MB /  241MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  18%|#7        | 41.8MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.1MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  17%|#7        | 41.4MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  18%|#7        | 41.7MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  18%|#7        | 41.7MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.5MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.3MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.3MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.2MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.0MB /  233MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.4MB /  240MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  16%|#6        | 33.4MB /  206MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  16%|#5        | 33.4MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  16%|#6        | 33.3MB /  206MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  12%|#1        | 25.1MB /  210MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  16%|#5        | 33.4MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  16%|#5        | 33.3MB /  209MB            

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/3e48a18e34575e20f2e58249ebeb169c985276de', commit_message='Upload dataset', commit_description='', oid='3e48a18e34575e20f2e58249ebeb169c985276de', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [4]:
import boto3
import os
from pathlib import Path
import time

# Mirror the semibalanced train/test/validation files from S3 into the Hugging Face
# dataset repo, one file at a time: download -> upload -> delete the local copy.
# `if True:` is a manual run-toggle; flip it to False to skip this cell on a re-run.
if True:
    s3_client = boto3.client('s3')
    bucket_name = 'sagemaker-research-methodology-extraction'
    timelogger = utils.TimeLogger()
    destination_dataset = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'

    # Loop-invariant setup, done once instead of once per subset.
    local_download_path = 'temp_files'
    utils.ensure_path(local_download_path)  # presumably creates the directory if missing -- see utils
    # Resolved root of the scratch directory; only files underneath it are ever deleted.
    local_download_root = Path(local_download_path).resolve()
    paginator = s3_client.get_paginator('list_objects_v2')

    for subset in ['train', 'test', 'validation']:
        timelogger.log(f'START subset: {subset}')
        s3_folder_prefix = f'01_data/03_core/unified_works_semibalanced_{subset}'

        # The trailing slash restricts the listing to this exact folder. Without it,
        # a sibling such as '..._train_v2/' would also match and then make
        # Path.relative_to() below raise ValueError.
        file_entry_pages = paginator.paginate(Bucket=bucket_name, Prefix=f'{s3_folder_prefix}/')

        for file_entry_page in file_entry_pages:
            # Pages of an empty listing carry no 'Contents' key.
            for file_entry in file_entry_page.get('Contents', []):
                s3_key = file_entry['Key']
                if s3_key.endswith('/'):
                    # Zero-byte "folder" placeholder object: nothing to download.
                    continue

                relative_path = Path(s3_key).relative_to(s3_folder_prefix)
                # Append '.parquet' unless the key already ends with it, so that
                # re-exported files never end up as '*.parquet.parquet' in the repo.
                repo_file_name = relative_path.as_posix()
                if relative_path.suffix != '.parquet':
                    repo_file_name += '.parquet'
                path_in_repo = f'data/{subset}/{repo_file_name}'
                print('path_in_repo', path_in_repo)

                local_file_path = Path(local_download_path) / subset / relative_path
                local_file_path.parent.mkdir(parents=True, exist_ok=True)

                try:
                    s3_client.download_file(bucket_name, s3_key, str(local_file_path))
                    huggingface_hub.upload_file(
                        path_or_fileobj=local_file_path,
                        path_in_repo=path_in_repo,
                        repo_id=destination_dataset,
                        repo_type="dataset",
                    )
                finally:
                    # Always reclaim disk space, even when the transfer failed, but
                    # only for files that really live inside the scratch directory
                    # (derived from local_download_path instead of a hard-coded
                    # absolute path, so the cell works from any working directory).
                    if local_download_root in local_file_path.resolve().parents:
                        local_file_path.unlink(missing_ok=True)

                # Pause between files; presumably to stay clear of Hub rate limits -- confirm.
                time.sleep(10)
        timelogger.log(f'END subset: {subset}')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
 :: START subset: train | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/train/20250920_132133_00086_eq4yn_12cb70fa-ffa8-4087-b140-f2825cdc5874.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2cb70fa-ffa8-4087-b140-f2825cdc5874:   0%|          |  524kB /  150MB            

path_in_repo data/train/20250920_132133_00086_eq4yn_265d0fc4-614b-4688-ba7d-120d7c2c5748.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...65d0fc4-614b-4688-ba7d-120d7c2c5748:   2%|2         | 3.67MB /  152MB            

path_in_repo data/train/20250920_132133_00086_eq4yn_96261e05-d428-4377-b139-6c0bfac924b4.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6261e05-d428-4377-b139-6c0bfac924b4:   3%|2         | 3.67MB /  145MB            

path_in_repo data/train/20250920_132133_00086_eq4yn_b4eb19f0-a4ad-4c1e-9272-44d8cea4c02a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4eb19f0-a4ad-4c1e-9272-44d8cea4c02a:   2%|2         | 3.67MB /  153MB            

path_in_repo data/train/20250920_132133_00086_eq4yn_bf7370cc-8f79-4359-9a96-1429310100f3.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f7370cc-8f79-4359-9a96-1429310100f3:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250920_132133_00086_eq4yn_eec8fd29-d22b-4850-9ab6-4b24f0a00480.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ec8fd29-d22b-4850-9ab6-4b24f0a00480:   3%|2         | 3.67MB /  146MB            

 :: END subset: train | since_start: 1.0 minute, 44.31 seconds | since_last: 1.0 minute, 44.31 seconds :: 
 :: START subset: test | since_start: 1.0 minute, 44.31 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/test/20250920_132214_00047_6sgbt_1ba3264b-4063-492e-9a80-af7fa52d59c6.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ba3264b-4063-492e-9a80-af7fa52d59c6:  21%|##1       | 3.67MB / 17.3MB            

path_in_repo data/test/20250920_132214_00047_6sgbt_26cc05f6-9182-45c8-8f7c-7fe69ca7381b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6cc05f6-9182-45c8-8f7c-7fe69ca7381b:  22%|##2       | 3.67MB / 16.4MB            

path_in_repo data/test/20250920_132214_00047_6sgbt_5baa8069-592a-4115-aa1f-92ec0cf6df70.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...baa8069-592a-4115-aa1f-92ec0cf6df70:  19%|#9        | 3.67MB / 19.1MB            

path_in_repo data/test/20250920_132214_00047_6sgbt_88a66054-e92a-4c5c-9798-e32f61b92ae9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8a66054-e92a-4c5c-9798-e32f61b92ae9:  21%|##        | 3.67MB / 17.5MB            

path_in_repo data/test/20250920_132214_00047_6sgbt_cf9f7763-6b9d-4cd6-aac5-dec77eaf329e.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f9f7763-6b9d-4cd6-aac5-dec77eaf329e:  16%|#6        | 3.67MB / 22.6MB            

path_in_repo data/test/20250920_132214_00047_6sgbt_d3248a74-112a-4fcc-8492-92c4217c74dc.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...3248a74-112a-4fcc-8492-92c4217c74dc:  18%|#7        | 3.67MB / 20.6MB            

 :: END subset: test | since_start: 3.0 minutes, 2.11 seconds | since_last: 1.0 minute, 17.81 seconds :: 
 :: START subset: validation | since_start: 3.0 minutes, 2.11 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/validation/20250920_132228_00047_f5ujv_4e752d6f-1519-46fc-80ce-2409c9c4e0ad.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e752d6f-1519-46fc-80ce-2409c9c4e0ad:  17%|#7        | 3.67MB / 21.4MB            

path_in_repo data/validation/20250920_132228_00047_f5ujv_56a2e07a-a61e-4953-bb30-d6b78557a526.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6a2e07a-a61e-4953-bb30-d6b78557a526:  17%|#7        | 3.67MB / 21.1MB            

path_in_repo data/validation/20250920_132228_00047_f5ujv_68dc0697-34a4-4f52-aac3-e56b79d1721d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8dc0697-34a4-4f52-aac3-e56b79d1721d:  19%|#9        | 3.67MB / 19.0MB            

path_in_repo data/validation/20250920_132228_00047_f5ujv_80281d63-485f-4647-a88e-a49ea46b1895.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...0281d63-485f-4647-a88e-a49ea46b1895:  20%|##        | 3.67MB / 18.3MB            

path_in_repo data/validation/20250920_132228_00047_f5ujv_985a6f74-69ed-4a56-952c-a5fd56718f67.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...85a6f74-69ed-4a56-952c-a5fd56718f67:  22%|##2       | 3.67MB / 16.7MB            

path_in_repo data/validation/20250920_132228_00047_f5ujv_9959a542-3734-406e-8d22-b9250363c68b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...959a542-3734-406e-8d22-b9250363c68b:  22%|##2       | 3.67MB / 16.7MB            

 :: END subset: validation | since_start: 4.0 minutes, 19.14 seconds | since_last: 1.0 minute, 17.02 seconds :: 


In [6]:
from datasets import load_dataset, DatasetDict, Dataset
def filter_not_null(example, column_name):
    """Return True when ``example[column_name]`` holds a value other than None.

    Intended as a row predicate: ``example`` is indexed by ``column_name``, so a
    missing key raises whatever the container raises (``KeyError`` for a dict).
    Falsy-but-present values such as ``''`` or ``0`` are kept.
    """
    value = example[column_name]
    return value is not None

# Hub repository holding the semibalanced dataset; later cells push their
# derived configs back to this same repo id.
dataset_name = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'

# Load the 'default' config; the result is displayed as the cell output.
dataset = load_dataset(path=dataset_name, name='default')
dataset

data/train/20250920_132133_00086_eq4yn_1(…):   0%|          | 0.00/150M [00:00<?, ?B/s]

data/train/20250920_132133_00086_eq4yn_2(…):   0%|          | 0.00/152M [00:00<?, ?B/s]

data/train/20250920_132133_00086_eq4yn_9(…):   0%|          | 0.00/145M [00:00<?, ?B/s]

data/train/20250920_132133_00086_eq4yn_b(…):   0%|          | 0.00/153M [00:00<?, ?B/s]

data/train/20250920_132133_00086_eq4yn_b(…):   0%|          | 0.00/158M [00:00<?, ?B/s]

data/train/20250920_132133_00086_eq4yn_e(…):   0%|          | 0.00/146M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/21.4M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/21.1M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/19.0M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/18.3M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/16.7M [00:00<?, ?B/s]

data/validation/20250920_132228_00047_f5(…):   0%|          | 0.00/16.7M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_1b(…):   0%|          | 0.00/17.3M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_26(…):   0%|          | 0.00/16.4M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_5b(…):   0%|          | 0.00/19.1M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_88(…):   0%|          | 0.00/17.5M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_cf(…):   0%|          | 0.00/22.6M [00:00<?, ?B/s]

data/test/20250920_132214_00047_6sgbt_d3(…):   0%|          | 0.00/20.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 86461
    })
    validation: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 10850
    })
    test: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 10863
    })
})

In [7]:
# Build and publish the 'title_subfield' config: keep only the `title` and
# `subfield_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_title_subfieldindex = (
    dataset
    .select_columns(['title', 'subfield_index'])
    .filter(lambda row: filter_not_null(row, 'title') and filter_not_null(row, 'subfield_index'))
)
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_title_subfieldindex.push_to_hub(dataset_name, 'title_subfield', num_proc=4)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/0ce60be47a170cb34b8da24b32b6329296eda17b', commit_message='Upload dataset', commit_description='', oid='0ce60be47a170cb34b8da24b32b6329296eda17b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)

In [8]:
# Build and publish the 'title_topic' config: keep only the `title` and
# `topic_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_title_topicindex = (
    dataset
    .select_columns(['title', 'topic_index'])
    .filter(lambda row: filter_not_null(row, 'title') and filter_not_null(row, 'topic_index'))
)
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_title_topicindex.push_to_hub(dataset_name, 'title_topic', num_proc=4)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/35f34291d8a58a9e948d323a90697823c2af927e', commit_message='Upload dataset', commit_description='', oid='35f34291d8a58a9e948d323a90697823c2af927e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)

In [9]:
# Build and publish the 'abstract_subfield' config: keep only the `abstract` and
# `subfield_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_abstract_subfieldindex = (
    dataset
    .select_columns(['abstract', 'subfield_index'])
    .filter(lambda row: filter_not_null(row, 'abstract') and filter_not_null(row, 'subfield_index'))
)
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_abstract_subfieldindex.push_to_hub(dataset_name, 'abstract_subfield', num_proc=4)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/22ccf6a528457e9688e2082abea2ef749bfe69f8', commit_message='Upload dataset', commit_description='', oid='22ccf6a528457e9688e2082abea2ef749bfe69f8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)

In [10]:
# Build and publish the 'abstract_topic' config: keep only the `abstract` and
# `topic_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_abstract_topicindex = (
    dataset
    .select_columns(['abstract', 'topic_index'])
    .filter(lambda row: filter_not_null(row, 'abstract') and filter_not_null(row, 'topic_index'))
)
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_abstract_topicindex.push_to_hub(dataset_name, 'abstract_topic', num_proc=4)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=4):   0%|          | 0/4 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/77f539953bb6602997fb9845193ac5c34778a3d2', commit_message='Upload dataset', commit_description='', oid='77f539953bb6602997fb9845193ac5c34778a3d2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)

In [11]:
# Build and publish the 'fulltext_subfield' config: keep only the `fulltext` and
# `subfield_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_fulltext_subfieldindex = (
    dataset
    .select_columns(['fulltext', 'subfield_index'])
    .filter(lambda row: filter_not_null(row, 'fulltext') and filter_not_null(row, 'subfield_index'))
)
# num_proc=1 here, unlike the title/abstract cells which use 4; presumably because
# the fulltext shards are much larger -- confirm before raising it.
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_fulltext_subfieldindex.push_to_hub(dataset_name, 'fulltext_subfield', num_proc=1)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 94.4kB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  234MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  233MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 19.8kB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 8.13kB /  239MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  177MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  177MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/acb0798083ddc36a3378a8406cd8f095f4760561', commit_message='Upload dataset', commit_description='', oid='acb0798083ddc36a3378a8406cd8f095f4760561', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)

In [12]:
# Build and publish the 'fulltext_topic' config: keep only the `fulltext` and
# `topic_index` columns and drop every row where either of them is None.
# Both null checks run in a single filter pass instead of two consecutive
# passes over every split.
dataset_fulltext_topicindex = (
    dataset
    .select_columns(['fulltext', 'topic_index'])
    .filter(lambda row: filter_not_null(row, 'fulltext') and filter_not_null(row, 'topic_index'))
)
# num_proc=1 here, unlike the title/abstract cells which use 4; presumably because
# the fulltext shards are much larger -- confirm before raising it.
# Last expression of the cell: the returned CommitInfo is shown as the cell output.
dataset_fulltext_topicindex.push_to_hub(dataset_name, 'fulltext_topic', num_proc=1)

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Filter:   0%|          | 0/86461 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10850 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10863 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.4MB /  234MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.2MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.3MB /  233MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#4        | 33.5MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  14%|#3        | 33.4MB /  239MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  19%|#8        | 33.2MB /  177MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  19%|#8        | 33.4MB /  177MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced/commit/6ae2c4c90973bde039f5884358f11726ba008052', commit_message='Upload dataset', commit_description='', oid='6ae2c4c90973bde039f5884358f11726ba008052', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopicsSemibalanced'), pr_revision=None, pr_num=None)