In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role
from huggingface_hub import create_repo, upload_file, login
from tqdm.auto import tqdm
import fsspec


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
_ = """
fs = fsspec.filesystem('s3')  # s3 / gcs / abfs / adl / oci / ...
subset = 'test'
data_dir = f's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_{subset}/'
pattern = '*'
data_files = fs.glob(data_dir + pattern)

utils.ensure_path('temp_files')
# create_repo(destination_dataset, repo_type='dataset')
for data_file in tqdm(fs.glob(data_dir + pattern)):
    with fs.open(data_file, 'rb') as fileobj:
        path_in_repo = f'data/{subset}/{data_file[len(data_dir):]}'
        print(path_in_repo, type(fileobj))
        upload_file(
            path_or_fileobj=fileobj,
            path_in_repo=path_in_repo,
            repo_id=destination_dataset,
            repo_type="dataset",
        )
"""

In [9]:
_ = """
import smart_open

bucket = 'sagemaker-research-methodology-extraction'
subset = 'test'
data_dir = f's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works_{subset}/'
for key, content in s3.iter_bucket(bucket, prefix=data_dir, workers=8):
"""

In [10]:
import boto3
import os
from pathlib import Path
import time

s3_client = boto3.client('s3')
bucket_name = 'sagemaker-research-methodology-extraction'
timelogger = utils.TimeLogger()
destination_dataset = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'
for subset in ['train', 'test', 'validation']:
    timelogger.log(f'START subset: {subset}')
    s3_folder_prefix = f'01_data/03_core/unified_works_{subset}'
    local_download_path = 'temp_files'
    utils.ensure_path(local_download_path)

    paginator = s3_client.get_paginator('list_objects_v2')
    file_entry_pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix)

    for file_entry_page in file_entry_pages:
        if 'Contents' in file_entry_page:
            for file_entry in file_entry_page['Contents']:
                s3_key = file_entry['Key']
                relative_path = Path(s3_key).relative_to(s3_folder_prefix)
                path_in_repo = f'data/{subset}/{relative_path}.parquet'
                print('path_in_repo', path_in_repo)
                local_file_path = Path(local_download_path) / subset / relative_path

                local_file_path.parent.mkdir(parents=True, exist_ok=True)
                # print('relative_path.resolve()', str(local_file_path.resolve()), type(local_file_path.resolve()))

                s3_client.download_file(bucket_name, s3_key, str(local_file_path))
                # print(f"{subset}: Downloaded {s3_key} to {local_file_path}")
                upload_result = upload_file(
                    path_or_fileobj=local_file_path,
                    path_in_repo=path_in_repo,
                    repo_id=destination_dataset,
                    repo_type="dataset",
                )
                # print(f"{subset}: Uploaded {s3_key} to {path_in_repo}")
                if '/home/sagemaker-user/research_methodology_extraction/src/04_transformation/temp_files/' in str(local_file_path.resolve()):
                    os.remove(local_file_path)
                    # print(f"{subset}: Deleted {s3_key} at {local_file_path}")
                time.sleep(10)
    timelogger.log(f'END subset: {subset}')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
 :: START subset: train | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/train/20250904_234024_00199_vpvv2_03ed3d45-472c-46a2-89d2-f53707dacbe5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...3ed3d45-472c-46a2-89d2-f53707dacbe5:   0%|          |  524kB /  154MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_04d8da34-f1d0-4570-9f40-9d5aef04cc2f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4d8da34-f1d0-4570-9f40-9d5aef04cc2f:   2%|2         | 3.67MB /  155MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_053fac04-dd34-4aef-95d6-04b34fcbf590.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...53fac04-dd34-4aef-95d6-04b34fcbf590:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_0fb7ddc7-15e1-46c2-8764-ba9b062e31dc.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...fb7ddc7-15e1-46c2-8764-ba9b062e31dc:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_15f74530-b14c-45b3-9880-39d5720074b4.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...5f74530-b14c-45b3-9880-39d5720074b4:   2%|2         | 3.67MB /  155MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_3949d684-e65e-42d5-9976-4fbd9f48090d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...949d684-e65e-42d5-9976-4fbd9f48090d:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_3b922df1-ccbf-4853-9b31-d58de8130ed9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...b922df1-ccbf-4853-9b31-d58de8130ed9:   2%|2         | 3.67MB /  166MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_6038e1cf-eb4e-4e8a-8a76-38bc943a82a5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...038e1cf-eb4e-4e8a-8a76-38bc943a82a5:   2%|2         | 3.67MB /  160MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_759e7a36-104b-498f-b3e5-46dab315e706.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...59e7a36-104b-498f-b3e5-46dab315e706:   2%|2         | 3.67MB /  165MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_7f04693e-0341-4cd0-9f3a-6fdc0570470d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f04693e-0341-4cd0-9f3a-6fdc0570470d:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_82628148-b844-4de2-b0cf-4d1ebab730a5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2628148-b844-4de2-b0cf-4d1ebab730a5:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_98e7da39-504f-4845-a204-98467490112b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8e7da39-504f-4845-a204-98467490112b:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_995cdbf0-6025-44de-836b-13160d13cfe8.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...95cdbf0-6025-44de-836b-13160d13cfe8:   0%|          |  131kB /  155MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_a196f10b-1288-46df-a3c8-e1ccc6394036.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...196f10b-1288-46df-a3c8-e1ccc6394036:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_a7713f50-59d2-47e9-a384-436450090f9f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...7713f50-59d2-47e9-a384-436450090f9f:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_a963ac94-b644-49e3-9b60-1ac02ea4406d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...963ac94-b644-49e3-9b60-1ac02ea4406d:   2%|2         | 3.67MB /  155MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_ba800907-5676-4d28-aee3-bd8e294f1019.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a800907-5676-4d28-aee3-bd8e294f1019:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_be15828a-e778-4125-8d14-7c647535756a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e15828a-e778-4125-8d14-7c647535756a:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_c5183ca4-55fa-4c8f-87ef-e593ca4c573e.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...5183ca4-55fa-4c8f-87ef-e593ca4c573e:   2%|2         | 3.67MB /  155MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_c926669b-7e59-4826-b31a-b5deae125981.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...926669b-7e59-4826-b31a-b5deae125981:   2%|2         | 3.67MB /  152MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_e11d9808-a1fa-4ffe-ad59-6aff27f542d9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...11d9808-a1fa-4ffe-ad59-6aff27f542d9:   2%|2         | 3.67MB /  153MB            

path_in_repo data/train/20250904_234024_00199_vpvv2_e500a727-f3fe-4af6-8b30-5a366e907b69.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...500a727-f3fe-4af6-8b30-5a366e907b69:   2%|2         | 3.67MB /  155MB            

 :: END subset: train | since_start: 6.0 minutes, 26.34 seconds | since_last: 6.0 minutes, 26.34 seconds :: 
 :: START subset: test | since_start: 6.0 minutes, 26.34 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/test/20250904_234106_00279_uggtr_0047265d-cb10-4193-b934-761de9c15dd5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...047265d-cb10-4193-b934-761de9c15dd5:  17%|#6        | 3.67MB / 21.8MB            

path_in_repo data/test/20250904_234106_00279_uggtr_0e0ffbe0-11ae-4b2c-9fda-c548472802b2.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e0ffbe0-11ae-4b2c-9fda-c548472802b2:  16%|#6        | 3.67MB / 22.5MB            

path_in_repo data/test/20250904_234106_00279_uggtr_24860b0a-2255-4219-a071-33ab587aa107.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4860b0a-2255-4219-a071-33ab587aa107:  17%|#7        | 3.68MB / 21.4MB            

path_in_repo data/test/20250904_234106_00279_uggtr_2cb21912-20a1-490c-b158-2e23af73456e.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...cb21912-20a1-490c-b158-2e23af73456e:  20%|##        | 3.67MB / 18.3MB            

path_in_repo data/test/20250904_234106_00279_uggtr_44761367-f198-4188-9981-c66b9bab051e.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4761367-f198-4188-9981-c66b9bab051e:  21%|##1       | 3.67MB / 17.4MB            

path_in_repo data/test/20250904_234106_00279_uggtr_58468fc1-7d10-4f42-9d3c-876a034922b7.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8468fc1-7d10-4f42-9d3c-876a034922b7:  19%|#8        | 3.67MB / 19.5MB            

path_in_repo data/test/20250904_234106_00279_uggtr_5cc88fae-6466-4568-b4ca-3ad31ef34109.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...cc88fae-6466-4568-b4ca-3ad31ef34109:  18%|#8        | 3.67MB / 20.2MB            

path_in_repo data/test/20250904_234106_00279_uggtr_6b538d6a-47d3-4ca9-9449-f59f22d17326.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...b538d6a-47d3-4ca9-9449-f59f22d17326:  17%|#6        | 3.67MB / 21.7MB            

path_in_repo data/test/20250904_234106_00279_uggtr_6f793f72-5205-4a3c-8071-d894761752f8.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f793f72-5205-4a3c-8071-d894761752f8:  20%|##        | 3.67MB / 18.0MB            

path_in_repo data/test/20250904_234106_00279_uggtr_75be7cdb-a866-4386-80de-17d51046e80b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...5be7cdb-a866-4386-80de-17d51046e80b:  22%|##1       | 3.67MB / 17.0MB            

path_in_repo data/test/20250904_234106_00279_uggtr_78468385-bb35-443b-a221-0918eeae503f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8468385-bb35-443b-a221-0918eeae503f:  19%|#9        | 3.67MB / 19.2MB            

path_in_repo data/test/20250904_234106_00279_uggtr_7ee73fea-7782-4f6e-83f7-300743b102ad.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ee73fea-7782-4f6e-83f7-300743b102ad:  17%|#6        | 3.67MB / 22.1MB            

path_in_repo data/test/20250904_234106_00279_uggtr_80178432-5db3-46c9-a9eb-5fa20fa21cf9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...0178432-5db3-46c9-a9eb-5fa20fa21cf9:   3%|2         |  524kB / 20.7MB            

path_in_repo data/test/20250904_234106_00279_uggtr_a5c371a6-f195-4f3c-a697-5430c1238be6.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...5c371a6-f195-4f3c-a697-5430c1238be6:  18%|#8        | 3.67MB / 20.3MB            

path_in_repo data/test/20250904_234106_00279_uggtr_a6186b47-d241-47e1-a39f-baaa18ca3738.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6186b47-d241-47e1-a39f-baaa18ca3738:  20%|##        | 3.67MB / 18.2MB            

path_in_repo data/test/20250904_234106_00279_uggtr_aebce86c-564b-45f5-9872-d248270da0f5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ebce86c-564b-45f5-9872-d248270da0f5:  19%|#8        | 3.67MB / 19.4MB            

path_in_repo data/test/20250904_234106_00279_uggtr_b2ddadc6-f0e3-44d5-9e32-9a9a963c4dab.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2ddadc6-f0e3-44d5-9e32-9a9a963c4dab:  18%|#7        | 3.67MB / 20.7MB            

path_in_repo data/test/20250904_234106_00279_uggtr_b740708c-20d4-40e2-8db2-3866eac95bd2.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...740708c-20d4-40e2-8db2-3866eac95bd2:  20%|##        | 3.67MB / 18.1MB            

path_in_repo data/test/20250904_234106_00279_uggtr_c26e1ff6-ae65-4b5c-b843-ab4a761c466c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...26e1ff6-ae65-4b5c-b843-ab4a761c466c:  20%|#9        | 3.67MB / 18.5MB            

path_in_repo data/test/20250904_234106_00279_uggtr_cbb2cd11-c393-4ed8-a488-cf3a5506e704.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...bb2cd11-c393-4ed8-a488-cf3a5506e704:  20%|#9        | 3.67MB / 18.4MB            

path_in_repo data/test/20250904_234106_00279_uggtr_cfa86576-dd5b-4038-bf78-a213fe6a88ad.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...fa86576-dd5b-4038-bf78-a213fe6a88ad:  17%|#7        | 3.67MB / 21.6MB            

path_in_repo data/test/20250904_234106_00279_uggtr_d949b83e-2537-40b2-a4fa-dffbeff72975.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...949b83e-2537-40b2-a4fa-dffbeff72975:  21%|##        | 3.67MB / 17.8MB            

 :: END subset: test | since_start: 11.0 minutes, 14.70 seconds | since_last: 4.0 minutes, 48.36 seconds :: 
 :: START subset: validation | since_start: 11.0 minutes, 14.70 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/validation/20250904_234123_00159_rh4vh_112024de-4e21-4ed1-90c6-31caec405cba.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...12024de-4e21-4ed1-90c6-31caec405cba:  23%|##2       | 3.67MB / 16.3MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_210108bf-92fa-41e0-915e-2cd0427ad1c5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...10108bf-92fa-41e0-915e-2cd0427ad1c5:  18%|#8        | 3.67MB / 20.2MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_2beed655-521b-459a-b04b-5c25f74dfe58.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...beed655-521b-459a-b04b-5c25f74dfe58:  18%|#7        | 3.67MB / 20.5MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_34ab1883-fd1f-4c84-84ac-337e1f7f079d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4ab1883-fd1f-4c84-84ac-337e1f7f079d:  19%|#8        | 3.67MB / 19.6MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_3ac9df0a-d121-45e3-805f-eac4b6e63bfd.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ac9df0a-d121-45e3-805f-eac4b6e63bfd:  18%|#8        | 3.67MB / 20.0MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_4209b512-cc61-472f-a13f-6caf732dd4a4.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...209b512-cc61-472f-a13f-6caf732dd4a4:   3%|2         |  524kB / 19.5MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_65d865e9-5430-438a-97df-6219367064c1.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...5d865e9-5430-438a-97df-6219367064c1:  18%|#7        | 3.67MB / 20.9MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_6bcf208c-a8fc-44d3-b82e-aa47f64b8206.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...bcf208c-a8fc-44d3-b82e-aa47f64b8206:  19%|#8        | 3.67MB / 19.5MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_7c41f080-f6f1-4674-82c5-7de52c055fb5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...c41f080-f6f1-4674-82c5-7de52c055fb5:  17%|#7        | 3.67MB / 21.2MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_7dfcfd92-1ab5-490e-a18f-e00d6b0a0a76.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...dfcfd92-1ab5-490e-a18f-e00d6b0a0a76:  17%|#7        | 3.67MB / 21.6MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_8293aa8c-18b5-4522-8b42-47480cfbf82d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...293aa8c-18b5-4522-8b42-47480cfbf82d:  19%|#9        | 3.67MB / 19.1MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_829b7b01-09a3-4d06-bfbf-cb36e4160330.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...29b7b01-09a3-4d06-bfbf-cb36e4160330:  16%|#6        | 3.67MB / 22.8MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_8f502bf0-0b9b-4d2b-881b-e5c3bc9c9cb1.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f502bf0-0b9b-4d2b-881b-e5c3bc9c9cb1:  19%|#9        | 3.67MB / 19.1MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_92441814-2418-479c-a86a-e4d64903d45b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2441814-2418-479c-a86a-e4d64903d45b:  16%|#5        | 3.67MB / 23.1MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_b21f820e-3e05-48b9-8658-d1c790347336.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...21f820e-3e05-48b9-8658-d1c790347336:  22%|##1       | 3.67MB / 17.0MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_b791bda4-6120-48ab-ba8a-c06af988d314.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...791bda4-6120-48ab-ba8a-c06af988d314:  18%|#7        | 3.67MB / 21.0MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_c2a74129-fa34-4945-b394-b5a6a8f297ee.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2a74129-fa34-4945-b394-b5a6a8f297ee:  18%|#8        | 3.67MB / 19.9MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_c946eec2-3c22-414a-a26a-4c5f5503277c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...946eec2-3c22-414a-a26a-4c5f5503277c:  18%|#8        | 3.67MB / 20.3MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_de9c9a29-eeae-4367-98a9-21d900df7ec8.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e9c9a29-eeae-4367-98a9-21d900df7ec8:   3%|3         |  525kB / 17.4MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_e69a1c44-fb49-4f56-ba1c-c3b1e30b8b49.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...69a1c44-fb49-4f56-ba1c-c3b1e30b8b49:  22%|##2       | 3.67MB / 16.6MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_f4138a7b-4cf3-4c8f-a3bd-fe913f7dfe26.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4138a7b-4cf3-4c8f-a3bd-fe913f7dfe26:  18%|#7        | 3.67MB / 20.8MB            

path_in_repo data/validation/20250904_234123_00159_rh4vh_f56b0dcb-febb-4f5b-a9f0-4c9f1c856e21.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...56b0dcb-febb-4f5b-a9f0-4c9f1c856e21:  23%|##2       | 3.67MB / 16.2MB            

 :: END subset: validation | since_start: 16.0 minutes, 1.87 seconds | since_last: 4.0 minutes, 47.17 seconds :: 


In [5]:
from datasets import load_dataset, DatasetDict, Dataset
dataset_name = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'
dataset = load_dataset(dataset_name)
dataset

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 346344
    })
    validation: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 43439
    })
    test: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 43478
    })
})

In [6]:
def filter_not_null(example, column_name):
    return example[column_name] is not None

In [7]:
dataset_title_subfieldindex = dataset.select_columns(['title', 'subfield_index'])
dataset_title_subfieldindex = dataset_title_subfieldindex.filter(lambda x: filter_not_null(x, 'title'))
dataset_title_subfieldindex = dataset_title_subfieldindex.filter(lambda x: filter_not_null(x, 'subfield_index'))
dataset_title_subfieldindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_title_subfield', num_proc=8)

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43478 [00:00<?, ? examples/s]

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43478 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_subfield/commit/44b6353afce6557021f730b131dd918197ba5194', commit_message='Upload dataset', commit_description='', oid='44b6353afce6557021f730b131dd918197ba5194', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_subfield', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_subfield'), pr_revision=None, pr_num=None)

In [8]:
dataset_title_topicindex = dataset.select_columns(['title', 'topic_index'])
dataset_title_topicindex = dataset_title_topicindex.filter(lambda x: filter_not_null(x, 'title'))
dataset_title_topicindex = dataset_title_topicindex.filter(lambda x: filter_not_null(x, 'topic_index'))
dataset_title_topicindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_title_topic', num_proc=8)

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43478 [00:00<?, ? examples/s]

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43478 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_topic/commit/fd46b397eb1721a6a8d9faa34e3fc6f946c41b7c', commit_message='Upload dataset', commit_description='', oid='fd46b397eb1721a6a8d9faa34e3fc6f946c41b7c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_topic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_title_topic'), pr_revision=None, pr_num=None)

In [10]:
dataset_abstract_subfieldindex = dataset.select_columns(['abstract', 'subfield_index'])
dataset_abstract_subfieldindex = dataset_abstract_subfieldindex.filter(lambda x: filter_not_null(x, 'abstract'))
dataset_abstract_subfieldindex = dataset_abstract_subfieldindex.filter(lambda x: filter_not_null(x, 'subfield_index'))
dataset_abstract_subfieldindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_abstract_subfield', num_proc=8)

dataset_abstract_topicindex = dataset.select_columns(['abstract', 'topic_index'])
dataset_abstract_topicindex = dataset_abstract_topicindex.filter(lambda x: filter_not_null(x, 'abstract'))
dataset_abstract_topicindex = dataset_abstract_topicindex.filter(lambda x: filter_not_null(x, 'topic_index'))
dataset_abstract_topicindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_abstract_topic', num_proc=8)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43478 [00:00<?, ? examples/s]

Filter:   0%|          | 0/340161 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42640 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42715 [00:00<?, ? examples/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_abstract_topic/commit/74a8b8ea0fb4fec89fc22e9ef5ede553b50cee51', commit_message='Upload dataset', commit_description='', oid='74a8b8ea0fb4fec89fc22e9ef5ede553b50cee51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_abstract_topic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics_abstract_topic'), pr_revision=None, pr_num=None)

In [None]:
dataset_fulltext_subfieldindex = dataset.select_columns(['fulltext', 'subfield_index'])
dataset_fulltext_subfieldindex = dataset_fulltext_subfieldindex.filter(lambda x: filter_not_null(x, 'fulltext'))
dataset_fulltext_subfieldindex = dataset_fulltext_subfieldindex.filter(lambda x: filter_not_null(x, 'subfield_index'))
dataset_fulltext_subfieldindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_fulltext_subfield')

Filter:   0%|          | 0/346344 [00:00<?, ? examples/s]

In [None]:
dataset_fulltext_topicindex = dataset.select_columns(['fulltext', 'topic_index'])
dataset_fulltext_topicindex = dataset_fulltext_topicindex.filter(lambda x: filter_not_null(x, 'fulltext'))
dataset_fulltext_topicindex = dataset_fulltext_topicindex.filter(lambda x: filter_not_null(x, 'topic_index'))
dataset_fulltext_topicindex.push_to_hub('SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'+'_fulltext_topic')