In [6]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import huggingface_hub
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role
from tqdm.auto import tqdm
import fsspec


# Make the project's helper modules importable by putting ../01_modules
# (development layout) and/or ./01_modules (production layout) on sys.path.
# When run as a script, paths are anchored at the script's directory; in a
# notebook `__file__` is not defined, so the current working directory is used.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for modules_path in (modules_path_in_dev, modules_path_in_prod):
    # The membership check keeps this cell idempotent: re-running it in a live
    # kernel must not pile up duplicate entries on sys.path.
    if os.path.exists(modules_path) and modules_path not in sys.path:
        sys.path.append(modules_path)


# Python caches a module the first time it is imported after kernel start,
# so re-running a plain `import` in a live kernel does not pick up edits
# that were made to the module file in the meantime.
# As a workaround, each local module is imported and then immediately
# re-executed with importlib.reload, so the kernel always sees the current
# source. The result is bound to `_` so the cell does not display the
# module repr as its output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


utils.py loaded: v0.2.12
config.py loaded: v0.1


In [7]:
# Authenticate this session against the Hugging Face Hub so that the upload
# and push_to_hub calls further down are authorised.
# NOTE(review): utils.get_secret presumably reads the 'HuggingFaceHub' secret
# from AWS Secrets Manager in config.AWS_REGION - confirm against utils.py.
hf_api_key = utils.get_secret(region_name=config.AWS_REGION, secret_name='HuggingFaceHub')['secret_token']
# `login` was never imported as a bare name in this notebook, so calling it
# directly raises NameError on a fresh kernel; go through the module instead.
huggingface_hub.login(token=hf_api_key)

In [9]:
import boto3
import os
from pathlib import Path
import time

# Copy every object of the train/test/validation splits from S3 into the
# Hugging Face dataset repository, one file at a time:
#   S3 object -> local scratch file -> huggingface_hub.upload_file -> delete scratch file
s3_client = boto3.client('s3')
bucket_name = 'sagemaker-research-methodology-extraction'
timelogger = utils.TimeLogger()
destination_dataset = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'
local_download_path = 'temp_files'
# Absolute form of the scratch directory. Cleanup only ever deletes files that
# resolve to a location inside this directory, wherever the notebook is run
# (the previous check was tied to one absolute SageMaker path, so scratch
# files were silently kept on any other machine).
download_root = Path(local_download_path).resolve()
# Pause after each upload, in seconds.
# NOTE(review): presumably there to stay below Hub rate limits - confirm.
upload_pause_seconds = 10

for subset in ['train', 'test', 'validation']:
    timelogger.log(f'START subset: {subset}')
    s3_folder_prefix = f'01_data/03_core/unified_works_{subset}'
    utils.ensure_path(local_download_path)

    paginator = s3_client.get_paginator('list_objects_v2')
    # The trailing slash restricts the listing to this folder. Without it a
    # sibling such as 'unified_works_train_old/' would also match and make
    # relative_to() below raise ValueError.
    file_entry_pages = paginator.paginate(Bucket=bucket_name, Prefix=f'{s3_folder_prefix}/')

    for file_entry_page in file_entry_pages:
        # Pages of an empty listing carry no 'Contents' key.
        for file_entry in file_entry_page.get('Contents', []):
            s3_key = file_entry['Key']
            if s3_key.endswith('/'):
                # Zero-byte "folder" placeholder object, nothing to transfer.
                continue

            relative_path = Path(s3_key).relative_to(s3_folder_prefix)
            # Objects without an extension get '.parquet' appended for the repo
            # path; keys that already end in '.parquet' are not suffixed twice.
            repo_file_name = relative_path.as_posix()
            if not repo_file_name.endswith('.parquet'):
                repo_file_name = f'{repo_file_name}.parquet'
            path_in_repo = f'data/{subset}/{repo_file_name}'
            print('path_in_repo', path_in_repo)

            local_file_path = Path(local_download_path) / subset / relative_path
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            try:
                s3_client.download_file(bucket_name, s3_key, str(local_file_path))
                upload_result = huggingface_hub.upload_file(
                    path_or_fileobj=local_file_path,
                    path_in_repo=path_in_repo,
                    repo_id=destination_dataset,
                    repo_type="dataset",
                )
            finally:
                # Always free the disk space, even when the download or upload
                # failed, but never delete anything outside the scratch directory.
                if download_root in local_file_path.resolve().parents and local_file_path.exists():
                    os.remove(local_file_path)
            time.sleep(upload_pause_seconds)
    timelogger.log(f'END subset: {subset}')

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
 :: START subset: train | since_start: 0.00 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/train/20250916_180551_00015_m9m25_0882337b-8fca-49fb-aced-4aff00b2008a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...882337b-8fca-49fb-aced-4aff00b2008a:   0%|          |  524kB /  159MB            

path_in_repo data/train/20250916_180551_00015_m9m25_1e45f779-5735-4913-a61f-6d51a3a800d1.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e45f779-5735-4913-a61f-6d51a3a800d1:   2%|2         | 3.67MB /  159MB            

path_in_repo data/train/20250916_180551_00015_m9m25_27056962-2577-4e2b-b25e-dd6aa05b5d7d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...7056962-2577-4e2b-b25e-dd6aa05b5d7d:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250916_180551_00015_m9m25_310fb762-ee1c-4a2a-b2a0-d174efea696a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...10fb762-ee1c-4a2a-b2a0-d174efea696a:   2%|2         | 3.67MB /  160MB            

path_in_repo data/train/20250916_180551_00015_m9m25_3113264d-0a2f-4f93-aedc-4a6ffa91b5f7.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...113264d-0a2f-4f93-aedc-4a6ffa91b5f7:   2%|2         | 3.67MB /  169MB            

path_in_repo data/train/20250916_180551_00015_m9m25_395fa77f-eb37-4e3d-916a-b6b42fdf3cb2.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...95fa77f-eb37-4e3d-916a-b6b42fdf3cb2:   2%|2         | 3.67MB /  159MB            

path_in_repo data/train/20250916_180551_00015_m9m25_3e7e64a6-8fde-45b8-bb20-2e63361cad1b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e7e64a6-8fde-45b8-bb20-2e63361cad1b:   2%|2         | 3.67MB /  160MB            

path_in_repo data/train/20250916_180551_00015_m9m25_68fe6222-1c4d-4992-b273-65d1a6221590.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8fe6222-1c4d-4992-b273-65d1a6221590:   2%|2         | 3.67MB /  160MB            

path_in_repo data/train/20250916_180551_00015_m9m25_742129af-1c5a-4f9e-8ca8-e12065e3f140.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...42129af-1c5a-4f9e-8ca8-e12065e3f140:   2%|2         | 3.67MB /  154MB            

path_in_repo data/train/20250916_180551_00015_m9m25_778d8a94-dda8-44be-8187-052e5d1c85a7.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...78d8a94-dda8-44be-8187-052e5d1c85a7:   2%|2         | 3.67MB /  161MB            

path_in_repo data/train/20250916_180551_00015_m9m25_789b46fc-e866-4b11-967a-0d7667fa6816.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...89b46fc-e866-4b11-967a-0d7667fa6816:   2%|2         | 3.67MB /  163MB            

path_in_repo data/train/20250916_180551_00015_m9m25_90bcc1f8-18ac-4a62-b2a3-b3aeb2c8eb66.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...0bcc1f8-18ac-4a62-b2a3-b3aeb2c8eb66:   2%|2         | 3.67MB /  156MB            

path_in_repo data/train/20250916_180551_00015_m9m25_b2a408bb-a10e-4de2-a475-577e86a586aa.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...2a408bb-a10e-4de2-a475-577e86a586aa:   2%|2         | 3.67MB /  160MB            

path_in_repo data/train/20250916_180551_00015_m9m25_c3c4586e-f3c2-4dd5-8780-574210cf064d.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...3c4586e-f3c2-4dd5-8780-574210cf064d:   2%|2         | 3.67MB /  157MB            

path_in_repo data/train/20250916_180551_00015_m9m25_ca4b3167-b3d9-45e8-b8c6-1dd5954ee850.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a4b3167-b3d9-45e8-b8c6-1dd5954ee850:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250916_180551_00015_m9m25_ce12d509-0e34-47df-b2be-3fabf77ca245.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...e12d509-0e34-47df-b2be-3fabf77ca245:   2%|2         | 3.67MB /  155MB            

path_in_repo data/train/20250916_180551_00015_m9m25_dd154dfd-9831-41c4-9682-c1b9fd35eaf9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d154dfd-9831-41c4-9682-c1b9fd35eaf9:   2%|2         | 3.67MB /  158MB            

path_in_repo data/train/20250916_180551_00015_m9m25_e1491ec6-0637-497f-8192-0709c5cb01de.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...1491ec6-0637-497f-8192-0709c5cb01de:   2%|2         | 3.67MB /  161MB            

path_in_repo data/train/20250916_180551_00015_m9m25_f353fcaa-665c-4df0-8784-5f43d0fb1a8c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...353fcaa-665c-4df0-8784-5f43d0fb1a8c:   2%|2         | 3.67MB /  159MB            

path_in_repo data/train/20250916_180551_00015_m9m25_fd316e9f-c0f7-4b2e-b331-8340a7bafff9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d316e9f-c0f7-4b2e-b331-8340a7bafff9:   2%|2         | 3.67MB /  171MB            

 :: END subset: train | since_start: 5.0 minutes, 56.54 seconds | since_last: 5.0 minutes, 56.54 seconds :: 
 :: START subset: test | since_start: 5.0 minutes, 56.54 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/test/20250916_180635_00031_zx8g4_0c724e40-3c4c-4c15-8f7a-c73c136a248a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...c724e40-3c4c-4c15-8f7a-c73c136a248a:  21%|##        | 3.67MB / 17.7MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_0f7c6cfd-04d3-42bd-befc-f8112d82f61c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f7c6cfd-04d3-42bd-befc-f8112d82f61c:  20%|#9        | 3.67MB / 18.4MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_1f408785-6cf4-4dd7-b0d5-f85dca96084a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f408785-6cf4-4dd7-b0d5-f85dca96084a:  17%|#6        | 3.67MB / 21.7MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_21f6ab0d-cf0d-4f56-937e-45ec655bb635.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...1f6ab0d-cf0d-4f56-937e-45ec655bb635:  19%|#9        | 3.67MB / 18.9MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_2b2804db-aff1-49c6-8446-e7848e96b257.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...b2804db-aff1-49c6-8446-e7848e96b257:  20%|##        | 3.67MB / 18.3MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_393ff709-c8ca-49c9-bda3-d077b1b20875.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...93ff709-c8ca-49c9-bda3-d077b1b20875:  19%|#9        | 3.67MB / 19.2MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_3a7e6ec8-c129-4df5-b973-35102c082c5a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a7e6ec8-c129-4df5-b973-35102c082c5a:  20%|##        | 3.67MB / 18.1MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_3d201197-6b58-49bf-8c1d-5029c464a93a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d201197-6b58-49bf-8c1d-5029c464a93a:  19%|#8        | 3.67MB / 19.7MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_50ed302f-3077-49d0-9a58-e90a43c38264.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...0ed302f-3077-49d0-9a58-e90a43c38264:  16%|#5        | 3.67MB / 23.1MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_53636e53-7b0c-4d74-a7a7-740061cb38fd.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...3636e53-7b0c-4d74-a7a7-740061cb38fd:  18%|#8        | 3.67MB / 20.2MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_6597c88a-d774-40d1-943f-68d0b986df60.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...597c88a-d774-40d1-943f-68d0b986df60:  18%|#8        | 3.67MB / 20.3MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_87bd64e9-5551-4d0b-aa56-c57dd0995deb.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...7bd64e9-5551-4d0b-aa56-c57dd0995deb:  18%|#8        | 3.67MB / 20.0MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_96412875-f233-4789-8d48-4f1e6b1301b8.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6412875-f233-4789-8d48-4f1e6b1301b8:  20%|#9        | 3.67MB / 18.5MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_9d65628b-b41f-49d0-a886-bbcd41cd260f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d65628b-b41f-49d0-a886-bbcd41cd260f:  17%|#7        | 3.67MB / 21.5MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_a8ac64c7-9398-4c83-8631-2af821a07b46.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8ac64c7-9398-4c83-8631-2af821a07b46:  18%|#7        | 3.67MB / 20.6MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_ad765c96-3195-4997-b58a-c43324b9187f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d765c96-3195-4997-b58a-c43324b9187f:  16%|#6        | 3.67MB / 22.3MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_aff3ef96-f070-4535-8ab6-fb95f5b1524b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ff3ef96-f070-4535-8ab6-fb95f5b1524b:  19%|#8        | 3.67MB / 19.3MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_bd92f45e-236e-420f-8b24-6e32fe6bce0f.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d92f45e-236e-420f-8b24-6e32fe6bce0f:  17%|#7        | 3.67MB / 21.3MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_cbc715d3-5f90-4fc4-ac8f-b8bad00d70f3.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...bc715d3-5f90-4fc4-ac8f-b8bad00d70f3:  16%|#5        | 3.67MB / 23.2MB            

path_in_repo data/test/20250916_180635_00031_zx8g4_cfa4c48e-6f30-46a3-a9cb-b29e461369b0.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...fa4c48e-6f30-46a3-a9cb-b29e461369b0:  17%|#7        | 3.67MB / 21.3MB            

 :: END subset: test | since_start: 10.0 minutes, 27.67 seconds | since_last: 4.0 minutes, 31.13 seconds :: 
 :: START subset: validation | since_start: 10.0 minutes, 27.67 seconds | since_last: 0.00 seconds :: 
ensure_path(temp_files)
path_in_repo data/validation/20250916_180654_00055_aqyhz_3fa1c830-9df5-49db-a3c9-35db1ae25c19.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...fa1c830-9df5-49db-a3c9-35db1ae25c19:  19%|#8        | 3.67MB / 19.8MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_481dd389-2920-4179-aa00-d4434758dcc9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...81dd389-2920-4179-aa00-d4434758dcc9:  18%|#7        | 3.67MB / 20.6MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_520c466d-c926-408c-925a-79976287d128.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...20c466d-c926-408c-925a-79976287d128:  17%|#7        | 3.67MB / 21.3MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_54d839bd-2166-4b7d-bcf5-1d42bbfa5559.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4d839bd-2166-4b7d-bcf5-1d42bbfa5559:  17%|#7        | 3.67MB / 21.6MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_59191735-327d-46ad-91d5-35554b647348.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...9191735-327d-46ad-91d5-35554b647348:  18%|#7        | 3.67MB / 20.7MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_5a9d8044-a1d4-49bb-bb17-f80914825fe4.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a9d8044-a1d4-49bb-bb17-f80914825fe4:  18%|#8        | 3.67MB / 20.1MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_6059c126-3511-443b-8bb7-156b33f6822b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...059c126-3511-443b-8bb7-156b33f6822b:  22%|##2       | 3.67MB / 16.5MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_63376361-5835-428a-90f8-3f632c16028c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...3376361-5835-428a-90f8-3f632c16028c:  16%|#6        | 3.67MB / 22.7MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_734539b6-6247-4caa-a62e-e05926c457f5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...34539b6-6247-4caa-a62e-e05926c457f5:  19%|#8        | 3.67MB / 19.7MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_778fd06d-06c8-4d14-9b41-d1780d76772e.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...78fd06d-06c8-4d14-9b41-d1780d76772e:  17%|#7        | 3.67MB / 21.6MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_79056820-e455-4153-9648-79ce06220ed5.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...9056820-e455-4153-9648-79ce06220ed5:  18%|#8        | 3.67MB / 19.9MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_7cd1f9af-9c6e-4d9b-820f-7f7be2243089.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...cd1f9af-9c6e-4d9b-820f-7f7be2243089:  19%|#8        | 3.67MB / 19.8MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_86ad521f-0966-4f45-8b8d-0eaa7201160a.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6ad521f-0966-4f45-8b8d-0eaa7201160a:  20%|##        | 3.67MB / 18.0MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_94ee9729-acd2-45eb-ac51-8941acea8b5b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...4ee9729-acd2-45eb-ac51-8941acea8b5b:  17%|#7        | 3.67MB / 21.4MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_98a9634e-b17f-4d22-852e-f0926a82b697.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...8a9634e-b17f-4d22-852e-f0926a82b697:  21%|##1       | 3.67MB / 17.1MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_9c9cf8c5-e883-4183-9e9f-dd8580a67619.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...c9cf8c5-e883-4183-9e9f-dd8580a67619:  18%|#8        | 3.67MB / 19.8MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_bc9957f3-5b7a-4666-9c50-cb03f909073b.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...c9957f3-5b7a-4666-9c50-cb03f909073b:  19%|#8        | 3.67MB / 19.4MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_c6d693bf-db91-45d7-b4ae-227226352c87.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6d693bf-db91-45d7-b4ae-227226352c87:  21%|##        | 3.67MB / 17.6MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_dbcafd3e-f59c-4470-a732-72bf319cdee9.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...bcafd3e-f59c-4470-a732-72bf319cdee9:  19%|#8        | 3.67MB / 19.6MB            

path_in_repo data/validation/20250916_180654_00055_aqyhz_f7d060bb-d38f-409e-a953-fde1127f458c.parquet


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...7d060bb-d38f-409e-a953-fde1127f458c:  16%|#6        | 3.67MB / 22.3MB            

 :: END subset: validation | since_start: 15.0 minutes, 5.79 seconds | since_last: 4.0 minutes, 38.12 seconds :: 


In [10]:
from datasets import Dataset, DatasetDict, load_dataset

# Repository on the Hugging Face Hub that the dataset is read from.
dataset_name = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'

# Load every split of the repository; the bare expression on the last line
# lets the notebook display the resulting object (features and row counts).
dataset = load_dataset(path=dataset_name)
dataset

README.md:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/20 [00:00<?, ?files/s]

data/train/20250916_180551_00015_m9m25_0(…):   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_1(…):   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_2(…):   0%|          | 0.00/158M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_3(…):   0%|          | 0.00/160M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_3(…):   0%|          | 0.00/169M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_3(…):   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_3(…):   0%|          | 0.00/160M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_6(…):   0%|          | 0.00/160M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_7(…):   0%|          | 0.00/154M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_7(…):   0%|          | 0.00/161M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_7(…):   0%|          | 0.00/163M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_9(…):   0%|          | 0.00/156M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_b(…):   0%|          | 0.00/160M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_c(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_c(…):   0%|          | 0.00/158M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_c(…):   0%|          | 0.00/155M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_d(…):   0%|          | 0.00/158M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_e(…):   0%|          | 0.00/161M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_f(…):   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train/20250916_180551_00015_m9m25_f(…):   0%|          | 0.00/171M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/20 [00:00<?, ?files/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/20.6M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/21.6M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/20.7M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/16.5M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.7M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/20.1M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/21.3M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.8M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/22.7M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.9M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.8M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/18.0M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.8M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/21.6M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/17.1M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/21.4M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.4M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/17.6M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/19.6M [00:00<?, ?B/s]

data/validation/20250916_180654_00055_aq(…):   0%|          | 0.00/22.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/20 [00:00<?, ?files/s]

data/test/20250916_180635_00031_zx8g4_39(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_1f(…):   0%|          | 0.00/21.7M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_65(…):   0%|          | 0.00/20.3M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_3d(…):   0%|          | 0.00/19.7M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_a8(…):   0%|          | 0.00/20.6M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_50(…):   0%|          | 0.00/23.1M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_0c(…):   0%|          | 0.00/17.7M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_ad(…):   0%|          | 0.00/22.3M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_53(…):   0%|          | 0.00/20.2M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_3a(…):   0%|          | 0.00/18.1M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_0f(…):   0%|          | 0.00/18.4M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_87(…):   0%|          | 0.00/20.0M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_9d(…):   0%|          | 0.00/21.5M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_2b(…):   0%|          | 0.00/18.3M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_96(…):   0%|          | 0.00/18.5M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_21(…):   0%|          | 0.00/18.9M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_af(…):   0%|          | 0.00/19.3M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_cb(…):   0%|          | 0.00/23.2M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_bd(…):   0%|          | 0.00/21.3M [00:00<?, ?B/s]

data/test/20250916_180635_00031_zx8g4_cf(…):   0%|          | 0.00/21.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 310751
    })
    validation: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 38967
    })
    test: Dataset({
        features: ['id', 'topic_index', 'topic_display_name', 'subfield_index', 'subfield_display_name', 'title', 'abstract', 'fulltext', 'subset'],
        num_rows: 39017
    })
})

In [12]:
def filter_not_null(example, column_name):
    """Row predicate used with ``.filter``: tell whether a field is populated.

    Args:
        example: A single record supporting ``example[column_name]`` lookup.
        column_name: Key of the field to inspect.

    Returns:
        ``False`` when the field is ``None``; ``True`` for every other value
        (empty strings and zeros count as populated).
    """
    value = example[column_name]
    if value is None:
        return False
    return True

In [18]:
# Two-column variant (title, subfield_index) of `dataset`, published to the Hub
# repository under the name 'title_subfield'.
# Only rows where BOTH columns are non-null are kept. The two checks are combined
# into one predicate so every split is scanned once instead of twice; the rows
# that survive (and their order) are the same as with two chained filters.
dataset_title_subfieldindex = dataset.select_columns(['title', 'subfield_index']).filter(
    lambda example: filter_not_null(example, 'title')
    and filter_not_null(example, 'subfield_index')
)
# Kept as the last expression so the returned CommitInfo is shown as cell output.
dataset_title_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'title_subfield',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/aa047a70d9ae6b554398ea952595097237c5ab96', commit_message='Upload dataset', commit_description='', oid='aa047a70d9ae6b554398ea952595097237c5ab96', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [19]:
# Two-column variant (title, topic_index) of `dataset`, published to the Hub
# repository under the name 'title_topic'.
# Only rows where BOTH columns are non-null are kept. The two checks are combined
# into one predicate so every split is scanned once instead of twice; the rows
# that survive (and their order) are the same as with two chained filters.
dataset_title_topicindex = dataset.select_columns(['title', 'topic_index']).filter(
    lambda example: filter_not_null(example, 'title')
    and filter_not_null(example, 'topic_index')
)
# Kept as the last expression so the returned CommitInfo is shown as cell output.
dataset_title_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'title_topic',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/d587f52ec5833bbd6eeb11d65dd13a6f72d65f27', commit_message='Upload dataset', commit_description='', oid='d587f52ec5833bbd6eeb11d65dd13a6f72d65f27', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [20]:
# Two abstract-based variants of `dataset`, each published to the Hub repository
# under its own name ('abstract_subfield' and 'abstract_topic').
# For each variant only rows where BOTH selected columns are non-null are kept.
# The two checks are combined into one predicate so every split is scanned once
# instead of twice; the surviving rows are the same as with two chained filters.

# (abstract, subfield_index)
dataset_abstract_subfieldindex = dataset.select_columns(['abstract', 'subfield_index']).filter(
    lambda example: filter_not_null(example, 'abstract')
    and filter_not_null(example, 'subfield_index')
)
dataset_abstract_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'abstract_subfield',
    num_proc=8,
)

# (abstract, topic_index)
dataset_abstract_topicindex = dataset.select_columns(['abstract', 'topic_index']).filter(
    lambda example: filter_not_null(example, 'abstract')
    and filter_not_null(example, 'topic_index')
)
# Kept as the last expression so the returned CommitInfo is shown as cell output.
dataset_abstract_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'abstract_topic',
    num_proc=8,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

  [2m2025-09-16T23:17:46.925451Z[0m [33m WARN[0m  [33mStatus Code: 504. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220



Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

Uploading the dataset shards (num_proc=8):   0%|          | 0/8 [00:00<?, ? shards/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/6e4e6688498883c6f0b8674a6d481bb9fb74a4b4', commit_message='Upload dataset', commit_description='', oid='6e4e6688498883c6f0b8674a6d481bb9fb74a4b4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [23]:
# Two-column variant (fulltext, subfield_index) of `dataset`, published to the Hub
# repository under the name 'fulltext_subfield'.
# Only rows where BOTH columns are non-null are kept. The two checks are combined
# into one predicate so every split is scanned once instead of twice; the rows
# that survive (and their order) are the same as with two chained filters.
dataset_fulltext_subfieldindex = dataset.select_columns(['fulltext', 'subfield_index']).filter(
    lambda example: filter_not_null(example, 'fulltext')
    and filter_not_null(example, 'subfield_index')
)
# Kept as the last expression so the returned CommitInfo is shown as cell output.
dataset_fulltext_subfieldindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'fulltext_subfield',
    num_proc=8,
)

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          |  524kB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  239MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  241MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.68MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  237MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 28.4kB /  235MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  238MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  236MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 30.0kB /  233MB            

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  240MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  206MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  206MB            

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  210MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 3.67MB /  209MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics/commit/8ae1ccebe3b0c5b79be723e3fdc84d9ce341147e', commit_message='Upload dataset', commit_description='', oid='8ae1ccebe3b0c5b79be723e3fdc84d9ce341147e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics'), pr_revision=None, pr_num=None)

In [None]:
# Two-column variant (fulltext, topic_index) of `dataset`, published to the Hub
# repository under the name 'fulltext_topic'.
# Only rows where BOTH columns are non-null are kept. The two checks are combined
# into one predicate so every split is scanned once instead of twice; the rows
# that survive (and their order) are the same as with two chained filters.
dataset_fulltext_topicindex = dataset.select_columns(['fulltext', 'topic_index']).filter(
    lambda example: filter_not_null(example, 'fulltext')
    and filter_not_null(example, 'topic_index')
)
# NOTE(review): num_proc=1 here, whereas the other variants upload with num_proc=8,
# and the saved output of this cell still reports num_proc=8 -- i.e. the cell has
# not been re-run since this value was changed. Confirm which setting is intended.
dataset_fulltext_topicindex.push_to_hub(
    'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics',
    'fulltext_topic',
    num_proc=1,
)

Uploading the dataset shards (num_proc=8):   0%|          | 0/21 [00:00<?, ? shards/s]