In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Adding ../01_modules or ./01_modules to the system path so that we can load modules from 
# there as well
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
if os.path.exists(modules_path_in_dev):
    sys.path.append(modules_path_in_dev)
if os.path.exists(modules_path_in_prod):
    sys.path.append(modules_path_in_prod)


# # Jupyter only reads a local module the first time after 
# # kernel start. Re-running a cell with 
# # "from mymodulename import *" would not change
# # anything, even if the imported module has since changed.
# # As a workaround, we need to directly load the module, 
# # use importlib.reload to reload it and then import * 
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# TODO: reorganize

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
#import torch
import lighteval
import numpy as np

['sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_010872c6-1d30-414d-9a62-885c774541c8',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_2b9ae78e-0730-461d-8fc3-8228584affab',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_2dc4d8a0-e03c-4329-b431-38b90b6019f5',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_2eb1e912-14ea-4faf-bccd-9613ca499174',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_34b580a1-0657-4571-acee-77bc1d22dce2',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_3507606d-21f3-46c4-b1a7-d4572a214890',
 'sagemaker-research-methodology-extraction/01_data/03_core/unified_works/20250903_224748_00007_ingyc_3896e412-c136-41df-bb03-86f2d3fab168',
 'sagemaker-r

In [10]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
from huggingface_hub import create_repo, upload_file
from tqdm.auto import tqdm
import fsspec
fs = fsspec.filesystem('s3')  # s3 / gcs / abfs / adl / oci / ...
data_dir = 's3://sagemaker-research-methodology-extraction/01_data/03_core/unified_works/'
pattern = '*'
data_files = fs.glob(data_dir + pattern)
data_files
destination_dataset = 'SteveAKopias/testdataset' # SemanticScholarCSFullTextWithOpenAlexTopics
create_repo(destination_dataset, repo_type='dataset')
for data_file in tqdm(fs.glob(data_dir + pattern)):
    with fs.open(data_file) as fileobj:
        path_in_repo = data_file[len(data_dir):]
        print(path_in_repo)

  0%|          | 0/22 [00:00<?, ?it/s]

903_224748_00007_ingyc_010872c6-1d30-414d-9a62-885c774541c8
903_224748_00007_ingyc_2b9ae78e-0730-461d-8fc3-8228584affab
903_224748_00007_ingyc_2dc4d8a0-e03c-4329-b431-38b90b6019f5
903_224748_00007_ingyc_2eb1e912-14ea-4faf-bccd-9613ca499174
903_224748_00007_ingyc_34b580a1-0657-4571-acee-77bc1d22dce2
903_224748_00007_ingyc_3507606d-21f3-46c4-b1a7-d4572a214890
903_224748_00007_ingyc_3896e412-c136-41df-bb03-86f2d3fab168
903_224748_00007_ingyc_4a65b726-fdea-4a0d-af1a-75e5e4fe387c
903_224748_00007_ingyc_5dd378fc-29d9-40af-8dfd-700cbfe1d32c
903_224748_00007_ingyc_891898fe-7070-4c31-82dc-6e2a6a2ab8aa
903_224748_00007_ingyc_912d4d8f-c14a-4c4f-a76f-6363ad383e0a
903_224748_00007_ingyc_aa08ef98-43ba-40f9-b79c-1eb2132ed35f
903_224748_00007_ingyc_b71caa74-2cc7-4ecf-9865-f031aba1e2ce
903_224748_00007_ingyc_cbae56f4-4c41-49a3-86a2-12a060415250
903_224748_00007_ingyc_d20df9b4-e892-4c55-b6c9-19ff8210bc9a
903_224748_00007_ingyc_db9986cc-7640-4730-9d24-85696f13bd3c
903_224748_00007_ingyc_e01b8ad2-7f08-485

In [4]:
# dataset = load_dataset('shawhin/imdb-truncated')
# dataset