# 1. 데이터 정리

In [None]:
import boto3
# Name of the S3 bucket this notebook works in.
bucket = "sagemaker-jhgan-workspace"

# boto3 S3 resource, plus a Bucket handle bound to the name above.
s3 = boto3.resource("s3")
my_bucket = s3.Bucket(bucket)

In [None]:
# Copy every object whose key starts with "train/img" to "NII/<file name>"
# in the same bucket.
for obj in my_bucket.objects.filter(Prefix="train/img"):
    # Use the last path component as the file name. Indexing component [1]
    # only works for keys that are exactly two levels deep: it raises
    # IndexError for a key without "/" and yields the same name for every
    # key nested one level deeper.
    fname = obj.key.rsplit("/", 1)[-1]
    if not fname:
        # The key ends with "/" (presumably a "folder" placeholder object);
        # there is no file name to copy it under.
        continue
    print(obj.key)
    copy_source = {"Bucket": bucket, "Key": obj.key}
    s3.meta.client.copy(copy_source, bucket, f"NII/{fname}")

In [None]:
# Copy every object whose key starts with "test_noGT" to
# "NII/test_<file name>" in the same bucket.
for obj in my_bucket.objects.filter(Prefix="test_noGT"):
    # Use the last path component as the file name. Indexing component [1]
    # raises IndexError for a matching key that contains no "/".
    fname = obj.key.rsplit("/", 1)[-1]
    if not fname:
        # The key ends with "/" (presumably a "folder" placeholder object);
        # copying it would create a stray "NII/test_" object.
        continue
    print(obj.key)
    copy_source = {"Bucket": bucket, "Key": obj.key}
    s3.meta.client.copy(copy_source, bucket, f"NII/test_{fname}")

# 2. 전처리

## 2.1. ECR에 도커 이미지 푸시

```bash
aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 349048005035.dkr.ecr.us-east-2.amazonaws.com
docker tag <image name> <repository name>
docker push <repository name>
```


## 2.2. `ScriptProcessor` 실행

- [`ScriptProcessor` 클래스 Readthedocs](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.processing.ScriptProcessor)
- [Run Scripts with Your Own Processing Container](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-container-run-scripts.html)

In [1]:
import boto3
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker import get_execution_role
from sagemaker.session import Session
# Bucket handed to the SageMaker session as its default bucket.
BUCKET_NAME = "sagemaker-jhgan-workspace"
# ECR image URI the processing job runs in.
DOCKER_IMAGE_URI = "349048005035.dkr.ecr.us-east-2.amazonaws.com/pggan:latest"
# Passed to the processor as `volume_size_in_gb`.
VOLUME_SIZE = 200

s3 = boto3.resource("s3")

In [2]:
# Processing-job input: the "NII" prefix of the workspace bucket, exposed to
# the container at /opt/ml/processing/input_data.
inputObject = ProcessingInput(
    # Build the URI from BUCKET_NAME instead of repeating the bucket name,
    # so the input stays in step with the session's default bucket.
    source=f"s3://{BUCKET_NAME}/NII",
    destination="/opt/ml/processing/input_data",
)

In [4]:
# Processor that runs a script with `python` inside the custom ECR image on a
# single ml.m5.4xlarge instance.
script_processor = ScriptProcessor(
    command=["python"],
    image_uri=DOCKER_IMAGE_URI,
    instance_type="ml.m5.4xlarge",
    instance_count=1,
    volume_size_in_gb=VOLUME_SIZE,
    role=get_execution_role(),
    # Session whose default bucket is the workspace bucket.
    sagemaker_session=Session(default_bucket=BUCKET_NAME),
)

In [7]:
# Submit dataset_tool.py as a processing job named "nii-to-tfrecord-01".
# The container directory /opt/ml/processing/processed_data is declared as
# the job's single output; wait=True is passed so the call waits on the job.
script_processor.run(
    job_name="nii-to-tfrecord-01",
    code="dataset_tool.py",
    inputs=[inputObject],
    outputs=[ProcessingOutput(source="/opt/ml/processing/processed_data")],
    wait=True,
)

Parameter 'session' will be renamed to 'sagemaker_session' in SageMaker Python SDK v2.



Job Name:  nii-to-tfrecord-01
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-jhgan-workspace/NII', 'LocalPath': '/opt/ml/processing/input_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-jhgan-workspace/nii-to-tfrecord-01/input/code/dataset_tool.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-jhgan-workspace/nii-to-tfrecord-01/output/output-1', 'LocalPath': '/opt/ml/processing/processed_data', 'S3UploadMode': 'EndOfJob'}}]
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", n