In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the IMDB dataset once and reuse it for both splits, instead of calling
# `load_dataset` a second time for the same data.
dataset = load_dataset("imdb")

# Convert the train and test splits to pandas DataFrames for the cells below.
train_dataset = dataset["train"].to_pandas()
test_dataset = dataset["test"].to_pandas()

In [None]:
# Make sure that texts are valid UTF-8 while keeping them as `str`.
# `str.encode` on its own returns `bytes`, and a bytes cell is written to CSV
# as its repr (b'...') rather than as the review text, so every value is
# round-tripped back to `str` here.
def _to_utf8_text(value):
    """Return `value` as text that is guaranteed to be encodable as UTF-8.

    - `bytes` are decoded as UTF-8, so re-running this cell on already
      encoded data is harmless.
    - `str` is encoded and decoded again; characters that cannot be
      represented (e.g. lone surrogates) are replaced instead of raising.
    - Any other value is returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    if isinstance(value, str):
        return value.encode("utf-8", errors="replace").decode("utf-8")
    return value


train_dataset["text"] = train_dataset["text"].apply(_to_utf8_text)
test_dataset["text"] = test_dataset["text"].apply(_to_utf8_text)

In [None]:
# Get smaller sample
sample = 200
# Fixed seed so that re-running the notebook draws the same rows and the
# small train/test files stay reproducible between runs.
sample_seed = 42
small_train_dataset = train_dataset.sample(sample, random_state=sample_seed)
small_test_dataset = test_dataset.sample(sample, random_state=sample_seed)

In [None]:
import os

# Create the output directories; making "data/small" also creates "data".
os.makedirs("data/small", exist_ok=True)

# Write each DataFrame to its CSV file without the index column, full
# datasets first and the small samples afterwards.
for frame, csv_path in (
    (train_dataset, "data/train.csv"),
    (test_dataset, "data/test.csv"),
    (small_train_dataset, "data/small/train.csv"),
    (small_test_dataset, "data/small/test.csv"),
):
    frame.to_csv(csv_path, index=False)

In [None]:
import sagemaker

# SageMaker session used by the upload cell below.
sagemaker_session = sagemaker.Session()

# Upload target: the session's default bucket, plus one key prefix for the
# full dataset and one for the small sample.
bucket = sagemaker_session.default_bucket()
prefix, small_prefix = "imdb/data", "imdb/data/small"

# Execution role as reported by the SageMaker SDK.
role = sagemaker.get_execution_role()

In [None]:
# Upload every CSV under its key prefix, in the same order as before:
# full train/test first, then the small train/test samples.
uploaded_uris = [
    sagemaker_session.upload_data(local_path, bucket=bucket, key_prefix=key_prefix)
    for local_path, key_prefix in (
        ("data/train.csv", prefix),
        ("data/test.csv", prefix),
        ("data/small/train.csv", small_prefix),
        ("data/small/test.csv", small_prefix),
    )
]

# Keep the result of the final upload as the cell's displayed value.
uploaded_uris[-1]

In [None]:
import io

import boto3
import pandas as pd
import sagemaker

# Session and default bucket used to locate the object to read.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3_resource = boto3.resource("s3")

# Fetch the object stored under "imdb/data/small/test.csv" and parse its
# body as a UTF-8 CSV held in memory.
small_test_object = s3_resource.Bucket(bucket).Object(key="imdb/data/small/test.csv")
response = small_test_object.get()
csv_payload = io.BytesIO(response["Body"].read())
data = pd.read_csv(csv_payload, encoding="utf-8")