In [1]:
import warnings

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer

from dataset import IMDBDataset
warnings.filterwarnings("ignore")

In [2]:
# Load the "imdb" dataset and the tokenizer that belongs to the
# "bert-base-uncased" checkpoint. Both names are reused by later cells.
dataset_id = "imdb"
checkpoint_id = "bert-base-uncased"
dataset = load_dataset(dataset_id)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated to, and padded up to, a fixed length of 128
    tokens, so every encoded example has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry. It is used with
            ``dataset.map(..., batched=True)``, so ``"text"`` holds a batch
            of strings.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)


# Convert the datasets: tokenize every split, then wrap the train and test
# splits in IMDBDataset.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = IMDBDataset(tokenized_datasets["train"])
test_dataset = IMDBDataset(tokenized_datasets["test"])

# Hold out part of the original training split for validation.
train_size = int(0.8 * len(train_dataset))  # 80% for training
val_size = len(train_dataset) - train_size  # remaining 20% for validation, so the sizes always sum to the total

# Seed the split explicitly. Without a fixed generator the train/validation
# partition changes on every kernel restart, which makes validation metrics
# incomparable between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_data, val_data = random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

batch_size = 16
# Only the training loader shuffles. The evaluation loaders keep a fixed order:
# shuffling does not change aggregate metrics and only makes batches
# non-reproducible.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [3]:
# Draw one batch from a fresh iterator over the (shuffling) training loader
# and show the shape of its first element.
sample_batch = next(iter(train_dataloader))
sample_batch[0].shape

torch.Size([16, 128])

In [4]:
# Show the first element of a freshly drawn training batch
# (presumably the token ids - confirm against IMDBDataset).
sample_batch = next(iter(train_dataloader))
sample_batch[0]

tensor([[  101, 24905, 17988,  ...,     0,     0,     0],
        [  101,  3437,  1024,  ...,  2084,  1996,   102],
        [  101,  5954,  2003,  ..., 24324,  2007,   102],
        ...,
        [  101,  2129,  2515,  ...,  2035,  1996,   102],
        [  101,  1996,  5436,  ...,  2038,  2196,   102],
        [  101,  2070,  2111,  ...,  4103,  1005,   102]])

In [5]:
# Draw one batch from a fresh iterator over the (shuffling) training loader
# and show the shape of its second element.
sample_batch = next(iter(train_dataloader))
sample_batch[1].shape

torch.Size([16, 128])

In [6]:
# Show the second element of a freshly drawn training batch
# (presumably the attention mask - confirm against IMDBDataset).
sample_batch = next(iter(train_dataloader))
sample_batch[1]

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [7]:
# Draw one batch from a fresh iterator over the (shuffling) training loader
# and show the shape of its third element.
sample_batch = next(iter(train_dataloader))
sample_batch[2].shape

torch.Size([16])

In [8]:
# Show the third element of a freshly drawn training batch
# (presumably the sentiment labels - confirm against IMDBDataset).
sample_batch = next(iter(train_dataloader))
sample_batch[2]

tensor([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1])