In [1]:
import torch

from datasets import load_dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when torch reports that CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'using device: {device}')

using device: cuda


In [3]:
def truncate(example, max_words=50):
    """Shorten an example's text to its first ``max_words`` whitespace-separated words.

    The text is split on any run of whitespace and the kept words are re-joined
    with single spaces, so the original spacing and line breaks are not preserved.

    Args:
        example: Mapping with a string under ``'text'`` and a value under ``'label'``.
        max_words: Maximum number of words to keep. Defaults to 50, so the
            function can still be passed directly to ``.map(truncate)``.

    Returns:
        A new dict with the shortened ``'text'`` and the unchanged ``'label'``.
    """
    kept_words = example['text'].split()[:max_words]
    return {
        'text': " ".join(kept_words),
        'label': example['label']
    }

In [4]:
# Build a small train/val subset of IMDB for quick experiments.
# The training split is shuffled once with a fixed seed and both subsets are
# taken from that single permutation: rows 0-127 become 'train' and rows
# 128-159 become 'val'. Reusing one shuffled view avoids shuffling the full
# split twice and makes it explicit that the two index ranges refer to the
# same ordering, so they do not overlap.
# Note that 'val' is drawn from the IMDB *train* split, not the test split.
imdb_dataset = load_dataset('imdb')
shuffled_train = imdb_dataset['train'].shuffle(seed=1111)
small_imdb_dataset = DatasetDict(
    train=shuffled_train.select(range(128)).map(truncate),
    val=shuffled_train.select(range(128, 160)).map(truncate)
)

In [5]:
# Peek at the first ten (truncated) training examples.
train_preview = small_imdb_dataset['train'][:10]
print(train_preview)

{'text': ["Probably Jackie Chan's best film in the 1980s, and the one that put him on the map. The scale of this self-directed police drama is evident from the opening and closing scenes, during which a squatters' village and shopping mall are demolished. There are, clearly, differences between the original Chinese", 'A wonderful movie! Anyone growing up in an Italian family will definitely see themselves in these characters. A good family movie with sadness, humor, and very good acting from all. You will enjoy this movie!! We need more like it.', 'HORRENDOUS! Avoid like the plague. I would rate this in the top 10 worst movies ever. Special effects, acting, mood, sound, etc. appear to be done by day care students...wait, I have seen programs better than this. Opens like a soft porn show with a blurred nude female doing a', 'And I absolutely adore Isabelle Blais!!! She was so cute in this movie, and far different from her role in "Quebec-Montreal" where she was more like a man-eater. I 

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'siebert/sentiment-roberta-large-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def _tokenize_batch(batch):
    """Tokenize the 'text' column of one batch with padding and truncation enabled."""
    return tokenizer(batch['text'], padding=True, truncation=True)


# Prepare the dataset: tokenize both splits, 16 examples per tokenizer call.
small_tokenized_dataset = small_imdb_dataset.map(
    _tokenize_batch,
    batched=True,
    batch_size=16,
)

# Drop the raw text, rename the target column to 'labels', and switch the
# output format to torch.
small_tokenized_dataset = (
    small_tokenized_dataset
    .remove_columns(['text'])
    .rename_column("label", 'labels')
)
small_tokenized_dataset.set_format('torch')

small_tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 128
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 32
    })
})

In [7]:
# Inspect the first two tokenized training rows.
tokenized_train = small_tokenized_dataset['train']
tokenized_train[:2]

{'labels': tensor([1, 1]),
 'input_ids': tensor([[    0, 35882, 12585,  8710,    18,   275,   822,    11,     5,  5114,
             29,     6,     8,     5,    65,    14,   342,   123,    15,     5,
           5456,     4,    20,  3189,     9,    42,  1403,    12, 25706,   249,
           4149,    16, 10180,    31,     5,  1273,     8,  3172,  5422,     6,
            148,    61,    10, 31147,  2696,   108,  3375,     8,  3482,  9367,
             32, 20766,     4,   345,    32,     6,  2563,     6,  5550,   227,
              5,  1461,  1111,     2,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1],
         [    0,   250,  4613,  1569,   328,  6142,  1197,    62,    11,    41,
           3108,   284,    40,  2299,   192,  1235,    11,   209,  3768,     4,
             83,   205,   284,  1569,    19, 17437,     6, 12073,     6,     8,
            182,   2

In [8]:
from torch.utils.data import DataLoader


# Wrap each split in a DataLoader that yields 16 examples per batch.
# NOTE(review): the tokenization step used padding=True on batches of 16, so
# rows are presumably padded per tokenization batch rather than to one global
# length. Keeping batch_size=16 and leaving shuffling off keeps loader batches
# aligned with those groups; confirm before changing either setting.
train_dataloader, eval_dataloader = (
    DataLoader(small_tokenized_dataset[split_name], batch_size=16)
    for split_name in ('train', 'val')
)

In [None]:
# training