# Preparing for fine-tunning

## Tokenizing text

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from datasets import load_dataset 

train_data = load_dataset("imdb", split="train") 
train_data = train_data.shard(num_shards=4, index=0) 
test_data = load_dataset("imdb", split="test") 
test_data = test_data.shard(num_shards=4, index=0)

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") 
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
tokenized_training_data = tokenizer(str(train_data["text"]), return_tensors="pt", padding=True, truncation=True, max_length=64)
tokenized_test_data = tokenizer(str(test_data["text"]), return_tensors="pt", padding=True, truncation=True, max_length=64)

print(tokenized_training_data)
print(tokenized_test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,  5930,  1006,  1031,  1005,  1045, 12524,  1045,  2572,  8025,
          1011,  3756,  2013,  2026,  2678,  3573,  2138,  1997,  2035,  1996,
          6704,  2008,  5129,  2009,  2043,  2009,  2001,  2034,  2207,  1999,
          3476,  1012,  1045,  2036,  2657,  2008,  2012,  2034,  2009,  2001,
          8243,  2011,  1057,  1012,  1055,  1012,  8205,  2065,  2009,  2412,
          2699,  2000,  4607,  2023,  2406,  1010,  3568,  2108,  1037,  5470,
          1997,  3152,  2641,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Mapping tokenization

In [9]:
# Complete the function
def tokenize_function(data):
    return tokenizer(data["text"], 
                     return_tensors="pt", 
                     padding=True, 
                     truncation=True, 
                     max_length=64)

tokenized_in_batches = train_data.map(tokenize_function, batched=True)

print(tokenized_in_batches)

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6250
})


In [10]:
# Complete the function
def tokenize_function(data):
    return tokenizer(data["text"], 
                     return_tensors="pt", 
                     padding=True, 
                     truncation=True, 
                     max_length=64)

# Tokenize row by row
tokenized_by_row = train_data.map(tokenize_function, batched=False)

print(tokenized_by_row)

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6250
})
