# Chapter 2: Text Classification - Tokenization

## Library Imports

In [16]:
import pandas as pd
import numpy as np
import matplotlib as plt


from transformers import AutoTokenizer, DistilBertTokenizer

from datasets import list_datasets, load_dataset

## Getting A Dataset

In [18]:

# Show the first 10 dataset identifiers returned by list_datasets().
# A notebook cell only renders its *last* expression, so this slice has to be
# printed explicitly; as a bare mid-cell expression its value was discarded.
print(list_datasets()[:10])

# Load every split of the 'emotion' dataset (downloaded on first use, then cached).
emotions = load_dataset('emotion')
# Trailing expression: renders the DatasetDict summary (splits, features, row counts).
emotions

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

No config specified, defaulting to: emotion/split


Downloading and preparing dataset emotion/split to /Users/joshuaeason/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /Users/joshuaeason/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [20]:
# Pull the training split out of the DatasetDict for convenience.
train = emotions['train']
# Integer indexing returns a single example (rendered below as a dict with 'text' and 'label').
train[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [21]:
# Inspect the column schema: 'text' is a string Value and 'label' is a
# ClassLabel whose `names` list gives the emotion name for each integer id.
train.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [29]:
# Show the first example's text, its integer label, and the label's English name.
# The row is fetched once with train[0] (as in the earlier cell) instead of
# pulling whole columns (train['text'], train['label']) just to read element 0.
first_row = train[0]
first_label = first_row['label']
# ClassLabel.names is positioned by label id, so indexing it yields the name.
first_label_name = train.features['label'].names[first_label]
print(f"The text of the first row is: {first_row['text']}")
print(f"The emotion label of the first row is: {first_label}")
print(f"The corresponding english of label {first_label} is: {first_label_name}")

The text of the first row is: i didnt feel humiliated
The emotion label of the first row is: 0
The corresponding english of label 0 is: sadness


In [31]:
# Build the list of English label names, one per training example.
# The id -> name table and the label column are each read once up front:
# indexing train['label'][i] inside the loop re-fetched the entire column on
# every iteration, which is what made the original version so slow.
label_names = train.features['label'].names
en_label = [label_names[label_id] for label_id in train['label']]
# Sanity check: one name per training example.
len(en_label)

16000

In [39]:
# Create dataframe from emotions dataset.
# set_format is called on `emotions` itself (no reassignment), switching its
# output format so that the full slice [:] below comes back as a pandas
# DataFrame; a later cell calls emotions.reset_format() to undo this.
emotions.set_format(type='pandas')
df = emotions['train'][:]
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [40]:
# Lookup table of emotion names, positioned by integer label id (0 -> 'sadness', ...).
en_labels = train.features['label'].names
def convert_column(row, lookup):
    """Translate a row's integer label into its entry in ``lookup``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'label' entry.
        lookup: Sequence or mapping that can be indexed by that label value.

    Returns:
        The element of ``lookup`` stored at ``row['label']``.
    """
    label_id = row['label']
    return lookup[label_id]

def label_int2str(row):
    """Convert an integer label id to its name via the train split's label feature.

    Despite the parameter name, the notebook applies this element-wise to the
    'label' column, so ``row`` is a single label value rather than a whole row.
    Reads the module-level ``emotions`` dataset.
    """
    label_feature = emotions['train'].features['label']
    return label_feature.int2str(row)

In [41]:
# Two independent copies of the frame, one per labelling approach timed below,
# so each benchmark writes its 'en_label' column without touching `df`.
df_custom, df_stock = df.copy(), df.copy()

In [44]:
%%timeit
# Benchmark the hand-written lookup: a row-wise apply (axis=1) over `df` that
# calls convert_column for each row; the result is stored on df_custom.
df_custom['en_label'] = df.apply(lambda row: convert_column(row, en_labels), axis=1)

40.1 ms ± 433 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [45]:
%%timeit
# Benchmark the dataset-provided conversion: an element-wise apply over just
# the 'label' column, using label_int2str (the label feature's int2str).
df_stock['en_label'] = df_stock['label'].apply(label_int2str)

11.6 ms ± 64.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [51]:
# Undo the earlier emotions.set_format(type='pandas') call now that the
# DataFrame experiments are done.
emotions.reset_format()

## Tokenization

### Character Tokenization

In [46]:
# Character-level tokenization: every character of the string, spaces
# included, becomes its own token.
text = "This text has been tokenized based on a basic implementation of mapping characters to integer values uniquely in the text"
tokenized_text = [ch for ch in text]
print(tokenized_text)

['T', 'h', 'i', 's', ' ', 't', 'e', 'x', 't', ' ', 'h', 'a', 's', ' ', 'b', 'e', 'e', 'n', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'd', ' ', 'b', 'a', 's', 'e', 'd', ' ', 'o', 'n', ' ', 'a', ' ', 'b', 'a', 's', 'i', 'c', ' ', 'i', 'm', 'p', 'l', 'e', 'm', 'e', 'n', 't', 'a', 't', 'i', 'o', 'n', ' ', 'o', 'f', ' ', 'm', 'a', 'p', 'p', 'i', 'n', 'g', ' ', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r', 's', ' ', 't', 'o', ' ', 'i', 'n', 't', 'e', 'g', 'e', 'r', ' ', 'v', 'a', 'l', 'u', 'e', 's', ' ', 'u', 'n', 'i', 'q', 'u', 'e', 'l', 'y', ' ', 'i', 'n', ' ', 't', 'h', 'e', ' ', 't', 'e', 'x', 't']


In [49]:
# Step 1: collect the distinct characters that occur in the text
tok_set = set(tokenized_text)
# Step 2: put them in sorted order so every character gets a stable position
sorted_tok_set = sorted(tok_set)
# Step 3: build the char -> int vocabulary by pairing each character with its position
tok_to_idx_dict = dict(zip(sorted_tok_set, range(len(sorted_tok_set))))
# Step 4: encode the text by looking up each character's integer in the vocabulary
final_tok = [tok_to_idx_dict[symbol] for symbol in tokenized_text]
final_tok

[1,
 9,
 10,
 19,
 0,
 20,
 6,
 23,
 20,
 0,
 9,
 2,
 19,
 0,
 3,
 6,
 6,
 14,
 0,
 20,
 15,
 11,
 6,
 14,
 10,
 25,
 6,
 5,
 0,
 3,
 2,
 19,
 6,
 5,
 0,
 15,
 14,
 0,
 2,
 0,
 3,
 2,
 19,
 10,
 4,
 0,
 10,
 13,
 16,
 12,
 6,
 13,
 6,
 14,
 20,
 2,
 20,
 10,
 15,
 14,
 0,
 15,
 7,
 0,
 13,
 2,
 16,
 16,
 10,
 14,
 8,
 0,
 4,
 9,
 2,
 18,
 2,
 4,
 20,
 6,
 18,
 19,
 0,
 20,
 15,
 0,
 10,
 14,
 20,
 6,
 8,
 6,
 18,
 0,
 22,
 2,
 12,
 21,
 6,
 19,
 0,
 21,
 14,
 10,
 17,
 21,
 6,
 12,
 24,
 0,
 10,
 14,
 0,
 20,
 9,
 6,
 0,
 20,
 6,
 23,
 20]

### Word Tokenization

In [50]:
# Word-level tokenization: same recipe as the character version, but the
# tokens are whitespace-separated words (punctuation stays attached to its word).
text = 'This text has been tokenized using the same procedure as with character tokenization, except on the per-word level'
tokenized_text = text.split()
# Distinct words, then sorted so each one gets a stable position.
tok_set = set(tokenized_text)
sorted_tok_set = sorted(tok_set)
# Vocabulary: each distinct word paired with its position in the sorted list.
tok_to_idx_dict = dict(zip(sorted_tok_set, range(len(sorted_tok_set))))
# Encode the sentence by swapping every word for its vocabulary index.
final_tok = [tok_to_idx_dict[term] for term in tokenized_text]
final_tok

[0, 11, 5, 2, 14, 15, 12, 10, 9, 1, 16, 3, 13, 4, 7, 12, 8, 6]

### Subword Tokenization

In [4]:
# Name of the pretrained checkpoint whose tokenizer is used for the rest of the notebook.
model_check = 'distilbert-base-uncased'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_check)

In [5]:
# Sample sentence for the subword tokenizer.
text = 'This text has been tokenized using a pretrained model from HuggingFace'
# Calling the tokenizer encodes the string; the result shown below holds
# 'input_ids' and an 'attention_mask' of the same length.
encoded = tokenizer(text)
encoded

{'input_ids': [101, 2023, 3793, 2038, 2042, 19204, 3550, 2478, 1037, 3653, 23654, 2098, 2944, 2013, 17662, 12172, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# Map the integer ids back to their token strings. The output shows the added
# [CLS]/[SEP] markers and '##' prefixes on pieces that continue a word.
tokens = tokenizer.convert_ids_to_tokens(encoded.input_ids)
tokens

['[CLS]',
 'this',
 'text',
 'has',
 'been',
 'token',
 '##ized',
 'using',
 'a',
 'pre',
 '##train',
 '##ed',
 'model',
 'from',
 'hugging',
 '##face',
 '[SEP]']

In [7]:
# Join the tokens back into one string; as the output shows, '##' pieces are
# merged into the preceding token while [CLS]/[SEP] are kept.
words = tokenizer.convert_tokens_to_string(tokens)
words

'[CLS] this text has been tokenized using a pretrained model from huggingface [SEP]'

In [8]:
# Size of the tokenizer's vocabulary.
tokenizer.vocab_size

30522

### Tokenize Entire Dataset

In [53]:
def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping with a 'text' entry holding the strings to encode.

    Returns:
        The tokenizer's output for those strings, produced with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

In [54]:
# Tokenize just two rows from the training dataset
print(tokenize(emotions['train'][:2]))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [55]:
# Tokenize the entire dataset in one large batch
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [59]:
# This is the first row
print(emotions_encoded['train'][0])

{'text': 'i didnt feel humiliated', 'label': 0, 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [58]:
# This shows the encoded dataset with new columns...
print(emotions_encoded['train'].column_names)

['text', 'label', 'input_ids', 'attention_mask']
