In [1]:
import numpy as np
import pandas as pd
import torch
import transformers

In [2]:
from src import custom_label, custom_dataset

In [3]:
BERT = 'bert-base-uncased'

BATCH_SIZE = 32
MAX_LENGTH = 2**10

TEXT_COL = 'text'
LABEL_COL = 'category'

In [4]:
df = pd.read_csv('data/bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
label_encoder = custom_label.CustomLabelEncoder(df[LABEL_COL])

In [6]:
df = custom_label.encode_labels(
    label_encoder=label_encoder, 
    df=df, 
    label_col=LABEL_COL)
df.head()

Unnamed: 0,category,text
0,4,tv future in the hands of viewers with home th...
1,0,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,3,yeading face newcastle in fa cup premiership s...
4,1,ocean s twelve raids box office ocean s twelve...


### Look at tokenization on a single string

In [7]:
tokenizer = transformers.BertTokenizer.from_pretrained(BERT)

In [8]:
example_text = df.iloc[0][TEXT_COL]
example_output = tokenizer(
    example_text, 
    padding='max_length', 
    max_length=MAX_LENGTH, 
    truncation=True, 
    return_tensors="pt")

example_output.keys(), example_output, example_output['input_ids'].shape

(dict_keys(['input_ids', 'token_type_ids', 'attention_mask']),
 {'input_ids': tensor([[ 101, 2694, 2925,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])},
 torch.Size([1, 1024]))

### Look at tokenization of a batch (batches) of strings

In [9]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

len(df_train),len(df_val), len(df_test)

(1780, 222, 223)

In [10]:
f"There will be about {np.ceil(len(df_test)/BATCH_SIZE)} batches of test data; each batch is of {BATCH_SIZE} texts."

'There will be about 7.0 batches of test data; each batch is of 32 texts.'

In [11]:
test_dataset = custom_dataset.CustomDataset(
    tokenizer, df_test, TEXT_COL, LABEL_COL, max_length=MAX_LENGTH)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [12]:
len(test_dataset), len(test_dataloader)

(223, 7)

In [13]:
test_iter = iter(test_dataloader)

X, y = test_iter.next()
print(X['input_ids'].shape, y.shape)
X, y

torch.Size([32, 1, 1024]) torch.Size([32])


({'input_ids': tensor([[[  101,  9590,  7559,  ...,     0,     0,     0]],
  
          [[  101,  4186,  5081,  ...,     0,     0,     0]],
  
          [[  101, 11006,  4520,  ...,     0,     0,     0]],
  
          ...,
  
          [[  101,  4517,  2303,  ...,     0,     0,     0]],
  
          [[  101,  9106, 10768,  ...,     0,     0,     0]],
  
          [[  101,  7206, 19311,  ...,     0,     0,     0]]]),
  'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          ...,
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]]]),
  'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          ...,
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]]])},
 tensor([1, 3, 3,

In [14]:
X, y = test_iter.next()
print(X['input_ids'].shape, y.shape)
X, y

torch.Size([32, 1, 1024]) torch.Size([32])


({'input_ids': tensor([[[  101,  1047, 24412,  ...,     0,     0,     0]],
  
          [[  101, 11865,  6562,  ...,     0,     0,     0]],
  
          [[  101,  2149, 21358,  ...,     0,     0,     0]],
  
          ...,
  
          [[  101,  2634,  2373,  ...,     0,     0,     0]],
  
          [[  101,  1056,  1011,  ...,     0,     0,     0]],
  
          [[  101,  8675, 23102,  ...,     0,     0,     0]]]),
  'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          ...,
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]],
  
          [[0, 0, 0,  ..., 0, 0, 0]]]),
  'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          ...,
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]],
  
          [[1, 1, 1,  ..., 0, 0, 0]]])},
 tensor([3, 3, 1,