In [1]:
import numpy as np
import pandas as pd

import torch
import transformers

import tqdm

In [2]:
df = pd.read_csv('bbc-text.csv')

In [3]:
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [4]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
example_text = df.iloc[0]['text']
example_output = tokenizer(example_text, padding='max_length', max_length = 64, truncation=True, return_tensors="pt")

In [6]:
example_output

{'input_ids': tensor([[  101,  2694,  2925,  1999,  1996,  2398,  1997,  7193,  2007,  2188,
          3004,  3001, 12123,  2152,  1011,  6210,  2694,  2015,  1998,  3617,
          2678, 14520,  2015,  3048,  2046,  1996,  2542,  2282,  1996,  2126,
          2111,  3422,  2694,  2097,  2022, 25796,  2367,  1999,  2274,  2086,
          2051,  1012,  2008,  2003,  2429,  2000,  2019,  6739,  5997,  2029,
          5935,  2012,  1996,  3296,  7325,  8139,  2265,  1999,  5869,  7136,
          2000,  6848,  2129,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [7]:
len(example_text.split())

737

In [8]:
df_train = df.sample(100)

# train_text[i] is a dict with 
# keys = dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
# and each value is a tensor of shape [1, max_length]
train_text = [
    tokenizer(text, padding='max_length', max_length = 64, truncation=True, return_tensors="pt") 
    for text in df_train['text']]

In [9]:
train_text[12].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
train_text[12]['input_ids'].shape

torch.Size([1, 64])

In [11]:
LABELS = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4}

In [12]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, xcol='text', ycol='category', max_length=128):

        self.labels = [LABELS[label] for label in df[ycol]]
        self.texts = [
            tokenizer(
                text, 
                padding='max_length', 
                max_length=max_length, 
                truncation=True, 
                return_tensors="pt") 
            for text in df[xcol]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [13]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])

len(df_train),len(df_val), len(df_test)

(1780, 222, 223)

In [14]:
train_dataset = CustomDataset(df_train, max_length=64)
val_dataset = CustomDataset(df_val, max_length=64)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

In [15]:
for i, (X, y) in enumerate(tqdm.tqdm(train_dataloader)):
    if i == 1:
        print(f"Batch {i+1}")
        print(X['input_ids'].shape, y.shape)

100%|████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 2153.98it/s]

Batch 2
torch.Size([32, 1, 64]) torch.Size([32])





In [16]:
X['input_ids'].shape, X['input_ids'].squeeze(1), X['input_ids'].squeeze(1).shape

(torch.Size([20, 1, 64]),
 tensor([[  101,  3493,  3926,  ...,  1011,  1014,   102],
         [  101,  2647,  3101,  ...,  2836,  2006,   102],
         [  101, 12849, 12083,  ...,  5937,  7164,   102],
         ...,
         [  101,  2829,  4455,  ...,  2011,  2262,   102],
         [  101, 17117,  5150,  ...,  4216,  2409,   102],
         [  101, 11691,  9860,  ..., 13411,  3410,   102]]),
 torch.Size([20, 64]))

In [17]:
y

tensor([2, 2, 2, 0, 3, 4, 0, 0, 0, 3, 4, 0, 0, 0, 3, 4, 4, 4, 4, 2],
       dtype=torch.int32)

In [18]:
# num of batches
len(df_train)/32, len(df_train)

(55.625, 1780)