In [None]:
!pip install --upgrade pip
!pip install torch==1.7
!pip install torchtext==0.8.0

In [1]:
import torch
import random
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# One seed value shared by every random number generator used below.
seed = 10

# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

# Seed PyTorch, NumPy and Python's built-in generator with the same value.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

## Model Selection

In [None]:
!pip install transformers

In [5]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint whose tokenizer is used in this notebook.
base_model = 'bert-base-uncased'

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [6]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [9]:
# Split a sample sentence into the tokenizer's sub-word tokens
# (pieces prefixed with `##` continue the preceding token).
tokens = tokenizer.tokenize('Welcome to MLDevOps Workshop')
print(tokens)

['welcome', 'to', 'ml', '##dev', '##ops', 'workshop']


In [10]:
# Map each token from the previous cell to its integer id in the tokenizer vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[6160, 2000, 19875, 24844, 11923, 8395]


In [11]:
# Special-token strings exposed by the tokenizer:
# sequence start ([CLS]), sequence end ([SEP]), padding and unknown-word.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [12]:
# Special-token strings of the tokenizer (same lookup as the previous cell).
# The [CLS] string is bound to `init_token`, the name the print below reads;
# `init_token_idx` is reserved for the integer id computed in the next cell
# and must not be assigned a token string here.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [14]:
# Convert each special-token string to its integer vocabulary id,
# in the order: init, eos, pad, unk.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = map(
    tokenizer.convert_tokens_to_ids,
    (init_token, eos_token, pad_token, unk_token),
)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [15]:
# Maximum input length (in tokens) reported by the loaded tokenizer.
# `model_max_length` is read from the tokenizer instance itself, so it does not
# depend on the per-checkpoint `max_model_input_sizes` lookup table.
max_input_length = tokenizer.model_max_length
print(max_input_length)

512


In [16]:
# Override the tokenizer-reported limit with a small cap;
# tokenize_and_cut() reads this global when truncating.
max_input_length = 10

In [18]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and truncate the result to the configured length.

    Reads the notebook-level ``tokenizer`` and ``max_input_length`` globals at
    call time. Two positions are held back from ``max_input_length``
    (presumably for the init and eos tokens added later - confirm).

    Args:
        sentence: Raw text to tokenize.

    Returns:
        At most ``max_input_length - 2`` leading tokens of the sentence; an
        empty result when ``max_input_length`` is 2 or less.
    """
    # Clamp at zero: a negative slice bound would trim tokens from the end
    # of the sequence instead of returning nothing.
    keep = max(max_input_length - 2, 0)
    return tokenizer.tokenize(sentence)[:keep]

## Data Collation

In [22]:
from torchtext import data

# Text field driven by the pretrained tokenizer rather than a torchtext vocab:
# sentences are tokenized and truncated by tokenize_and_cut, converted to ids by
# the tokenizer, and the special-token ids computed earlier are used for the
# init / eos / pad / unk slots.
text = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)



In [30]:
from torchtext import datasets

# Fields describing how the IMDB text and label columns are processed.
# NOTE(review): the tokenizer-driven `text` field defined in the previous cell
# is not used here; the dataset is built with this plain `TEXT` field - confirm
# which one is intended.
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data reproducibly.
# `random.seed()` returns None, so it must not be passed as `random_state`;
# seed first, then hand the resulting generator state to split().
random.seed(seed)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [31]:
# Display the training split object.
train_data

<torchtext.data.dataset.Dataset at 0x7f300b2c29e8>

In [34]:
# Report how many examples ended up in each split.
print(f"Number of training examples:{len(train_data)}")
print(f"Number of validation examples:{len(valid_data)}")
print(f"Number of testing examples:{len(test_data)}")

Number of training examples:17500
Number of validation examples:7500
Number of testing examples:25000


In [44]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [45]:
# Show the label-string -> index mapping.
# NOTE(review): the mapping contains '<unk>' at index 0, so 'neg'/'pos' map to
# 1/2 rather than 0/1; presumably a consequence of LABEL being a plain Field -
# confirm this is what downstream code expects.
print(LABEL.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f300b24eba8>>, {'<unk>': 0, 'neg': 1, 'pos': 2})


In [None]:
# Number of examples per batch.
batch_size = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One bucketing iterator per split, all sharing the same batch size and device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device,
)