## Tokenization

### Importing the relevant libraries


In [50]:
from transformers import AutoTokenizer
import datasets
import pickle
from tqdm import tqdm

### Setting up tokenizer

In [51]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained below.
pre_trained_BERTmodel = 'bert-large-uncased'

# Tokenizer instance used by `tokenize_data` for every dataset split.
BERT_tokenizer = AutoTokenizer.from_pretrained(pre_trained_BERTmodel)

### Loading data

In [52]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must come from a trusted source.

# Non-augmented training data (iterated over as a collection further down).
with open('training_datasets.pkl', 'rb') as pkl_in:
    training_datasets = pickle.load(pkl_in)

# Validation data.
with open('val_data.pkl', 'rb') as pkl_in:
    val_data = pickle.load(pkl_in)

# Test data.
with open('test_data.pkl', 'rb') as pkl_in:
    test_data = pickle.load(pkl_in)

# Augmented training data (iterated over as a collection further down).
with open('augmented_datasets.pkl', 'rb') as pkl_in:
    augmented_datasets = pickle.load(pkl_in)

### Function to tokenize the data


In [53]:
def tokenize_data(example):
    """Tokenize one example's text and attach its intent as the label.

    The text under ``example["speech_text"]`` is passed to the module-level
    ``BERT_tokenizer`` with ``padding="max_length"`` and ``truncation=True``.

    Args:
        example: Mapping-like record providing the keys ``"speech_text"``
            and ``"intent"``.

    Returns:
        dict with three keys: ``"input_ids"`` and ``"attention_mask"`` taken
        from the tokenizer output, and ``"labels"`` copied unchanged from
        ``example["intent"]``.
    """
    encoding = BERT_tokenizer(
        example["speech_text"],
        padding="max_length",
        truncation=True,
    )
    # Keep only the tokenizer fields that are forwarded, in a fixed order.
    features = {field: encoding[field] for field in ("input_ids", "attention_mask")}
    features["labels"] = example["intent"]
    return features

#### Tokenizing non-augmented training data

In [54]:
# Build one tokenized Dataset per entry of `training_datasets`: each entry is
# wrapped with Dataset.from_pandas and then mapped through `tokenize_data`.
train_dataset = [
    datasets.Dataset.from_pandas(frame).map(tokenize_data)
    for frame in training_datasets
]

Map: 100%|██████████| 640/640 [00:00<00:00, 3406.86 examples/s]
Map: 100%|██████████| 640/640 [00:00<00:00, 3479.71 examples/s]
Map: 100%|██████████| 640/640 [00:00<00:00, 3542.67 examples/s]


#### Tokenizing augmented training data

In [55]:
# Build one tokenized Dataset per entry of `augmented_datasets`: each entry is
# wrapped with Dataset.from_pandas and then mapped through `tokenize_data`.
augmented_train_dataset = [
    datasets.Dataset.from_pandas(frame).map(tokenize_data)
    for frame in augmented_datasets
]

Map: 100%|██████████| 640/640 [00:00<00:00, 3465.59 examples/s]
Map: 100%|██████████| 640/640 [00:00<00:00, 3522.57 examples/s]
Map: 100%|██████████| 640/640 [00:00<00:00, 3497.46 examples/s]


#### Tokenizing validation data

In [56]:
# `val_data` is rebound from the loaded pandas object to a Dataset here, so the
# conversion is guarded: without the check, re-running this cell would pass an
# already-converted Dataset to Dataset.from_pandas, which expects a DataFrame.
if not isinstance(val_data, datasets.Dataset):
    val_data = datasets.Dataset.from_pandas(val_data)

# Tokenize every row; re-running simply recomputes the same tokenized columns.
val_data = val_data.map(tokenize_data)

Map: 100%|██████████| 993/993 [00:00<00:00, 3460.64 examples/s]


#### Tokenizing test data

In [57]:
# Wrap the loaded test data in a Dataset and tokenize every row in one step.
test_dataset = datasets.Dataset.from_pandas(test_data).map(tokenize_data)

Map: 100%|██████████| 1075/1075 [00:00<00:00, 3522.23 examples/s]


### Store tokenized data

In [58]:
# Output file name -> tokenized object to pickle. Dict insertion order fixes
# the order in which the files are written.
tokenized_outputs = {
    'train_dataset_tokenized.pkl': train_dataset,
    'val_data_tokenized.pkl': val_data,
    'test_data_tokenized.pkl': test_dataset,
    'augmented_train_dataset_tokenized.pkl': augmented_train_dataset,
}

for output_name, tokenized_obj in tokenized_outputs.items():
    with open(output_name, 'wb') as pkl_out:
        pickle.dump(tokenized_obj, pkl_out)

### Output


This notebook generates the following four files:

- train_dataset_tokenized.pkl

- val_data_tokenized.pkl

- test_data_tokenized.pkl

- augmented_train_dataset_tokenized.pkl