## Tokenization

### Importing the relevant libraries


In [20]:
from transformers import AutoTokenizer
import datasets
import pickle
from tqdm import tqdm

### Setting up tokenizer

In [21]:
# Checkpoint name handed to AutoTokenizer; `BERT_tokenizer` is the tokenizer
# object used by `tokenize_data` further down.
pre_trained_BERTmodel = "bert-large-uncased"
BERT_tokenizer = AutoTokenizer.from_pretrained(pre_trained_BERTmodel)

### Loading data

In [22]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # presumably produced by an earlier notebook of this project — confirm
    # they are trusted before loading.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Files are read in the same order as before, one object per file.
training_datasets = _read_pickle('training_datasets.pkl')
val_data = _read_pickle('val_data.pkl')
test_data = _read_pickle('test_data.pkl')
augmented_datasets = _read_pickle('augmented_datasets.pkl')
train_data_full = _read_pickle('train_data_full.pkl')

### Function to tokenize the data


In [23]:
def tokenize_data(example):
    """Tokenize a record's text and attach its intent as the label.

    Args:
        example: Mapping with a "speech_text" entry, which is passed to
            `BERT_tokenizer`, and an "intent" entry, which is copied through
            unchanged.

    Returns:
        dict with the tokenizer's "input_ids" and "attention_mask" plus a
        "labels" entry holding `example["intent"]`.
    """
    tokens = BERT_tokenizer(
        example["speech_text"],
        padding="max_length",
        truncation=True,
    )
    # Keep only the two tokenizer outputs that are needed downstream.
    features = {key: tokens[key] for key in ("input_ids", "attention_mask")}
    features["labels"] = example["intent"]
    return features

#### Tokenizing non-augmented training data

In [24]:
# Convert every entry of `training_datasets` to a Dataset and tokenize it,
# keeping the results in the same order as the input list.
train_dataset = [
    datasets.Dataset.from_pandas(frame).map(tokenize_data)
    for frame in training_datasets
]

Map: 100%|██████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 3488.59 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 3632.38 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 3693.89 examples/s]


#### Tokenizing augmented training data

In [25]:
# Convert every entry of `augmented_datasets` to a Dataset and tokenize it,
# keeping the results in the same order as the input list.
augmented_train_dataset = [
    datasets.Dataset.from_pandas(frame).map(tokenize_data)
    for frame in augmented_datasets
]

Map: 100%|████████████████████████████████████████████████████████████████████████| 1280/1280 [00:00<00:00, 3566.96 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████| 1280/1280 [00:00<00:00, 3610.75 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████| 1280/1280 [00:00<00:00, 3578.31 examples/s]


#### Tokenizing validation data

In [26]:
# Convert the validation data to a Dataset and tokenize it.
# `val_data` is rebound to the tokenized Dataset because the save cell dumps
# that name. The conversion is guarded so the cell can be re-run: on a second
# execution `val_data` is already a Dataset, and `from_pandas` must not be
# called on it again.
if not isinstance(val_data, datasets.Dataset):
    val_data = datasets.Dataset.from_pandas(val_data)
val_data = val_data.map(tokenize_data)

Map: 100%|██████████████████████████████████████████████████████████████████████████| 993/993 [00:00<00:00, 3587.75 examples/s]


#### Tokenizing test data

In [27]:
# Convert the test data to a Dataset and tokenize it in one step; `test_data`
# itself is left untouched.
test_dataset = datasets.Dataset.from_pandas(test_data).map(tokenize_data)

Map: 100%|████████████████████████████████████████████████████████████████████████| 1075/1075 [00:00<00:00, 3573.40 examples/s]


#### Tokenizing full training data

In [28]:
# Convert the full training data to a Dataset and tokenize it.
# `train_data_full` is rebound to the tokenized Dataset because the save cell
# dumps that name. The conversion is guarded so the cell can be re-run: on a
# second execution `train_data_full` is already a Dataset, and `from_pandas`
# must not be called on it again.
if not isinstance(train_data_full, datasets.Dataset):
    train_data_full = datasets.Dataset.from_pandas(train_data_full)
train_data_full = train_data_full.map(tokenize_data)

Map: 100%|████████████████████████████████████████████████████████████████████████| 9927/9927 [00:02<00:00, 3578.87 examples/s]


### Store tokenized data

In [29]:
# Output file -> object to pickle. Dict order is the write order.
tokenized_outputs = {
    'train_dataset_tokenized.pkl': train_dataset,
    'val_data_tokenized.pkl': val_data,
    'test_data_tokenized.pkl': test_dataset,
    'augmented_train_dataset_tokenized.pkl': augmented_train_dataset,
    'train_dataset_full_tokenized.pkl': train_data_full,
}
for out_path, payload in tokenized_outputs.items():
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle)

### Output


This notebook generates the following five files:

- train_dataset_tokenized.pkl

- val_data_tokenized.pkl

- test_data_tokenized.pkl

- augmented_train_dataset_tokenized.pkl

- train_dataset_full_tokenized.pkl