In [9]:
import torch
import pandas as pd
import os
import pickle

from local_utils.data_utils import set_seed, make_path
from local_utils.tokenizer_utils import init_tokenizer, batch_encode



In [2]:
# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


<h3>Importing datasets into dataframes</h3>

In [3]:
# Load the training and validation splits. Only the "text" and "BROWSE_NODE_ID"
# columns are read below. Note that the validation split comes from the file
# named testdata_protov2.csv.
train_data = pd.read_csv('./Dataset/traindata_protov2.csv')
valid_data = pd.read_csv('./Dataset/testdata_protov2.csv')

# Distinct non-null labels per split (same result as len(value_counts())).
num_classes_train = train_data["BROWSE_NODE_ID"].nunique()
num_classes_valid = valid_data["BROWSE_NODE_ID"].nunique()

print(f"\ntrain dataset:\nno of records : {len(train_data)}, no of classes : {num_classes_train}")
print(f"\nvalid dataset:\nno of records : {len(valid_data)}, no of classes : {num_classes_valid}")
# The label says "%", so render the valid/train ratio as a percentage rather
# than printing the raw fraction (0.0062 would otherwise read as 0.0062 %).
print(f"\nvalid % : {len(valid_data)/len(train_data):.2%}")

# Raw texts and labels as arrays; these names are consumed by the tokenization
# and export cells further down.
train_texts = train_data["text"].values
train_labels = train_data["BROWSE_NODE_ID"].values

valid_texts = valid_data["text"].values
valid_labels = valid_data["BROWSE_NODE_ID"].values

# Number of distinct label values in the training split.
num_labels = len(train_data["BROWSE_NODE_ID"].unique())



train dataset:
no of records : 1004773, no of classes : 250

valid dataset:
no of records : 6248, no of classes : 250

valid % : 0.006218319958836474


<h3>Available tokenizers: pick one of the names below (name : tokenizer class, pretrained checkpoint)</h3>
<ul>
    <li>"Electra" : (ElectraTokenizerFast,'google/electra-base-discriminator')</li>
    <li>"Bert" : (BertTokenizerFast,'bert-base-uncased')</li>
    <li>"Deberta" : (DebertaTokenizerFast,'microsoft/deberta-base')</li>
    <li>"DistilBert" : (DistilBertTokenizerFast,'distilbert-base-uncased')</li>
    <li>"Roberta" : (RobertaTokenizerFast, 'roberta-base')</li>
</ul>    

<h3> Selecting and initializing tokenizer </h3>

In [4]:
# Name of the tokenizer to use; the same name is embedded in the exported
# pickle file names further down.
tokenizer_name = "Deberta"

# init_tokenizer is a project helper from local_utils.tokenizer_utils.
# NOTE(review): presumably it maps the name to a tokenizer class and pretrained
# checkpoint — confirm against the helper.
tokenizer = init_tokenizer(tokenizer_name)

# Print the tokenizer's configuration as a sanity check.
print(tokenizer)

DebertaTokenizerFast(name_or_path='microsoft/deberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50264: AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}


<h3>Tokenizing the training and validation texts</h3>

In [5]:
# Tokenization settings shared by both splits. Keeping them in one place
# guarantees the training and validation sets are encoded identically.
# NOTE(review): exact semantics of max_seq_len / batch_size are defined by
# batch_encode in local_utils.tokenizer_utils — presumably the per-text token
# length and the number of texts encoded per chunk; confirm there.
MAX_SEQ_LEN = 64
ENCODE_BATCH_SIZE = 5000

# Encode the training texts first, then the validation texts.
input_ids_train, attention_masks_train = batch_encode(list(train_texts),
                                                      tokenizer,
                                                      max_seq_len = MAX_SEQ_LEN,
                                                      batch_size = ENCODE_BATCH_SIZE)
input_ids_valid, attention_masks_valid = batch_encode(list(valid_texts),
                                                      tokenizer,
                                                      max_seq_len = MAX_SEQ_LEN,
                                                      batch_size = ENCODE_BATCH_SIZE)

100%|██████████| 201/201 [02:51<00:00,  1.17it/s]
100%|██████████| 2/2 [00:01<00:00,  1.72it/s]


<h3>Saving Tokenizer</h3>

In [7]:
# Directory that receives the tokenizer files.
tokenizer_files_path = './local_utils/tokenizer_files/'

# make_path is a project helper from local_utils.data_utils.
# NOTE(review): presumably it creates the directory when missing — confirm.
make_path(tokenizer_files_path)
# Kept as the cell's last expression so the returned tuple of written file
# paths is displayed as the cell output.
tokenizer.save_pretrained(tokenizer_files_path)

('./local_utils/tokenizer_files/tokenizer_config.json',
 './local_utils/tokenizer_files/special_tokens_map.json',
 './local_utils/tokenizer_files/vocab.json',
 './local_utils/tokenizer_files/merges.txt',
 './local_utils/tokenizer_files/added_tokens.json',
 './local_utils/tokenizer_files/tokenizer.json')

<h3>Saving the tokenized datasets in pickle format for efficient export and re-import during training</h3>

In [11]:
# Bundle each split's encoded inputs with its labels under a common set of keys.
token_encoding_train = {
    "input_ids" : input_ids_train,
    "attention_masks" : attention_masks_train,
    "labels" : train_labels,
}
token_encoding_valid = {
    "input_ids" : input_ids_valid,
    "attention_masks" : attention_masks_valid,
    "labels" : valid_labels,
}

# Output paths embed the tokenizer name so exports from different tokenizers
# do not overwrite each other.
SAVE_NAME_TRAIN = f'./Dataset/token_{tokenizer_name}_train_proto.pkl'
SAVE_NAME_VALID = f'./Dataset/token_{tokenizer_name}_valid_proto.pkl'

# Write the training bundle first, then the validation bundle.
for save_name, encoding in (
    (SAVE_NAME_TRAIN, token_encoding_train),
    (SAVE_NAME_VALID, token_encoding_valid),
):
    with open(save_name, 'wb') as f:
        pickle.dump(encoding, f)
            