In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

# Load the labelled comments. The CSV must provide a 'comment' column (text)
# and a 'label' column, both of which are read by name below.
df = pd.read_csv('dataset.csv')

X = df['comment']
y = df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2, shuffle=True
)

# Write only the training comments to disk as the tokenizer-training corpus,
# so the held-out comments never influence the learned vocabulary.
# The encoding is pinned to UTF-8: without it, open() uses the platform's
# locale encoding, which can raise UnicodeEncodeError (or silently produce a
# differently-encoded file) when comments contain emoji or accented characters.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    corpus = ' '.join(X_train)
    f.write(corpus)

In [2]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    Regex
)

In [3]:
# Create an untrained WordPiece tokenizer; "[UNK]" is the token the model
# emits for pieces it cannot find in its vocabulary.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [4]:
# Supplementary-plane code points (U+10000 and above), which is where most
# emoji are encoded.
emoji_pattern = Regex('[\U00010000-\U0010ffff]')
# Anything that is neither an ASCII letter nor whitespace. Written as a raw
# string so `\s` reaches the regex engine verbatim instead of being parsed by
# Python as an (invalid) string escape, which triggers a warning on recent
# Python versions.
special_pattern = Regex(r"[^a-zA-Z\s]")

# Normalisation pipeline, applied in order to every input string:
#   1. lowercase,
#   2. NFD-decompose so accents become separate combining marks,
#   3. drop those combining marks,
#   4. delete emoji,
#   5. delete every remaining character that is not a-z / A-Z / whitespace.
# Removed characters are replaced by the empty string, so the text on either
# side of a removed character is joined together.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Lowercase(),
        normalizers.NFD(),
        normalizers.StripAccents(),
        normalizers.Replace(emoji_pattern, ''),
        normalizers.Replace(special_pattern, ''),
    ]
)

In [5]:
# Split the normalised text into words on whitespace only, before the
# WordPiece model breaks each word into sub-word pieces.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [6]:
# Reserved tokens handed to the trainer. The ids printed later in this
# notebook for "[CLS]" and "[SEP]" (2 and 3) match their positions in this
# list, so the order here appears to determine the ids -- keep it stable.
special_tokens = [ "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

In [9]:
# WordPiece trainer settings: cap the vocabulary at 15,000 entries, ignore
# anything seen fewer than 5 times, and reserve the special tokens.
trainer = trainers.WordPieceTrainer(
    vocab_size=15000,
    min_frequency=5,
    special_tokens=special_tokens,
)

In [10]:
# Learn the WordPiece vocabulary from the corpus file on disk, using the
# settings held by `trainer`.
tokenizer.train(["corpus.txt"], trainer=trainer)

In [11]:
# Look up the vocabulary ids assigned to the sequence-boundary tokens; the
# post-processor template needs them.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

2 3


In [12]:
# Wrap every encoding in boundary tokens:
#   single sequence -> [CLS] A [SEP]            (all type id 0)
#   sequence pair   -> [CLS] A [SEP] B [SEP]    (A side type id 0, B side 1)
# The templates are plain string literals: they contain no placeholders, so
# the former `f` prefix did nothing and has been dropped.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)


In [13]:
# Decoder used when turning ids back into text: pieces carrying the "##"
# continuation prefix are glued onto the preceding piece.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [14]:
# Make every encoding exactly 250 tokens long: shorter inputs are padded,
# longer ones truncated.
# The pad id is looked up from the trained vocabulary rather than relying on
# the library default (pad_id=0) happening to coincide with "[PAD]"; the two
# would drift apart if the special-token order were ever changed.
tokenizer.enable_padding(
    length=250,
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
)
tokenizer.enable_truncation(max_length=250)

In [15]:
from transformers import PreTrainedTokenizerFast

# Expose the trained tokenizer through the Hugging Face fast-tokenizer
# interface, declaring which vocabulary entry plays each special role.
wrapped_tokenizer = PreTrainedTokenizerFast(
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    # Alternative to tokenizer_object: tokenizer_file="tokenizer.json" loads
    # a tokenizer previously saved to disk.
    tokenizer_object=tokenizer,
)

In [16]:
# Encode both splits to fixed-width id sequences.
# padding="max_length" pads every sequence to exactly `max_length` tokens.
# The previous padding=True only pads to the longest sequence within each
# call, so the train and test matrices could end up with different widths
# whenever one split happened to contain no comment of 250+ tokens.
X_train_tokens = wrapped_tokenizer(
    list(X_train), padding="max_length", truncation=True, max_length=250
)
X_test_tokens = wrapped_tokenizer(
    list(X_test), padding="max_length", truncation=True, max_length=250
)

In [17]:
# Keep only the token ids of each encoded split, as NumPy arrays.
# NOTE: this rebinds the names from tokenizer output to arrays, so the cell
# cannot be re-run without first re-running the encoding cell.
X_train_tokens, X_test_tokens = (
    np.array(encoded['input_ids'])
    for encoded in (X_train_tokens, X_test_tokens)
)

In [18]:
# Sanity check: display the shape of each token-id array.
X_train_tokens.shape, X_test_tokens.shape

((244283, 250), (61071, 250))

In [20]:
from tensorflow.keras.utils import to_categorical

In [21]:
# Class-name -> integer id. The keys must match the label strings exactly as
# they are spelled in the dataset (including 'feedbak'); do not "correct"
# a key here without also changing the data.
LABEL_TO_ID = {'irrelevant': 0, 'doubt': 1, 'feedbak': 2}


def encode_labels(labels):
    """One-hot encode a Series of label strings.

    Parameters
    ----------
    labels : pandas.Series
        Label strings; every value must be a key of ``LABEL_TO_ID``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(labels), len(LABEL_TO_ID))`` as produced by
        ``to_categorical``.

    Raises
    ------
    ValueError
        If any label is missing from ``LABEL_TO_ID``.
    """
    # .map avoids the dtype-downcasting behaviour of Series.replace, which
    # newer pandas versions deprecate.
    ids = labels.map(LABEL_TO_ID)
    unmapped = ids.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Labels missing from LABEL_TO_ID: {unknown}")
    # num_classes is fixed explicitly so every split gets the same one-hot
    # width even if a split happens not to contain the highest class id.
    return to_categorical(
        ids.to_numpy(dtype='int64'), num_classes=len(LABEL_TO_ID)
    )


y_train_tokens = encode_labels(y_train)
y_test_tokens = encode_labels(y_test)

In [22]:
# Sanity check: display the shape of each one-hot label array.
y_train_tokens.shape, y_test_tokens.shape

((244283, 3), (61071, 3))

In [23]:
# Persist the encoded features and labels as .npy files in the working
# directory. The underscore-prefixed loop names are throwaway.
for _filename, _array in (
    ('X_train.npy', X_train_tokens),
    ('X_test.npy', X_test_tokens),
    ('y_train.npy', y_train_tokens),
    ('y_test.npy', y_test_tokens),
):
    np.save(_filename, _array)