# Tokenization Testing

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf

In [None]:
# Candidate checkpoints (Hugging Face model ids):
# 'bert-base-uncased' (BERT)
# 'bert-base-multilingual-cased' (mBERT)
# 'xlm-roberta-base' (XLM-RoBERTa) or "distilroberta-base" (DistilRoBERTa)
# "google-bert/bert-base-cased" (BERT base, cased)
#   NOTE(review): this id was labelled "mobileBert", but it names the cased BERT
#   base checkpoint; MobileBERT is published as "google/mobilebert-uncased".
#   Confirm which one was intended.

# Checkpoint used for both the model and the tokenizer in the cells that follow.
MODEL_NAME = 'xlm-roberta-base'

In [None]:
# A single sample sentence (a plain str, not a list of strings). No later cell
# in the visible part of this notebook reads this variable.
texts = "Hello my Dying World im tired"

In [None]:
# Model initialization
# Load the pretrained checkpoint named by MODEL_NAME through the
# sequence-classification auto class, configured for two labels.
# The commented-out keyword arguments are dropout overrides that are currently disabled.
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    # hidden_dropout_prob=0.3,
    # attention_probs_dropout_prob=0.15
)

In [None]:
# Load the tokenizer for the same checkpoint as the model (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function
def tokenize_data(texts, tokenizer, max_length=None):
    """Tokenize texts into fixed-length, padded TensorFlow tensors.

    Args:
        texts: The text(s) to encode, normally a list of strings. Because the
            tokenizer is invoked through its ``__call__`` interface, a single
            string is accepted as well.
        tokenizer: A Hugging Face tokenizer (or any callable accepting the
            same keyword arguments).
        max_length: Number of tokens every sequence is padded/truncated to.
            When omitted, the notebook-level ``MAX_LENGTH`` is used if it has
            been defined; otherwise 150, the value this notebook assigns to
            ``MAX_LENGTH``.

    Returns:
        Whatever the tokenizer returns for these arguments (for Hugging Face
        tokenizers, an encoding holding TensorFlow tensors).
    """
    if max_length is None:
        # The original body read the global MAX_LENGTH directly, which raised
        # NameError when this function was called before the cell defining it
        # had run. Look it up defensively and fall back to the same value.
        max_length = globals().get('MAX_LENGTH', 150)

    # Calling the tokenizer directly is the current API; batch_encode_plus is
    # the legacy entry point and rejects a bare string.
    return tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

## Loading Dataset

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Seed passed as random_state to both train_test_split calls, making the splits repeatable.
randnum = 45  # alternative value left by the author: 42

In [None]:
# CSV file handed to load_data(); the path is relative, so it resolves against
# the kernel's current working directory.
file_path = 'dataset/finaldataset_6k_shuffled_v2.csv'

In [None]:
def load_data(file_path):
    """Read the labelled-text CSV and return a cleaned two-column frame.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of a CSV that
            contains at least the columns ``text`` and ``label``.

    Returns:
        A DataFrame with only ``text`` and ``label``, without rows that have a
        missing value in either column, and with ``label`` cast to ``int``.
        The original row index is kept.
    """
    raw_frame = pd.read_csv(file_path)
    # Keep just the two columns of interest and drop incomplete rows.
    labelled = raw_frame.loc[:, ['text', 'label']].dropna()
    # Labels parse as floats when the column had gaps; cast back to int.
    return labelled.assign(label=labelled['label'].astype(int))

In [None]:
# Hold out 10% of the rows as the test set, then take 20% of the remainder as
# the validation set. Both splits use the same seed (randnum).
df = load_data(file_path)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=randnum)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=randnum)


In [None]:
# Display the validation and test frames. As the cell's last expression is a
# tuple, both frames are shown through the tuple's plain-text repr.
val_df, test_df

In [None]:
# Tokenize the text column of each split, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split_df['text'].tolist(), tokenizer)
    for split_df in (train_df, val_df, test_df)
)

### Testing Tokenizer Values

In [1]:
# Candidate checkpoints (Hugging Face model ids):
# 'bert-base-uncased' (BERT)
# 'bert-base-multilingual-cased' (mBERT)
# 'xlm-roberta-base' (XLM-RoBERTa) or "distilroberta-base" (DistilRoBERTa)
# "google-bert/bert-base-cased" (BERT base, cased)
#   NOTE(review): this id was labelled "mobileBert", but it names the cased BERT
#   base checkpoint; MobileBERT is published as "google/mobilebert-uncased".
#   Confirm which one was intended.

# Checkpoint whose tokenizer is inspected in this section.
MODEL_NAME = 'xlm-roberta-base'

In [2]:
# Sample message fed to the tokenizer in this section; it mixes ordinary words
# with digits, punctuation and a URL-like token.
text = "Magtaya nang may saya! Libreng P288 at 100% deposit bonus! Sumali na sa: clickhere.cx/pNTVzioGpzxS2BV at magsimula ng pagwawagi!"

In [32]:
# Upper bound, in tokens, passed as max_length to the tokenizer calls in this notebook.
MAX_LENGTH = 150

In [33]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Only the tokenizer is loaded in this section; the classifier load below is
# intentionally left commented out.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = TFAutoModelForSequenceClassification.from_pretrained(
#     MODEL_NAME,
#     num_labels=2,
#     # hidden_dropout_prob=0.3,
#     # attention_probs_dropout_prob=0.15
# )

In [34]:
# prepare input
# Encode the single sample string as TensorFlow tensors. No padding is
# requested, so the sequence is as long as the tokens produced, and truncation
# caps it at MAX_LENGTH.
encoded_input = tokenizer(
    text, 
    max_length=MAX_LENGTH,
    truncation=True,
    return_tensors='tf'
)

# forward pass
# output = model(**encoded_input)

# print(encoded_input)

In [35]:
# input_ids is a 2-D tensor; its second dimension is the length of the encoded
# sequence. That count covers every id in the tensor, so presumably also any
# special tokens the tokenizer added.
num_tokens = encoded_input["input_ids"].shape[1]
print(f"Number of tokens: {num_tokens}")

Number of tokens: 48


In [36]:
# Round-trip check: decode the ids of the first (only) sequence back to text,
# dropping special tokens, and print it next to the original string.
first_sequence_ids = encoded_input["input_ids"].numpy()[0]
decoded_text = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

# sep="\n" puts each argument on its own line; the "\n" argument reproduces
# the two blank lines that follow the encoding dump.
print("--- tokenized text object value:", encoded_input, "\n", sep="\n")
print("--- decoded text from tokenized data:", decoded_text, sep="\n")
print("--- real text:", text, sep="\n")

--- tokenized text object value:
{'input_ids': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[     0,   5695,    102,    395,   8043,   1543,   1216,     38,
         88161,    449,    436, 103064,     99,   3555,  40370,   9396,
            38,  74663,    150,     24,     57,     12,  18158,  35593,
             5,    238,    425,     64,    254,    839,   5343,   6162,
           724,    254,    169,    425,    294,    304,  77394,     99,
          1697, 137003,    234,   2070, 105761,    735,     38,      2]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]], dtype=int32)>}


--- decoded text from tokenized data:
Magtaya nang may saya! Libreng P288 at 100% deposit bonus! Sumali na sa: clickhere.cx/pNTVzioGpzxS2BV at magsimula ng pagwawagi!
--- real text:
Magtaya nang may 