<a href="https://colab.research.google.com/github/harryypham/MyMLPractice/blob/main/torchtext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import spacy

In [13]:
from torchtext.datasets import AG_NEWS

# English spaCy pipeline; only its tokenizer is used below.
spacy_eng = spacy.load("en_core_web_sm")

def eng_tokenize(text):
  """Split ``text`` into a list of token strings using the spaCy tokenizer."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

def getTokens(data_iter):
  """Lazily yield the token list of every text in ``data_iter``.

  ``data_iter`` must produce ``(label, text)`` pairs; the label is ignored.
  """
  yield from (eng_tokenize(text) for _label, text in data_iter)


# Training split of AG_NEWS; iterating it yields (label, text) pairs.
train_iter = AG_NEWS(split="train")

# Vocabulary over the tokenized training text. Tokens seen fewer than twice are
# left out, and the four special tokens take the first indices in the listed order.
source_vocab = build_vocab_from_iterator(
    getTokens(train_iter),
    specials=["<pad>", "<sos>", "<eos>", "<unk>"],
    special_first=True,
    min_freq=2,
)
# Out-of-vocabulary tokens resolve to the index of <unk>.
source_vocab.set_default_index(source_vocab["<unk>"])

def getTransform(vocab):
    """Build the transform that turns a tokenized sentence into model-ready indices.

    The returned pipeline is applied to a list of token strings and:
      1. maps every token to its index in ``vocab``,
      2. prepends the ``<sos>`` index,
      3. appends the ``<eos>`` index.

    Args:
        vocab: torchtext vocabulary used for the token -> index lookup.

    Returns:
        A ``T.Sequential`` transform.
    """
    # Look the marker indices up in the vocabulary rather than hard-coding them, so the
    # pipeline stays correct when the specials are declared in a different order.
    # If a marker is absent, fall back to the historical positions (1 and 2).
    sos_index = vocab["<sos>"] if "<sos>" in vocab else 1
    eos_index = vocab["<eos>"] if "<eos>" in vocab else 2

    text_transform = T.Sequential(
        # Convert the tokens to indices based on the given vocabulary.
        T.VocabTransform(vocab=vocab),
        # Add <sos> at the beginning of each sentence.
        T.AddToken(sos_index, begin=True),
        # Add <eos> at the end of each sentence.
        T.AddToken(eos_index, begin=False),
    )
    return text_transform

In [14]:
# Materialise the training split so a single record can be picked by position.
temp_list = list(train_iter)
# Records are (label, text) pairs; keep only the text of record 798.
some_sentence = temp_list[798][1]
print("Some sentence=", some_sentence, sep="")

# Tokenize the sentence, then run it through the index pipeline.
sentence_tokens = eng_tokenize(some_sentence)
text_transform = getTransform(source_vocab)
transformed_sentence = text_transform(sentence_tokens)
print("Transformed sentence=", transformed_sentence, sep="")

# Map every index back to its token to eyeball the result.
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

Some sentence=Why cyberscofflaws get off easy CNET News.com's Declan McCullagh explains why convicted virus and worm authors are more likely to do Club Fed than hard time.
Transformed sentence=[1, 3714, 3, 242, 114, 1784, 7826, 6382, 25, 42153, 53346, 10077, 1748, 2931, 1547, 12, 2628, 12659, 48, 56, 640, 7, 372, 1874, 1384, 71, 763, 96, 6, 2]
<sos> Why <unk> get off easy CNET News.com 's Declan McCullagh explains why convicted virus and worm authors are more likely to do Club Fed than hard time . <eos> 