## Importing Libraries

In [16]:
import re
import string
from typing import List

from unidecode import unidecode

import datasets
from datasets import load_dataset

import tensorflow as tf

## Loading Data

In [4]:
raw_train_dataset, test_dataset = load_dataset('maritaca-ai/imdb_pt', split=['train', 'test'])

train_validation_dataset = raw_train_dataset.train_test_split(0.2)

Found cached dataset imdb_pt (/home/codespace/.cache/huggingface/datasets/maritaca-ai___imdb_pt/plain_text/1.0.0/ffeeea85dfdf69a15638f37cdae931de451276b0fbc70c92f7cad159b96abb05)
100%|██████████| 2/2 [00:00<00:00,  3.12it/s]


In [5]:
train_dataset = train_validation_dataset['train']
validation_dataset = train_validation_dataset['test']

## Processing

In [45]:
VOCAB_SIZE = 5000
MAX_TOKENS = 256

class DataPipeline:
    def __init__(self) -> None:
        self.vectorizer_layer = tf.keras.layers.TextVectorization(
            max_tokens=VOCAB_SIZE,
            standardize=self.compose_corpus_tensor,
            split='whitespace',
            ngrams=None,
            output_mode='int',
            output_sequence_length=MAX_TOKENS,
            pad_to_max_tokens=True,
            encoding='utf-8',
        )

    def fit_transform(self, dataset: datasets.arrow_dataset.Dataset):
        ds = dataset.map(self.decode_text)
        corpus_tensor = self.compose_corpus_tensor(ds['text'])
        self.vectorizer_layer.adapt(corpus_tensor)
        return self.vectorizer_layer(corpus_tensor), ds['label']

    def transform(self, dataset: datasets.arrow_dataset.Dataset):
        ds = dataset.map(self.decode_text)
        corpus_tensor = self.compose_corpus_tensor(ds['text'])
        return self.vectorizer_layer(corpus_tensor), ds['label']

    @staticmethod
    def decode_text(dataset_row):
        dataset_row['text'] = unidecode(dataset_row['text'])
        return dataset_row

    @staticmethod
    def standardize_text(input_data):
        lowercase = tf.strings.lower(input_data)
        standardized = tf.strings.regex_replace(
            lowercase, 
            '[%s]' % re.escape(string.punctuation),
            ''
        )
        return standardized

    @staticmethod
    def compose_corpus_tensor(corpus):
        corpus_tensor = tf.expand_dims(corpus, -1)
        return corpus_tensor

In [42]:
data_pipeline = DataPipeline() 

In [43]:
data_pipeline.fit_transform(test_dataset)

Loading cached processed dataset at /home/codespace/.cache/huggingface/datasets/maritaca-ai___imdb_pt/plain_text/1.0.0/ffeeea85dfdf69a15638f37cdae931de451276b0fbc70c92f7cad159b96abb05/cache-95769e2e34def51b.arrow


(<tf.Tensor: shape=(5000, 1, 256), dtype=int64, numpy=
 array([[[ 163,  153,  228, ...,    0,    0,    0]],
 
        [[   1,    1, 2496, ...,    0,    0,    0]],
 
        [[2080,   12,   13, ...,    0,    0,    0]],
 
        ...,
 
        [[   1,  341,    1, ...,    0,    0,    0]],
 
        [[  38, 2361,  313, ...,   34,  161,    1]],
 
        [[   1,    6, 3035, ...,    0,    0,    0]]])>,
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [44]:
_39[0][0]

<tf.Tensor: shape=(1, 256), dtype=int64, numpy=
array([[ 163,  153,  228,   84,  196,   40,    1,    1,    1,    1,  336,
           1,    1,    1,   10,    1,  119, 1020,    1,    2,   69,    1,
           1,    1,  284,  288,  294,    7,   96,  264,    3,  492,    1,
           2,  342,    3,   98,  417,    1,    2,  742,    1,    1,  597,
           1,   19, 1523,   27,    2,    1,   19,   76,    7,  219, 4016,
          14,   55, 2341,   36,  127,  273,    1,   90,   62,    2,    1,
          42,   15,   24,   75, 2977,    6,    8, 2846,    1,    1,    1,
          20,    4,   62,    1,   41,   49,  709,  821,   11,    1,    5,
         373, 1310,    2,    5,  236,    2,   11, 2399,    7,  259,   14,
        1529,   29,    1,    5,    4,   16, 2398,  629,   39,  776,    1,
        1669,  558,   17,  256, 1848,  139, 2810,   20,    7,  102,    2,
          77,    1,  137,  893,   20,    1,   38,   58,  599,   23,    1,
         147,    1,  127, 3126,  191,   62, 2944,   52,    1,   

In [62]:
def tokenize(text: str) -> List[str]:
    """Clean, tokenize and lemmatize Portuguese text.
    
    Strips special characters, extra spaces 
    and transform to lowercase. 
    Tokenize and lemmatize the tokens

    Args:
      textstring:
        plain text string.

    Returns:
      List of lemmatized tokens of the comment.
    """

    # Transform to lower case
    text = text.lower()

    # Remove line breaks
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    
    # Strip extra white spaces
    text = text.strip()
    text = re.sub(' +', ' ', text)

    doc = nlp(text)

    tokens = []
    for token in doc:
      token = token.lemma_
      token = unidecode(token)
      if token.isalpha():
        tokens.append(token)

    return tokens 

In [8]:
class DataPipeline:
    def __init__(self) -> None:
        self.word_dictionary = None
        self.document_tokens = None

    def fit(self, train_data: list) -> None:
        pass

    def transform(self) -> None:
        pass

    def fit_transform(self) -> None:
        pass

In [45]:
inverse_vocab = {index: token for token, index in vectorizer.vocabulary_.items()}

In [90]:
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=5000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=256,
    pad_to_max_tokens=256,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',
)


In [92]:
vectorizer.adapt(tokenize(train_data[0]))

In [57]:
import string

In [143]:
def tokenize(text: str) -> List[str]:
    """Clean, tokenize and lemmatize Portuguese text.
    
    Strips special characters, extra spaces 
    and transform to lowercase. 
    Tokenize and lemmatize the tokens

    Args:
      textstring:
        plain text string.

    Returns:
      List of lemmatized tokens of the comment.
    """

    # Transform to lower case
    text = text.lower()

    # Remove line breaks
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', ' ', text)
    
    # Strip extra white spaces
    text = text.strip()
    text = re.sub(' +', ' ', text)

    doc = nlp(text)

    tokens = []
    for token in doc:
      token = token.lemma_
      token = unidecode(token)
      if token.isalpha():
        tokens.append(token)

    return tokens 

In [135]:
def standardize_text(input_data):
  lowercase = tf.strings.lower(input_data)
  standardized = tf.strings.regex_replace(lowercase, '[%s]' % re.escape(string.punctuation),'')
  
  return standardized

def compose_corpus_tensor(corpus):
    corpus_tensor = tf.expand_dims(corpus, -1)
    return corpus_tensor

In [136]:
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=5000,
    standardize=standardize_text,
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=256,
    pad_to_max_tokens=256,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',
)

In [125]:
decoded_string = unidecode('TensorFlow é uma biblioteca de código aberto para aprendizado de máquina aplicável a uma ampla variedade de tarefas.')

In [123]:
my_tensor = compose_tensor('TensorFlow é uma biblioteca de código aberto para aprendizado de máquina aplicável a uma ampla variedade de tarefas.')

In [144]:
tokenize(decoded_string)

['tensorflow',
 'e',
 'um',
 'biblioteca',
 'de',
 'codigo',
 'abrir',
 'para',
 'aprendizado',
 'de',
 'maquina',
 'aplicavel',
 'a',
 'um',
 'ampla',
 'variedade',
 'de',
 'tarefa']

In [148]:
compose_tensor(train_data[:2])

<tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'Filme de orienta\xc3\xa7\xc3\xa3o familiar limpa. Eu ri, chorei... adorei. Eu estava preocupado que n\xc3\xa3o seria capaz de ver Steve Carrell como qualquer coisa, exceto Michael, do escrit\xc3\xb3rio. Rapaz, eu estava errado. Ele deve ganhar um Oscar por sua performance. Definitivamente vou comprar isso em DVD quando for lan\xc3\xa7ado. Meu marido gostou e ele n\xc3\xa3o gosta de filmes desse "tipo". Eu vi isso com outros 2 casais na faixa de 30 anos e todos concordamos que era o melhor filme que vimos h\xc3\xa1 muito tempo e certamente o mais limpo. Apenas 1 palavra de xadrez!N\xc3\xa3o tenho certeza por que era PG13. Eu recomendo este filme para quem gosta de com\xc3\xa9dia, drama, romance e muito mais!'],
       [b'Quando vi esse filme pela primeira vez, eu estava com meu pai. Ele me incentivou a assistir a esse filme porque era um dos seus favoritos. Depois de assistir ao filme, ele instantaneamente se tornou um dos meus fa

In [None]:
vectorizer(unidecode('TensorFlow é uma biblioteca de código aberto para aprendizado de máquina aplicável a uma ampla variedade de tarefas.'))

In [142]:
vectorizer(unidecode('TensorFlow é uma biblioteca de código aberto para aprendizado de máquina aplicável a uma ampla variedade de tarefas.'))

<tf.Tensor: shape=(256,), dtype=int64, numpy=
array([ 1,  7,  1,  1,  2,  1,  1, 35,  1,  2,  1,  1,  1,  1,  1,  1,  2,
        1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,