# Preprocessing

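This notebook demonstrates the preprocessing utilities step by step: generating embeddings, enriching them with special marker tokens, tokenizing text at the word and character level, and encoding an entity/document pair with the `TextEncoder`.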

In [2]:
import os
import pandas as pd
import sys
import nltk

sys.path.append(os.path.join('..', '..'))

from utilities.dataset import load_dataset
from utilities.dataset import generate_embeddings, Embeddings, Tokenizer, TextEncoder
from utilities.dataset import LowercaseTransformer

In [3]:
# Step 1: build an embedding table for a two-word vocabulary.
# Each word gets a vector of length 5 (the `embedding_size`).
word_embeddings = generate_embeddings(
    ['hello', 'world'],
    embedding_size=5,
)
print(word_embeddings)

              0         1         2         3         4
hello  0.141124  0.032013  0.078299  0.179271  0.149405
world -0.078182  0.076007 -0.012109 -0.008258  0.032848


In [4]:
# Step 2: wrap the raw embeddings in an Embeddings object, which adds the special
# markers (see the documentation for an overview of all available markers).
enriched_word_embeddings = Embeddings(word_embeddings)

# (label, value) pairs to print; the labels are padded so the values line up.
vocabulary_overview = [
    ('Vocabulary:             ', enriched_word_embeddings.get_vocab()),
    ('Inverse vocabulary:     ', enriched_word_embeddings.get_inverse_vocab()),
    ('Lookup token "world":   ', enriched_word_embeddings.lookup('world')),
    ('Inverse lookup for "5": ', enriched_word_embeddings.inverse_lookup(5)),
]
for label, value in vocabulary_overview:
    print(label, value)

# Finally show the weight matrix of the enriched embeddings.
print('Get the weights:')
print(enriched_word_embeddings.get_weights())

Vocabulary:              {'__PAD__': 0, '__UNK__': 1, '__START__': 2, '__END__': 3, 'hello': 4, 'world': 5}
Inverse vocabulary:      {0: '__PAD__', 1: '__UNK__', 2: '__START__', 3: '__END__', 4: 'hello', 5: 'world'}
Lookup token "world":    5
Inverse lookup for "5":  world
Get the weights:
[[ 0.          0.          0.          0.          0.        ]
 [ 0.14112419  0.14112419  0.14112419  0.14112419  0.14112419]
 [ 0.03201258  0.03201258  0.03201258  0.03201258  0.03201258]
 [ 0.07829904  0.07829904  0.07829904  0.07829904  0.07829904]
 [ 0.14112419  0.03201258  0.07829904  0.17927146  0.14940464]
 [-0.07818223  0.07600707 -0.01210858 -0.00825751  0.03284788]]


In [5]:
# Step 3: word-level tokenization. The two building blocks are shown on their own
# first, then combined in a Tokenizer.

# Building block 1 -- the subtokenizer: nltk.word_tokenize takes a text and
# produces a list of tokens (words).
print(
    'NLTK word tokenize on "hello world":\n  ',
    nltk.word_tokenize('hello world'),
)

# Building block 2 -- a transformer: LowercaseTransformer produces the lowercase
# variant of the string it is given.
lowercase_transformer = LowercaseTransformer()
print(
    'LowercaseTransformer on "Hello World":\n  ',
    lowercase_transformer('Hello World'),
)

# The Tokenizer ties the Embeddings, the subtokenizer and the transformers together.
# Calling it on a text returns the tokens, the normalized tokens (after all
# transformers have been applied) and the ids of those tokens.
word_tokenizer = Tokenizer(
    enriched_word_embeddings,
    nltk.word_tokenize,
    transformers=[LowercaseTransformer()],
)
print('Result of word_tokenizer("Hello World"):')
word_tokenizer('Hello World')

NLTK word tokenize on "hello world":
   ['hello', 'world']
LowercaseTransformer on "Hello World":
   hello world
Result of word_tokenizer("Hello World"):


{'tokens': ['__START__', 'Hello', 'World', '__END__'],
 'normalized_tokens': ['__START__', 'hello', 'world', '__END__'],
 'ids': [2, 4, 5, 3]}

In [6]:
# Step 4: the character-level counterpart of the word tokenizer.
# The vocabulary is the 26 lowercase letters; the second argument (7) is
# presumably the embedding size -- confirm against generate_embeddings.
alphabet = list('abcdefghijklmnopqrstuvwxyz')
char_embeddings = Embeddings(generate_embeddings(alphabet, 7))

# Here the subtokenizer simply splits a string into its individual characters.
char_tokenizer = Tokenizer(
    char_embeddings,
    lambda token: list(token),
    transformers=[LowercaseTransformer()],
)
char_tokenizer('Hello')

{'tokens': ['__START__', 'H', 'e', 'l', 'l', 'o', '__END__'],
 'normalized_tokens': ['__START__', 'h', 'e', 'l', 'l', 'o', '__END__'],
 'ids': [2, 11, 8, 15, 15, 18, 3]}

In [7]:
# Step 5: the WikiPhraseDataset uses the word tokenizer and the character-based
# tokenizer together. The TextEncoder below combines both and encodes an entity
# and a document in one call.

# Word level.
# NOTE: this rebinds `word_embeddings` (previously the raw generate_embeddings
# result) to an Embeddings wrapper, and replaces the earlier `word_tokenizer`.
word_embeddings = Embeddings(
    generate_embeddings(['hello', 'world'], 10)
)
word_tokenizer = Tokenizer(
    word_embeddings,
    nltk.word_tokenize,
    transformers=[LowercaseTransformer()],
)

# Character level (same construction as in the character tokenizer cell above).
alphabet = list('abcdefghijklmnopqrstuvwxyz')
char_embeddings = Embeddings(generate_embeddings(alphabet, 7))
char_tokenizer = Tokenizer(
    char_embeddings,
    lambda token: list(token),
    transformers=[LowercaseTransformer()],
)

# Combine both tokenizers and encode an example entity/document pair.
text_encoder = TextEncoder(word_tokenizer, char_tokenizer)
text_encoder(entity='Python', document='Python is cool!')

{'entity__word_tokens': ['__START__', 'python', '__END__'],
 'entity__word_ids': array([2, 1, 3], dtype=int32),
 'entity__char_tokens': [['__START__', '__END__'],
  ['__START__', 'p', 'y', 't', 'h', 'o', 'n', '__END__'],
  ['__START__', '__END__']],
 'entity__char_ids': [array([2, 3], dtype=int32),
  array([ 2, 19, 28, 23, 11, 18, 17,  3], dtype=int32),
  array([2, 3], dtype=int32)],
 'document__word_tokens': ['__START__',
  'python',
  'is',
  'cool',
  '!',
  '__END__'],
 'document__word_ids': array([2, 1, 1, 1, 1, 3], dtype=int32),
 'document__char_tokens': [['__START__', '__END__'],
  ['__START__', 'p', 'y', 't', 'h', 'o', 'n', '__END__'],
  ['__START__', 'i', 's', '__END__'],
  ['__START__', 'c', 'o', 'o', 'l', '__END__'],
  ['__START__', '!', '__END__'],
  ['__START__', '__END__']],
 'document__char_ids': [array([2, 3], dtype=int32),
  array([ 2, 19, 28, 23, 11, 18, 17,  3], dtype=int32),
  array([ 2, 12, 22,  3], dtype=int32),
  array([ 2,  6, 18, 18, 15,  3], dtype=int32),
  ar