# Preprocessing

The goal of this notebook is to demonstrate the preprocessing methods.

In [1]:
import os
import pandas as pd
import sys
import nltk

sys.path.append(os.path.join('..', '..'))

from utilities.dataset import load_dataset, generate_embeddings, Embeddings, Tokenizer, TextEncoder, LowercaseTransformer, compute_phrase_mask, create_wikiphrase_dataset

  from collections import Sequence


In [2]:
# First, generate word embeddings with an embedding size of 5
word_embeddings = generate_embeddings(['hello', 'world'], embedding_size=5)
print(word_embeddings)

              0         1         2         3         4
hello  0.141124  0.032013  0.078299  0.179271  0.149405
world -0.078182  0.076007 -0.012109 -0.008258  0.032848


In [3]:
# Then, augment the embeddings with special markers
# See the documentation for an overview of all available markers
enriched_word_embeddings = Embeddings(word_embeddings)
print('Vocabulary:             ', enriched_word_embeddings.get_vocab())
print('Inverse vocabulary:     ', enriched_word_embeddings.get_inverse_vocab())
print('Lookup token "world":   ', enriched_word_embeddings.lookup("world"))
print('Inverse lookup for "5": ', enriched_word_embeddings.inverse_lookup(5))
print('Get the weights:')
print(enriched_word_embeddings.get_weights())

Vocabulary:              {'__PAD__': 0, '__UNK__': 1, '__START__': 2, '__END__': 3, 'hello': 4, 'world': 5}
Inverse vocabulary:      {0: '__PAD__', 1: '__UNK__', 2: '__START__', 3: '__END__', 4: 'hello', 5: 'world'}
Lookup token "world":    5
Inverse lookup for "5":  world
Get the weights:
[[ 0.          0.          0.          0.          0.        ]
 [ 0.14112419  0.14112419  0.14112419  0.14112419  0.14112419]
 [ 0.03201258  0.03201258  0.03201258  0.03201258  0.03201258]
 [ 0.07829904  0.07829904  0.07829904  0.07829904  0.07829904]
 [ 0.14112419  0.03201258  0.07829904  0.17927146  0.14940464]
 [-0.07818223  0.07600707 -0.01210858 -0.00825751  0.03284788]]


In [4]:
# Now we can create a tokenizer using the Embeddings
word_tokenizer = Tokenizer(enriched_word_embeddings, nltk.word_tokenize, transformers=[LowercaseTransformer()])
# The subtokenizer is the nltk.word_tokenize method, which takes in a text and produces a list of tokens (words):
print('NLTK word tokenize on "hello world":\n  ', nltk.word_tokenize('hello world'))
# The LowercaseTransformer takes a word and produces the lowercase variant of that word:
lowercase_transformer = LowercaseTransformer()
print('LowercaseTransformer on "Hello World":\n  ', lowercase_transformer('Hello World'))
# The Tokenizer takes a text, tokenizes the text, produces normalized tokens using all the transformers and returns all of these!
print('Result of word_tokenizer("Hello World"):')
word_tokenizer("Hello World")

NLTK word tokenize on "hello world":
   ['hello', 'world']
LowercaseTransformer on "Hello World":
   hello world
Result of word_tokenizer("Hello World"):


{'tokens': ['__START__', 'Hello', 'World', '__END__'],
 'normalized_tokens': ['__START__', 'hello', 'world', '__END__'],
 'ids': [2, 4, 5, 3]}

In [5]:
# The same can be done for characters:
char_embeddings = Embeddings(generate_embeddings(list('abcdefghijklmnopqrstuvwxyz'), 7))
char_tokenizer = Tokenizer(char_embeddings, lambda token: list(token), transformers=[LowercaseTransformer()])
char_tokenizer('Hello')

{'tokens': ['__START__', 'H', 'e', 'l', 'l', 'o', '__END__'],
 'normalized_tokens': ['__START__', 'h', 'e', 'l', 'l', 'o', '__END__'],
 'ids': [2, 11, 8, 15, 15, 18, 3]}

In [6]:
# Both the word tokenizer and character-based tokenizer are used for the WikiPhraseDataset
# See how the following code encodes an entity and a document
word_embeddings = Embeddings(generate_embeddings(['hello', 'world'], 10))
word_tokenizer = Tokenizer(word_embeddings, nltk.word_tokenize, transformers=[LowercaseTransformer()])
char_embeddings = Embeddings(generate_embeddings(list('abcdefghijklmnopqrstuvwxyz'), 7))
char_tokenizer = Tokenizer(char_embeddings, lambda token: list(token), transformers=[LowercaseTransformer()])
text_encoder = TextEncoder(word_tokenizer, char_tokenizer)
text_encoder(entity='Python', document='Python is cool!')

{'entity__word_tokens': ['__START__', 'python', '__END__'],
 'entity__word_ids': array([2, 1, 3], dtype=int32),
 'entity__char_tokens': [['__START__', '__END__'],
  ['__START__', 'p', 'y', 't', 'h', 'o', 'n', '__END__'],
  ['__START__', '__END__']],
 'entity__char_ids': [array([2, 3], dtype=int32),
  array([ 2, 19, 28, 23, 11, 18, 17,  3], dtype=int32),
  array([2, 3], dtype=int32)],
 'document__word_tokens': ['__START__',
  'python',
  'is',
  'cool',
  '!',
  '__END__'],
 'document__word_ids': array([2, 1, 1, 1, 1, 3], dtype=int32),
 'document__char_tokens': [['__START__', '__END__'],
  ['__START__', 'p', 'y', 't', 'h', 'o', 'n', '__END__'],
  ['__START__', 'i', 's', '__END__'],
  ['__START__', 'c', 'o', 'o', 'l', '__END__'],
  ['__START__', '!', '__END__'],
  ['__START__', '__END__']],
 'document__char_ids': [array([2, 3], dtype=int32),
  array([ 2, 19, 28, 23, 11, 18, 17,  3], dtype=int32),
  array([ 2, 12, 22,  3], dtype=int32),
  array([ 2,  6, 18, 18, 15,  3], dtype=int32),
  ar

In [7]:
# Now compute a phrase mask, that is all start words of phrases are masked as 2, all non start words of phrases are masked as 1 and all words not belonging to a phrase are marked as 0
compute_phrase_mask('Hello world, this is a test! This is the second sentence. This is another sentence. And this is a test!', ['this is a test', 'another sentence'], word_tokenizer)

array([0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 2,
       1, 1, 1, 0, 0], dtype=int32)

In [8]:
# Now load the phrases and create the WikiPhrase dataset
df_phrases = load_dataset(os.path.join('..', '..', 'data', 'wikiphrase'))
dataset = create_wikiphrase_dataset(df_phrases, text_encoder)

  df_selected = df_selected[df.annotator == annotator]
  df_selected = df_selected[df.entity == entity]


In [14]:
# Notice that the dataset is just a Dataframe
dataset.sample(3)

Unnamed: 0,annotator,entity,entity__char_ids,entity__char_tokens,entity__word_ids,entity__word_tokens,kb,kb__char_ids,kb__char_tokens,kb__word_ids,kb__word_tokens,phrase_mask,salience,text,text__char_ids,text__char_tokens,text__word_ids,text__word_tokens
63,kevin,Afghanistan,"[[2, 3], [2, 4, 9, 10, 11, 4, 17, 12, 22, 23, ...","[[__START__, __END__], [__START__, a, f, g, h,...","[2, 1, 3]","[__START__, afghanistan, __END__]",Afghanistan ( (listen); Pashto/Dari: افغانستان...,"[[2, 3], [2, 4, 9, 10, 11, 4, 17, 12, 22, 23, ...","[[__START__, __END__], [__START__, a, f, g, h,...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, afghanistan, (, (, listen, ), ;, p...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.333333,"A ""critical"" campaign ad, launched by the US R...","[[2, 3], [2, 4, 3], [2, 1, 1, 3], [2, 6, 21, 1...","[[__START__, __END__], [__START__, a, __END__]...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, a, ``, critical, '', campaign, ad,..."
34,kevin,AOL,"[[2, 3], [2, 4, 18, 15, 3], [2, 3]]","[[__START__, __END__], [__START__, a, o, l, __...","[2, 1, 3]","[__START__, aol, __END__]","AOL (stylized as Aol., formerly a company know...","[[2, 3], [2, 4, 18, 15, 3], [2, 1, 3], [2, 22,...","[[__START__, __END__], [__START__, a, o, l, __...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, aol, (, stylized, as, aol., ,, for...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.5,Internet rivals Microsoft and Yahoo have agree...,"[[2, 3], [2, 12, 17, 23, 8, 21, 17, 8, 23, 3],...","[[__START__, __END__], [__START__, i, n, t, e,...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, internet, rivals, microsoft, and, ..."
77,kevin,Atlanta,"[[2, 3], [2, 4, 23, 15, 4, 17, 23, 4, 3], [2, 3]]","[[__START__, __END__], [__START__, a, t, l, a,...","[2, 1, 3]","[__START__, atlanta, __END__]","Atlanta () is the capital of, and the most pop...","[[2, 3], [2, 4, 23, 15, 4, 17, 23, 4, 3], [2, ...","[[__START__, __END__], [__START__, a, t, l, a,...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, atlanta, (, ), is, the, capital, o...","[0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0.666667,A man infected with extensively drug resistant...,"[[2, 3], [2, 4, 3], [2, 16, 4, 17, 3], [2, 12,...","[[__START__, __END__], [__START__, a, __END__]...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[__START__, a, man, infected, with, extensivel..."


In [11]:
# Load an example
example = dataset.sample(1).iloc[0]

# Display the entity word tokens
print('Entity word tokens:')
print(example['entity__word_tokens'])
print()

# Display the knowledge base word tokens
print('Knowledge base word tokens:')
print(example['kb__word_tokens'])
print()

# Display a combination of the phrase mask and the text tokens
print('Text tokens with phrase mask:')
list(zip(example['phrase_mask'], example['text__word_tokens']))

Entity word tokens:
['__START__', 'digital', 'camera', '__END__']

Knowledge base word tokens:
['__START__', 'a', 'digital', 'camera', 'or', 'digicam', 'is', 'a', 'camera', 'that', 'captures', 'photographs', 'in', 'digital', 'memory', '.', 'most', 'cameras', 'produced', 'today', 'are', 'digital', ',', 'and', 'while', 'there', 'are', 'still', 'dedicated', 'digital', 'cameras', ',', 'many', 'more', 'are', 'now', 'incorporated', 'into', 'devices', 'ranging', 'from', 'mobile', 'devices', 'to', 'vehicles', '.', '__END__']

Text tokens with phrase mask:


[(0, '__START__'),
 (0, 'south'),
 (0, 'korean'),
 (0, 'electronics'),
 (0, 'giant'),
 (0, 'samsung'),
 (0, 'has'),
 (0, 'begun'),
 (0, 'mass'),
 (0, 'production'),
 (0, 'of'),
 (0, 'a'),
 (0, 'new'),
 (0, '4'),
 (0, 'gigabit'),
 (0, 'flash'),
 (0, 'memory'),
 (0, '.'),
 (0, 'this'),
 (0, 'is'),
 (0, 'twice'),
 (0, 'the'),
 (0, 'size'),
 (0, 'of'),
 (0, 'almost'),
 (0, 'all'),
 (0, 'the'),
 (0, 'biggest'),
 (0, 'solid'),
 (0, 'state'),
 (0, 'memory'),
 (0, 'devices'),
 (0, 'currently'),
 (0, 'available'),
 (0, '.'),
 (0, 'the'),
 (0, 'firm'),
 (0, 'is'),
 (0, 'using'),
 (0, 'a'),
 (0, 'new'),
 (0, '70-nanometer'),
 (0, 'process'),
 (0, 'which'),
 (0, 'enables'),
 (0, 'it'),
 (0, 'to'),
 (0, 'produce'),
 (0, 'the'),
 (0, 'smallest'),
 (0, 'cell'),
 (0, 'on'),
 (0, 'the'),
 (0, 'market'),
 (0, ','),
 (0, 'enabling'),
 (0, 'a'),
 (0, 'far'),
 (0, 'higher'),
 (0, 'density'),
 (0, 'than'),
 (0, 'competitors'),
 (0, '.'),
 (0, 'the'),
 (0, 'device'),
 (0, 'can'),
 (0, 'write'),
 (0, 'data'),