# One Hot Encoding

In [7]:
import numpy as np
samples = ['The cat sat on the mat.', 'The dog ate my homework.'] #Initial data: one entry per sample 
                                                                  #in this example, a sample is a sentence,
                                                                  #but it could be an entire document
token_index = {} #Builds an index of all tokens in the data
for sample in samples:
    for word in sample.split(): #Tokenizes the samples via the split method.You also strip punctuation & special characters from the samples
        if word not in token_index:
            token_index[word] = len(token_index) + 1 #Assigns a unique index to each unique word. 
                                                     #Note that you don’t attribute index 0 to anything.
            max_length = 10 #Vectorizes the samples. You’ll only consider the first max_length words in each sample.
            results = np.zeros(shape=(len(samples),max_length,max(token_index.values()) + 1)) #This where you store the results.
            for i, sample in enumerate(samples):
                for j, word in list(enumerate(sample.split()))[:max_length]:
                    index = token_index.get(word)
                    results[i, j, index] = 1.

In [8]:
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [9]:
import string
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable #All printable ASCII characters
token_index = dict(zip(range(1, len(characters) + 1), characters))
max_length = 50
results1 = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results1[i, j, index] = 1.

In [10]:
results1

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000) #Creates a tokenizer, configuredto only take into account the 1,000 most common words

tokenizer.fit_on_texts(samples) #Builds the word index

sequences = tokenizer.texts_to_sequences(samples) #Turns strings into lists of integer indices

one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary') #You could also directly get the one-hot binary representations. Vectorization modes other than one-hot encoding are supported by this tokenizer

word_index = tokenizer.word_index #How you can recover the word index that was computed

print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


In [12]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality = 1000 #Stores the words as vectors of size 1,000. If you have close to 1,000 words (or more), 
                      #you’ll see many hash collisions, which will decrease the accuracy of this encoding method
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.

In [13]:
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])