In [21]:
# the corpus: three short quotes, one string per list entry
text = [
    'All that we are is the result of what we have thought',
    'To be or not to be that is the question',
    'Be yourself everyone else is already taken',
]
text

['All that we are is the result of what we have thought',
 'To be or not to be that is the question',
 'Be yourself everyone else is already taken']

In [22]:
# tokenise the first sentence: cut it at every single whitespace character
import re
re.compile(r"\s").split(text[0])

['All',
 'that',
 'we',
 'are',
 'is',
 'the',
 'result',
 'of',
 'what',
 'we',
 'have',
 'thought']

In [23]:
# gluing the pieces back together with single spaces returns the sentence
" ".join(re.compile(r"\s").split(text[0]))

'All that we are is the result of what we have thought'

In [24]:
# Pool every sentence into one lower-cased string and collect its words.
# Matching runs of non-whitespace (\S+) instead of splitting on each single
# whitespace character (\s) means repeated, leading, or trailing whitespace
# cannot produce empty-string tokens, and an empty corpus yields [] rather
# than [''] -- so no bogus '' entry can end up in the word list.
allwords = re.findall(r'\S+', ' '.join(text).lower())
allwords

['all',
 'that',
 'we',
 'are',
 'is',
 'the',
 'result',
 'of',
 'what',
 'we',
 'have',
 'thought',
 'to',
 'be',
 'or',
 'not',
 'to',
 'be',
 'that',
 'is',
 'the',
 'question',
 'be',
 'yourself',
 'everyone',
 'else',
 'is',
 'already',
 'taken']

# Create a vocabulary (lexicon)

In [25]:
# collapse repeats with a set, then put the distinct words in sorted order
vocab = list(set(allwords))
vocab.sort()
vocab

['all',
 'already',
 'are',
 'be',
 'else',
 'everyone',
 'have',
 'is',
 'not',
 'of',
 'or',
 'question',
 'result',
 'taken',
 'that',
 'the',
 'thought',
 'to',
 'we',
 'what',
 'yourself']

In [26]:
# compare the total word count with the number of distinct words
print(
    f"There are {len(allwords)} words in the text "
    f"and {len(vocab)} words in the vocabulary."
)

There are 29 words in the text and 21 words in the vocabulary.


# Create an encoder and decoder

In [27]:
# encoder: a plain dict mapping each word to its position in vocab
word2idx = {token: position for position, token in enumerate(vocab)}
word2idx

{'all': 0,
 'already': 1,
 'are': 2,
 'be': 3,
 'else': 4,
 'everyone': 5,
 'have': 6,
 'is': 7,
 'not': 8,
 'of': 9,
 'or': 10,
 'question': 11,
 'result': 12,
 'taken': 13,
 'that': 14,
 'the': 15,
 'thought': 16,
 'to': 17,
 'we': 18,
 'what': 19,
 'yourself': 20}

In [28]:
# decoder: the reverse lookup, from position in vocab back to the word
idx2word = dict(enumerate(vocab))
idx2word

{0: 'all',
 1: 'already',
 2: 'are',
 3: 'be',
 4: 'else',
 5: 'everyone',
 6: 'have',
 7: 'is',
 8: 'not',
 9: 'of',
 10: 'or',
 11: 'question',
 12: 'result',
 13: 'taken',
 14: 'that',
 15: 'the',
 16: 'thought',
 17: 'to',
 18: 'we',
 19: 'what',
 20: 'yourself'}

In [29]:
# spot-check one lookup in each direction: word -> index and index -> word
print(
    f'The word "to" has index {word2idx["to"]}',
    f'The index "7" maps to the word "{idx2word[7]}"',
    sep="\n",
)

The word "to" has index 17
The index "7" maps to the word "is"
