# NLP: Tokenization and Vectorization
Author: **[Carl McBride Ellis](https://u-tad.com/claustro/carl-mcbride-ellis/)**

In [1]:
import pandas as pd
# Lift the column limit so the wide document-term tables below are displayed in full.
pd.set_option('display.max_columns', None)

In [2]:
# Sample text for the regex tokenizer and the hand-made word counts below.
# The literal includes the leading newline and the indentation of each line.
document = """
           A very simple tokenizer with REGEX; we simply
           extract the words as they are using REGEX,
           which then become the 'tokens'.
           """

### Tokenization using `re`
Extract the individual words from a document using REGEX

In [3]:
import re

# Lower-case first so that "REGEX" and "regex" end up as the same token.
document = document.lower()


def tokenizer(x):
    """Split a string into word tokens.

    Parameters
    ----------
    x : str
        The text to tokenize.

    Returns
    -------
    list of str
        Every maximal run of regex "word" characters (letters, digits and
        underscore), in order of appearance. Whitespace and punctuation only
        act as separators and are dropped. No case folding is done here.
    """
    # r'\w+' matches exactly what the former r'[^\W]+' did, without the double negation.
    return re.findall(r'\w+', x)


tokens = tokenizer(document)
tokens

['a',
 'very',
 'simple',
 'tokenizer',
 'with',
 'regex',
 'we',
 'simply',
 'extract',
 'the',
 'words',
 'as',
 'they',
 'are',
 'using',
 'regex',
 'which',
 'then',
 'become',
 'the',
 'tokens']

## Vectorization "by hand"

In [4]:
# count how many times we see a given word

def word_counter(words):
    """Count how many times each word occurs.

    Parameters
    ----------
    words : iterable of str
        The tokens to count.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences. Keys are
        inserted in sorted order, so the dict displays alphabetically.
        An empty input gives an empty dict.
    """
    from collections import Counter  # local import keeps this cell self-contained

    counts = Counter(words)
    # Sort only the distinct words (not every token) to fix the key order.
    return {word: counts[word] for word in sorted(counts)}

# Count the tokens produced by the regex tokenizer: one entry per distinct word.
word_count = word_counter(tokens)
word_count

{'a': 1,
 'are': 1,
 'as': 1,
 'become': 1,
 'extract': 1,
 'regex': 2,
 'simple': 1,
 'simply': 1,
 'the': 2,
 'then': 1,
 'they': 1,
 'tokenizer': 1,
 'tokens': 1,
 'using': 1,
 'very': 1,
 'we': 1,
 'which': 1,
 'with': 1,
 'words': 1}

Here is our vector:

In [5]:
# Wrapping the dict in a list gives a single-row frame: one column per word, one row for the document.
pd.DataFrame([word_count])

Unnamed: 0,a,are,as,become,extract,regex,simple,simply,the,then,they,tokenizer,tokens,using,very,we,which,with,words
0,1,1,1,1,1,2,1,1,2,1,1,1,1,1,1,1,1,1,1


## Vectorization using scikit-learn [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [6]:
# Four short documents; this list is the input to every vectorizer example that follows.
corpus = ["Hello World!",
          "That was my Hello World with no REGEX",
          "This now has world + another hello world",
          "Spaceship world"
         ]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and count each term per document.
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(corpus)
# Convert to a plain array so the counts can be shown in a DataFrame.
count_array  = count_matrix.toarray()
# One row per document, one column per vocabulary term.
pd.DataFrame(data = count_array,
             columns = vectorizer.get_feature_names_out())

Unnamed: 0,another,has,hello,my,no,now,regex,spaceship,that,this,was,with,world
0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,0,0,1,1,1,0,1,0,1,0,1,1,1
2,1,1,1,0,0,1,0,0,0,1,0,0,2
3,0,0,0,0,0,0,0,1,0,0,0,0,1


### now with the bigrams as well

In [8]:
# ngram_range=(1, 2) keeps the single words and adds every pair of adjacent words.
vectorizer_2 = CountVectorizer(analyzer='word', ngram_range=(1, 2))

# NOTE: this rebinds count_matrix / count_array from the unigram cell above.
count_matrix = vectorizer_2.fit_transform(corpus)
count_array  = count_matrix.toarray()
# The learned vocabulary: unigrams and bigrams together.
vectorizer_2.get_feature_names_out()

array(['another', 'another hello', 'has', 'has world', 'hello',
       'hello world', 'my', 'my hello', 'no', 'no regex', 'now',
       'now has', 'regex', 'spaceship', 'spaceship world', 'that',
       'that was', 'this', 'this now', 'was', 'was my', 'with', 'with no',
       'world', 'world another', 'world with'], dtype=object)

In [9]:
# Show the unigram + bigram counts: one row per document, one column per n-gram.
pd.DataFrame(data = count_array,
             columns = vectorizer_2.get_feature_names_out())

Unnamed: 0,another,another hello,has,has world,hello,hello world,my,my hello,no,no regex,now,now has,regex,spaceship,spaceship world,that,that was,this,this now,was,was my,with,with no,world,world another,world with
0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,1,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,1,1,0,1
2,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,2,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0


## Vectorization using scikit-learn [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
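This applies [term frequency–inverse document frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) (tf-idf) weighting to the same term counts that `CountVectorizer` produces. With `smooth_idf=False` and `norm=None`, as used below, each entry is $\text{tf}(t, d) \times \left(\ln\frac{n}{\text{df}(t)} + 1\right)$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents that contain the term $t$. For example, "hello" appears in 3 of the 4 documents, so its weight per occurrence is $\ln(4/3) + 1 \approx 1.2877$.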

In [10]:
# NOTE(review): this cell is entirely commented out, so the TF-IDF cells below
# reuse the `corpus` defined earlier in the notebook. The variant kept here is
# NOT the same list: its second document lacks "with no REGEX". Un-commenting
# it would therefore change the tables shown below.
# corpus = ["Hello World!",
#           "That was my Hello World",
#           "This now has world + another hello world",
#           "Spaceship world"
#          ]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# smooth_idf=False and norm=None give the plain weights tf * (ln(n_docs / df) + 1),
# with no smoothing and no row normalisation.
# NOTE: this rebinds `vectorizer`, which previously held the CountVectorizer.
vectorizer = TfidfVectorizer(smooth_idf=False, norm=None)

# NOTE(review): despite the `count_` prefix, these two variables now hold
# tf-idf weights (floats), not raw counts.
count_matrix = vectorizer.fit_transform(corpus)
count_array  = count_matrix.toarray()
# One row per document, one column per vocabulary term.
pd.DataFrame(data = count_array,
             columns = vectorizer.get_feature_names_out())

Unnamed: 0,another,has,hello,my,no,now,regex,spaceship,that,this,was,with,world
0,0.0,0.0,1.287682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.287682,2.386294,2.386294,0.0,2.386294,0.0,2.386294,0.0,2.386294,2.386294,1.0
2,2.386294,2.386294,1.287682,0.0,0.0,2.386294,0.0,0.0,0.0,2.386294,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386294,0.0,0.0,0.0,0.0,1.0


### now with the bigrams as well

In [12]:
# Same un-smoothed, un-normalised tf-idf weighting, now over unigrams and bigrams.
# NOTE: this rebinds `vectorizer` once more.
vectorizer = TfidfVectorizer(smooth_idf=False, norm=None, ngram_range=(1, 2))

# NOTE(review): despite the `count_` prefix, these hold tf-idf weights, not counts.
count_matrix = vectorizer.fit_transform(corpus)
count_array  = count_matrix.toarray()
# One row per document, one column per n-gram.
pd.DataFrame(data = count_array,
             columns = vectorizer.get_feature_names_out())

Unnamed: 0,another,another hello,has,has world,hello,hello world,my,my hello,no,no regex,now,now has,regex,spaceship,spaceship world,that,that was,this,this now,was,was my,with,with no,world,world another,world with
0,0.0,0.0,0.0,0.0,1.287682,1.287682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.287682,1.287682,2.386294,2.386294,2.386294,2.386294,0.0,0.0,2.386294,0.0,0.0,2.386294,2.386294,0.0,0.0,2.386294,2.386294,2.386294,2.386294,1.0,0.0,2.386294
2,2.386294,2.386294,2.386294,2.386294,1.287682,1.287682,0.0,0.0,0.0,0.0,2.386294,2.386294,0.0,0.0,0.0,0.0,0.0,2.386294,2.386294,0.0,0.0,0.0,0.0,2.0,2.386294,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386294,2.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Keras [TextVectorization](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization/) for use in a neural network "embedding" layer
See also [KerasNLP Tokenizers](https://keras.io/api/keras_nlp/tokenizers/)

In [13]:
from keras.layers import TextVectorization

# Turn each document into a fixed-length sequence of integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=100,              # cap on the vocabulary size
    output_mode='int',           # emit integer token indices
    output_sequence_length=50)   # every output row has exactly 50 entries

# Build the vocabulary from the corpus before the layer is called.
vectorize_layer.adapt(corpus)
input_data = corpus
# Result has shape (4, 50); positions past the end of a document are filled with 0.
vectorize_layer(input_data)

<tf.Tensor: shape=(4, 50), dtype=int64, numpy=
array([[ 3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 7,  5, 12,  3,  2,  4, 11,  9,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 6, 10, 13,  2, 14,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 8,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

In [14]:
# A term's position in this list is the integer id used in the tensor above
# (e.g. 'world' -> 2, 'hello' -> 3). Index 0 is '' and index 1 is '[UNK]'.
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 np.str_('world'),
 np.str_('hello'),
 np.str_('with'),
 np.str_('was'),
 np.str_('this'),
 np.str_('that'),
 np.str_('spaceship'),
 np.str_('regex'),
 np.str_('now'),
 np.str_('no'),
 np.str_('my'),
 np.str_('has'),
 np.str_('another')]

# Using [NLTK](https://www.nltk.org/) (the Natural Language Toolkit)

In [15]:
import nltk
# Download the 'punkt_tab' tokenizer data. The trailing ';' keeps the notebook
# from displaying the call's return value.
nltk.download('punkt_tab');

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### tokenization

In [16]:
# Tokenize the second corpus document. No lower-casing is applied here, so the original case is kept.
nltk.word_tokenize(corpus[1])

['That', 'was', 'my', 'Hello', 'World', 'with', 'no', 'REGEX']

## `stopwords`

In [17]:
# NOTE(review): `nltk` was already imported in an earlier cell; repeating it is harmless.
import nltk
from nltk.corpus import stopwords

# Download the stop-word lists. Without a trailing ';' the notebook also
# displays the call's return value (True).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# Print the whole English stop-word list in alphabetical order (all lower-case entries).
print(sorted(stopwords.words('english')))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Remove any stopwords from our document

In [19]:
# Keep NLTK's list under its original name for any later cell that uses it.
stop_words = stopwords.words('english')
# Test membership against a set: constant time per token instead of scanning the list.
stop_word_set = set(stop_words)

# Drop every token that is an English stop word; order and repeats are preserved.
new_tokens = [word for word in tokens if word not in stop_word_set]
new_tokens

['simple',
 'tokenizer',
 'regex',
 'simply',
 'extract',
 'words',
 'using',
 'regex',
 'become',
 'tokens']

NLTK also provides stop-word lists for many other languages:

In [20]:
# List the languages for which a stop-word list is available.
stopwords.fileids()

['albanian',
 'arabic',
 'azerbaijani',
 'basque',
 'belarusian',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'tamil',
 'turkish',
 'uzbek']

In [21]:
# Print the Spanish stop-word list in sorted order.
print(sorted(stopwords.words('spanish')))

['a', 'al', 'algo', 'algunas', 'algunos', 'ante', 'antes', 'como', 'con', 'contra', 'cual', 'cuando', 'de', 'del', 'desde', 'donde', 'durante', 'e', 'el', 'ella', 'ellas', 'ellos', 'en', 'entre', 'era', 'erais', 'eran', 'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba', 'estabais', 'estaban', 'estabas', 'estad', 'estada', 'estadas', 'estado', 'estados', 'estamos', 'estando', 'estar', 'estaremos', 'estará', 'estarán', 'estarás', 'estaré', 'estaréis', 'estaría', 'estaríais', 'estaríamos', 'estarían', 'estarías', 'estas', 'este', 'estemos', 'esto', 'estos', 'estoy', 'estuve', 'estuviera', 'estuvierais', 'estuvieran', 'estuvieras', 'estuvieron', 'estuviese', 'estuvieseis', 'estuviesen', 'estuvieses', 'estuvimos', 'estuviste', 'estuvisteis', 'estuviéramos', 'estuviésemos', 'estuvo', 'está', 'estábamos', 'estáis', 'están', 'estás', 'esté', 'estéis', 'estén', 'estés', 'fue', 'fuera', 'fuerais', 'fueran', 'fueras', 'fueron', 'fuese', 'fueseis', 'fuesen', 'fueses', 'fu

In [22]:
# Print the Tamil stop-word list in sorted order.
print(sorted(stopwords.words('tamil')))

['அங்கு', 'அங்கே', 'அடுத்த', 'அதனால்', 'அதன்', 'அதற்கு', 'அதிக', 'அதில்', 'அது', 'அதே', 'அதை', 'அந்த', 'அந்தக்', 'அந்தப்', 'அன்று', 'அல்லது', 'அவன்', 'அவரது', 'அவர்', 'அவர்கள்', 'அவள்', 'அவை', 'ஆகிய', 'ஆகியோர்', 'ஆகும்', 'இங்கு', 'இங்கே', 'இடத்தில்', 'இடம்', 'இதனால்', 'இதனை', 'இதன்', 'இதற்கு', 'இதில்', 'இது', 'இதை', 'இந்த', 'இந்தக்', 'இந்தத்', 'இந்தப்', 'இன்னும்', 'இப்போது', 'இரு', 'இருக்கும்', 'இருந்த', 'இருந்தது', 'இருந்து', 'இவர்', 'இவை', 'உன்', 'உள்ள', 'உள்ளது', 'உள்ளன', 'எந்த', 'என', 'எனக்', 'எனக்கு', 'எனப்படும்', 'எனவும்', 'எனவே', 'எனினும்', 'எனும்', 'என்', 'என்ன', 'என்னும்', 'என்பது', 'என்பதை', 'என்ற', 'என்று', 'என்றும்', 'எல்லாம்', 'ஏன்', 'ஒரு', 'ஒரே', 'ஓர்', 'கொண்ட', 'கொண்டு', 'கொள்ள', 'சற்று', 'சிறு', 'சில', 'சேர்ந்த', 'தனது', 'தன்', 'தவிர', 'தான்', 'நான்', 'நாம்', 'நீ', 'பற்றி', 'பற்றிய', 'பல', 'பலரும்', 'பல்வேறு', 'பின்', 'பின்னர்', 'பிற', 'பிறகு', 'பெரும்', 'பேர்', 'போது', 'போன்ற', 'போல', 'போல்', 'மட்டுமே', 'மட்டும்', 'மற்ற', 'மற்றும்', 'மிக', 'மிகவும்', 'மீது', 'முதல்', '

In [23]:
# Print the Hebrew stop-word list in sorted order.
# NOTE(review): the output shows repeated entries, so the raw list is not de-duplicated.
print(sorted(stopwords.words('hebrew')))

['אבל', 'או', 'אולי', 'אותה', 'אותה', 'אותו', 'אותו', 'אותו', 'אותי', 'אותך', 'אותם', 'אותן', 'אותנו', 'אז', 'אחר', 'אחר', 'אחרות', 'אחרי', 'אחרי', 'אחרי', 'אחרים', 'אחרת', 'אי', 'איזה', 'איך', 'אין', 'אין', 'איפה', 'איתה', 'איתו', 'איתי', 'איתך', 'איתכם', 'איתכן', 'איתם', 'איתן', 'איתנו', 'אך', 'אך', 'אל', 'אל', 'אלה', 'אלה', 'אלו', 'אלו', 'אם', 'אם', 'אנחנו', 'אני', 'אס', 'אף', 'אצל', 'אשר', 'אשר', 'את', 'את', 'אתה', 'אתכם', 'אתכן', 'אתם', 'אתן', 'באיזו מידה', 'באמצע', 'באמצעות', 'בגלל', 'בין', 'בלי', 'בלי', 'במידה', 'במקום שבו', 'ברם', 'בשביל', 'בשעה ש', 'בתוך', 'גם', 'דרך', 'הוא', 'היא', 'היה', 'היכן', 'היתה', 'היתי', 'הם', 'הן', 'הנה', 'הסיבה שבגללה', 'הרי', 'ואילו', 'ואת', 'זאת', 'זה', 'זה', 'זות', 'יהיה', 'יוכל', 'יוכלו', 'יותר', 'יכול', 'יכולה', 'יכולות', 'יכולים', 'יכל', 'יכלה', 'יכלו', 'יש', 'כאן', 'כאשר', 'כולם', 'כולן', 'כזה', 'כי', 'כיצד', 'כך', 'ככה', 'כל', 'כלל', 'כמו', 'כמו', 'כן', 'כן', 'כפי', 'כש', 'לא', 'לאו', 'לאיזו תכלית', 'לאן', 'לבין', 'לה', 'להיות', 'להם', 'להן'

In [24]:
# Print the Nepali stop-word list in sorted order.
print(sorted(stopwords.words('nepali')))

['अक्सर', 'अगाडी', 'अझै', 'अनुसार', 'अन्तर्गत', 'अन्य', 'अन्यत्र', 'अन्यथा', 'अब', 'अरु', 'अरुलाई', 'अर्को', 'अर्थात', 'अर्थात्', 'अलग', 'आए', 'आजको', 'आत्म', 'आदि', 'आफू', 'आफूलाई', 'आफ्नै', 'आफ्नो', 'आयो', 'उदाहरण', 'उनको', 'उनले', 'उप', 'उहालाई', 'एउटै', 'एक', 'एकदम', 'ओठ', 'औं', 'कतै', 'कम से कम', 'कसरी', 'कसै', 'कसैले', 'कहाँबाट', 'कहिलेकाहीं', 'का', 'का', 'कि', 'किन', 'किनभने', 'कुनै', 'कुरा', 'कृपया', 'के', 'केही', 'को', 'कोही', 'क्रमशः', 'गए', 'गयौ', 'गरि', 'गरी', 'गरेका', 'गरेको', 'गरेर', 'गरौं', 'गर्छ', 'गर्छु', 'गर्दै', 'गर्न', 'गर्नु', 'गर्नुपर्छ', 'गर्ने', 'गैर', 'चार', 'चाले', 'चाहनुहुन्छ', 'चाहन्छु', 'चाहिए', 'छ', 'छन्', 'छु', 'छू', 'छैन', 'छौं', 'जताततै', 'जब', 'जबकि', 'जसको', 'जसबाट', 'जसमा', 'जसलाई', 'जसले', 'जस्तै', 'जस्तो', 'जस्तोसुकै', 'जहाँ', 'जान', 'जाहिर', 'जुन', 'जे', 'जो', 'ठीक', 'त', 'तत्काल', 'तथा', 'तदनुसार', 'तपाई', 'तपाईको', 'तर', 'तल', 'तापनी', 'तिनिहरुलाई', 'तिनी', 'तिनीहरुको', 'तिनीहरू', 'तिमी', 'तिर', 'ती', 'तीन', 'तुरुन्तै', 'तेस्कारण', 'तेस्रो', 'त्