<a href="https://colab.research.google.com/github/jhhan0/learning-nlp/blob/main/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text preprocessing: Using the NLTK, Keras, and scikit-learn APIs to tokenize a corpus of documents


### Tokenizing the text documents

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer data used by sent_tokenize() / word_tokenize().
# Older NLTK releases load 'punkt'; recent releases (3.9+) look up 'punkt_tab'
# instead, so fetch both to keep the notebook runnable on a fresh kernel with
# either version.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list, kept as a set for fast membership checks when
# filtering tokens.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Sample corpus: one string made of several short sentences (written as
# adjacent literals, which Python joins into a single string).
text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! "
    "The Secret He Kept is huge secret. "
    "Huge secret. "
    "His barber kept his word. "
    "a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)
# Split the corpus into a list of sentence strings.
text_to_sent = sent_tokenize(text)
print(text_to_sent)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [6]:
# word -> number of occurrences across the whole corpus
vocab = {}
# one list of kept (cleaned) words per sentence
clean_sentences = []

for sent in text_to_sent:
    # words of this sentence that survive cleaning
    kept_words = []
    for word in word_tokenize(sent):
        # normalise case so 'Secret' and 'secret' count as the same word
        word = word.lower()
        # drop stop words and anything of 2 characters or fewer
        if word in stop_words or len(word) <= 2:
            continue
        kept_words.append(word)
        vocab[word] = vocab.get(word, 0) + 1
    clean_sentences.append(kept_words)
print(clean_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [7]:
# Tokenizer vectorizes a text corpus by mapping each word to an integer index.
# API reference: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer#arguments
tokenizer = Tokenizer()
# fit_on_texts() builds the word vocabulary. Here it is given the already
# tokenized sentences (a list of word lists) rather than raw strings.
tokenizer.fit_on_texts(clean_sentences)

In [8]:
# After fit_on_texts(), word_index maps every word to its integer index.
# Indices start at 1 and more frequent words receive lower indices.
print(tokenizer.word_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [9]:
# word_counts records how many times each word was seen during fit_on_texts(),
# listed in order of first appearance.
print(tokenizer.word_counts)

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [10]:
# texts_to_sequences() replaces every word of each text with its integer index
# from word_index, giving one integer sequence per sentence.
print(tokenizer.texts_to_sequences(clean_sentences))

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


### Integer encoding and padding

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Integer-encode the cleaned sentences. The resulting sequences have different
# lengths, which is why they are padded below.
encoded = tokenizer.texts_to_sequences(clean_sentences)
print(encoded)

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]


In [13]:
# maxlen sets the length every sequence is padded (or truncated) to. If it is
# not provided, sequences are padded to the length of the longest one.
# padding chooses which side receives the zeros: 'pre' (used here) pads at the
# beginning of each sequence; use 'post' to pad at the end instead.
padded = pad_sequences(encoded, maxlen=10, padding='pre')
print(padded)

[[ 0  0  0  0  0  0  0  0  1  5]
 [ 0  0  0  0  0  0  0  1  8  5]
 [ 0  0  0  0  0  0  0  1  3  5]
 [ 0  0  0  0  0  0  0  0  9  2]
 [ 0  0  0  0  0  0  2  4  3  2]
 [ 0  0  0  0  0  0  0  0  3  2]
 [ 0  0  0  0  0  0  0  1  4  6]
 [ 0  0  0  0  0  0  0  1  4  6]
 [ 0  0  0  0  0  0  0  1  4  2]
 [ 0  0  0  7  7  3  2 10  1 11]
 [ 0  0  0  0  0  0  1 12  3 13]]


### One-hot encoding

In [14]:
from tensorflow.keras.utils import to_categorical
# Sample text for the one-hot example.
# Note: this rebinds `text`, which previously held the barber corpus.
text = 'Nike is better than Addias. However, Addias is cheaper than Nike. Nike and Addias are both good.'

In [15]:
# A separate Tokenizer fitted directly on the raw string (wrapped in a list,
# i.e. a corpus of one document). The resulting vocabulary is lowercased and
# free of punctuation, as the printed word_counts / word_index show.
one_hot_tokenizer = Tokenizer()
one_hot_tokenizer.fit_on_texts([text])
print(one_hot_tokenizer.word_counts)
print(one_hot_tokenizer.word_index)

OrderedDict([('nike', 3), ('is', 2), ('better', 1), ('than', 2), ('addias', 3), ('however', 1), ('cheaper', 1), ('and', 1), ('are', 1), ('both', 1), ('good', 1)])
{'nike': 1, 'addias': 2, 'is': 3, 'than': 4, 'better': 5, 'however': 6, 'cheaper': 7, 'and': 8, 'are': 9, 'both': 10, 'good': 11}


In [16]:
# texts_to_sequences() returns one sequence per input text; [0] selects the
# sequence of the single text passed in.
# Note: this rebinds `encoded` from the padding example.
encoded = one_hot_tokenizer.texts_to_sequences([text])[0]
print(encoded)

[1, 3, 5, 4, 2, 6, 2, 3, 7, 4, 1, 1, 8, 2, 9, 10, 11]


In [None]:
# to_categorical() converts each integer into a one-hot row (binary class matrix).
# The rows have max(encoded) + 1 = 12 columns; column 0 is never set because
# word indices start at 1.
# API reference: https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical
one_hot = to_categorical(encoded)
print(one_hot)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## Using scikit-learn for text feature extraction and TF-IDF term weighting

To learn more about scikit-learn's text feature extraction and the linear kernel, see https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction.

### Bag of Words (BoW)

In [None]:
# Using sklearn CountVectorizer class
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
corpus = ['A modern programming language that makes developers happier.']
vector = CountVectorizer(stop_words='english')
# fit_transform() learns the vocabulary and returns, per document, how often each term occurs
print(vector.fit_transform(corpus).toarray())
# vocabulary_ maps each token to its column index in the count matrix.
# Two filters explain the words missing from it: the default token pattern only
# keeps tokens of 2 or more alphanumeric characters (so single-character tokens
# are dropped), and stop_words='english' removes common English words such as 'that'.
print(vector.vocabulary_)

[[1 1 1 1 1 1]]
{'modern': 4, 'programming': 5, 'language': 2, 'makes': 3, 'developers': 0, 'happier': 1}


### TF-IDF (Term Frequency - Inverse Document Frequency)

In [None]:
# Using sklearn TfidfVectorizer class
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF assigns each term a weight (not a percentage) that signifies its importance
# to a document: the weight grows with the term's frequency in that document and
# shrinks the more documents of the corpus contain the term.
# Ref: https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089#:~:text=TF%2DIDF%20stands%20for%20%E2%80%9CTerm,Information%20Retrieval%20and%20Text%20Mining.

corpus = [
  'You know I want your love.',
  'I like you',
  'What should I do?'
]

tfidf = TfidfVectorizer()
# One row per document, one column per vocabulary term (column indices are given by vocabulary_).
print(tfidf.fit_transform(corpus).toarray())
# Single-character tokens such as 'I' do not appear in the vocabulary.
print(tfidf.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
