### 8.2 Text preprocessing- Text regularization

In [3]:
from nltk import sent_tokenize
import nltk
nltk.download('punkt') # download punctuation dataset for reference 

text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes.'

sentences = sent_tokenize(text=text_sample)
print(type(sentences), len(sentences))
print(sentences)
# sentence tokenize is meaningful if it is necessary to find semantic meaning of each sentence.
# if the order of the word is not important, word tokenization has more sense.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zephy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [4]:
from nltk import word_tokenize

sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [5]:
# sentence tokenize and word tokenize
# if all sentences are tokenized to word, semantic meaning is no longer meaningful.
# it might be better to use n-gram which works like a moving window through the sentence
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    setences = sent_tokenize(text)
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    return word_tokens

word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


### Stopwords removal

In [6]:
# Stopwords means the words that are not useful for the analysis
# 'is' 'the', 'a', 'will'...etc
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zephy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
print('Number of stopwords in English:', len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

Number of stopwords in English: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [8]:
# Stopwords filtering
import nltk

stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []
for sentence in word_tokens:
    filtered_words = []
    for word in sentence:
        word = word.lower()
        if word not in stopwords:
            filtered_words.append(word)
    all_tokens.append(filtered_words)

print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


### Stemming and Lemmatization

In [10]:
# Stemming : simple, fast
# Lemmatization : accurate, slow
# Stemmer : Porter, Lancaster, Snowball Stemmer
# Lemmatization: WordNetLemmatizer
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('ammused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

work work work
amus amus ammus
happy happiest
fant fanciest


In [12]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemma = WordNetLemmatizer()
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v'))
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

amuse amuse amuse
happy happy
fancy fancy


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zephy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 8.3 Bag of Words - BOW

#### Sparse Matrix - COO (COOrdinate) type

In [13]:
import numpy as np

dense = np.array([[3,0,1], [0,2,0]])

In [14]:
from scipy import sparse
# non-zero data
data = np.array([3,1,2])
# row/col position
row_pos = np.array([0, 0, 1])
col_pos = np.array([0, 2, 1])
# sparse.coo_matrix
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))

In [18]:
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

* COO problem

In [20]:
from scipy import sparse
dense2 = np.array([[0,0,1,0,0,5],
             [1,4,0,3,2,5],
             [0,6,0,3,0,0],
             [2,0,0,0,0,0],
             [0,0,0,7,0,8],
             [1,0,0,0,0,0]])
# non-zero components
data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1])
# row pos / col pos -> Redundant values: two 0s, five 1s, two 2s....
#-> [0, 2, 7, 9, 10, 12] start position of each and add total number of components -> [0, 2, 7, 9, 10, 12, 13]
row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])
col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])
# COO transform
sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))

# row_pos_ind : start location of each component and number of all components
row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])

# CSR (Compressed Sparse Row)
sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))
print("COO transformed matrix")
print(sparse_coo.toarray())
print("CSR transformed matrix")
print(sparse_csr.toarray())

COO transformed matrix
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
CSR transformed matrix
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


In [21]:
dense3 = np.array([[0,0,1,0,0,5],
             [1,4,0,3,2,5],
             [0,6,0,3,0,0],
             [2,0,0,0,0,0],
             [0,0,0,7,0,8],
             [1,0,0,0,0,0]])

coo = sparse.coo_matrix(dense3)
csr = sparse.csr_matrix(dense3)
