# Natural Language Processing Demo

BitTiger DS501


In [3]:
from collections import Counter
import numpy as np

## Compile Documents

In [4]:
# Four short example documents that make up the demo corpus.
doc1, doc2, doc3, doc4 = (
    'Wise people think they are foolish',
    'Foolish foolish people think they are wise wise',
    'I am definitely wise so this irritates me',
    'Trump is for sure like definitely foolish',
)

## Create Corpus

In [5]:
# The corpus is the ordered list of raw documents; the per-document matrices
# built later follow this same order.
documents = [doc1, doc2, doc3, doc4]

## Tokenize and Lowercase

In [6]:
from nltk.tokenize import word_tokenize

In [13]:
import nltk
# Fetch the NLTK data packages this notebook uses: the 'punkt' tokenizer data
# and the 'stopwords' corpus. Already-present packages are reported as
# up-to-date (see the output below).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/wen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/wen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Lower-case each document before tokenizing so that 'Foolish' and 'foolish'
# become the same token.
tokenized = [word_tokenize(text.lower()) for text in documents]

In [11]:
tokenized

[['wise', 'people', 'think', 'they', 'are', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'they', 'are', 'wise', 'wise'],
 ['i', 'am', 'definitely', 'wise', 'so', 'this', 'irritates', 'me'],
 ['trump', 'is', 'for', 'sure', 'like', 'definitely', 'foolish']]

## Remove Stop Words

In [14]:
from nltk.corpus import stopwords

# Stored as a set so the `word not in stop` test in the filtering step is a
# hash lookup rather than a list scan.
stop = set(stopwords.words('english'))

In [15]:
# Keep only the tokens that are not English stop words, document by document.
docs = [
    [token for token in tokens if token not in stop]
    for tokens in tokenized
]

In [16]:
docs

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Stemming and Lemmatization

In [18]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# The WordNet data is required by WordNetLemmatizer.
nltk.download('wordnet')
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

# Normalize every stop-word-free document twice: once with the Porter stemmer
# and once with the WordNet lemmatizer, so the two results can be compared.
docs_stem = [list(map(porter.stem, tokens)) for tokens in docs]
docs_lemma = [list(map(wordnet.lemmatize, tokens)) for tokens in docs]

[nltk_data] Downloading package wordnet to /Users/wen/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [19]:
# Run the same irregular plural through both normalizers: per the output
# below, the stemmer leaves 'mice' unchanged while the lemmatizer returns
# 'mouse'.
print(porter.stem('mice'))
print(wordnet.lemmatize('mice'))

mice
mouse


In [20]:
docs_stem

[['wise', 'peopl', 'think', 'foolish'],
 ['foolish', 'foolish', 'peopl', 'think', 'wise', 'wise'],
 ['definit', 'wise', 'irrit'],
 ['trump', 'sure', 'like', 'definit', 'foolish']]

In [21]:
docs_lemma

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Vocabulary for our Corpus

In [None]:
# Flatten the lemmatized documents into one token list (duplicates are kept).
vocabulary = [token for tokens in docs_lemma for token in tokens]

In [None]:
vocabulary

In [None]:
# De-duplicate, then sort so every feature word has a fixed position.
vocabulary = sorted(set(vocabulary))

In [None]:
print('Vocabulary (features):', vocabulary)

## Bag of Words

In [None]:
from collections import Counter

In [None]:
def bow_vectorize(doc, vocabulary):
    """Return the bag-of-words count vector of ``doc`` over ``vocabulary``.

    Args:
        doc: Iterable of tokens making up one document.
        vocabulary: Sequence of feature words; position ``i`` of the result
            holds the number of times ``vocabulary[i]`` occurs in ``doc``.

    Returns:
        A 1-D float ``numpy`` array with ``len(vocabulary)`` entries. Tokens
        of ``doc`` that are not in ``vocabulary`` are ignored.
    """
    counts = Counter(doc)
    # A Counter yields 0 for absent words, so no membership test is needed.
    return np.array([counts[term] for term in vocabulary], dtype=float)

In [None]:
# One count vector per lemmatized document, in corpus order.
bow_matrix = [bow_vectorize(tokens, vocabulary) for tokens in docs_lemma]


In [None]:
print('features:',vocabulary)
# Show each lemmatized document next to its count vector.
for tokens, counts in zip(docs_lemma, bow_matrix):
    print('"%s":'% tokens, '\n', counts, '\n')

print('feature matrix:')
print(bow_matrix)

### Bag of Words with CountVectorizer

In [None]:
def lemmatize(doc):
    """Tokenize a raw document and lemmatize each token.

    Args:
        doc: Raw document text.

    Returns:
        List holding, for every token of the lower-cased text, the result of
        the module-level ``wordnet`` lemmatizer.
    """
    tokens = word_tokenize(doc.lower())
    return list(map(wordnet.lemmatize, tokens))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# `vocabulary` ties the feature columns to the word list built above, and
# `tokenizer=lemmatize` makes the vectorizer split and lemmatize raw text with
# the function defined in this notebook.
count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary,
                                   tokenizer=lemmatize)

# Vectorize only the first document so it can be compared with bow_vectorize.
feature_matrix = count_vectorizer.fit_transform([doc1])

In [None]:
feature_matrix.toarray()

In [None]:
# Side-by-side check of the sklearn vectorizer and the hand-written
# bow_vectorize on the first document.
print('Vectorize:',doc1)
print('Lemmatized:',docs_lemma[0])
print('Features:', vocabulary)
print('\n')
# feature_matrix is printed exactly as returned by fit_transform (it is not
# converted with toarray() here).
print('sklearn result',feature_matrix)
print('our result',bow_vectorize(docs_lemma[0], vocabulary))
print('\n')
print('feature matrix')
# Re-run the vectorizer on the whole corpus and show the result densely.
print(count_vectorizer.fit_transform(documents).todense())

## Term Frequency (Tf)

In [None]:
def tf_vectorize(doc, vocabulary):
    """Return the term-frequency vector of ``doc`` over ``vocabulary``.

    Each entry is the bag-of-words count of the corresponding vocabulary word
    divided by the total number of tokens in ``doc``.

    Args:
        doc: Sequence of tokens making up one document.
        vocabulary: Sequence of feature words defining the vector positions.

    Returns:
        A 1-D float ``numpy`` array with ``len(vocabulary)`` entries. An empty
        ``doc`` yields an all-zero vector.
    """
    bow_vector = bow_vectorize(doc, vocabulary)
    if len(doc) == 0:
        # No tokens (e.g. a document made only of stop words): dividing by
        # len(doc) would compute 0/0 and fill the vector with NaN, so return
        # the all-zero count vector instead.
        return bow_vector
    # Element-wise division of the float count vector by the document length.
    return bow_vector / len(doc)

In [None]:
# One term-frequency vector per lemmatized document, in corpus order.
tf_matrix = [tf_vectorize(tokens, vocabulary) for tokens in docs_lemma]

In [None]:
print('features:', vocabulary)

# Show each lemmatized document next to its term-frequency vector.
for tokens, frequencies in zip(docs_lemma, tf_matrix):
    print('"%s":'%tokens, '\n', frequencies, '\n')

## Some Tf-Idf 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf weights over the same fixed vocabulary as the count-based features.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary)
# Densify with toarray() rather than todense(): todense() returns np.matrix,
# which recent scikit-learn releases reject in the pairwise metric functions
# (euclidean_distances / cosine_similarity) applied to these rows later on.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).toarray()

In [None]:
tfidf_matrix

In [None]:
print('features:',vocabulary)

# Show each lemmatized document next to its tf-idf row.
for tokens, weights in zip(docs_lemma, tfidf_matrix):
    print('"%s":'%tokens, '\n', weights, '\n')

## Euclidean Distance Comparison

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
bow_matrix[0]

In [None]:
bow_matrix[1]

In [None]:
# Euclidean distance between the raw count vectors of doc1 and doc2.
# reshape(1, -1) presents each 1-D vector as a single-row 2-D array.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

In [None]:
# Same comparison on the term-frequency vectors, i.e. counts divided by the
# document length.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

In [None]:
# Same comparison on the tf-idf rows produced by TfidfVectorizer.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

## Cosine Similarity Comparison

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between the raw count vectors of doc1 and doc2.
# reshape(1, -1) presents each 1-D vector as a single-row 2-D array.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

In [None]:
# Same comparison on the term-frequency vectors.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

In [None]:
# Same comparison on the tf-idf rows produced by TfidfVectorizer.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

# Search Engine Query Example

In [None]:
# Free-text query that the corpus documents are scored against below.
query = 'The foolish Trump'

In [None]:
# Project the query into the fitted tf-idf space (transform only, no refit).
# toarray() yields a plain ndarray; todense() would return np.matrix, which
# recent scikit-learn releases reject in cosine_similarity.
query_vectorized = tfidf_vectorizer.transform([query]).toarray()
print("Query:", query)
print("Vectorized query:", query_vectorized)

In [None]:
# Score every document against the query by the cosine similarity of their
# tf-idf vectors.
for doc, tfidf_row in zip(documents, tfidf_matrix):
    print(doc, cosine_similarity(query_vectorized.reshape(1, -1), tfidf_row.reshape(1, -1)))

## A Final Cosine Similarity Thingy

In [None]:
# Compare the first document with each of the remaining ones, using both the
# term-frequency and the tf-idf representations.
for other in range(1, len(documents)):
    print('"%s" compared with "%s"'%(documents[0], documents[other]))
    print('TF cosine similarity:', cosine_similarity(tf_matrix[0].reshape(1, -1),
                                                     tf_matrix[other].reshape(1, -1)))
    print('TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0].reshape(1, -1),
                                                         tfidf_matrix[other].reshape(1, -1)))
    