In [1]:
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import numpy as np



stop_words = set(stopwords.words('english'))

In [2]:
# Collection of documents (corpus)

review_1 =  "The Glider is a great soccer ball run 100 @"
review_2 = "What a bad soccer ball ... " 
review_3 = "I am happy with The glider run $50"

In [3]:
docs = [review_1, review_2, review_3]
docs

['The Glider is a great soccer ball run 100 @',
 'What a bad soccer ball ... ',
 'I am happy with The glider run $50']

In [4]:

all_collection = " ".join(docs)
all_collection


'The Glider is a great soccer ball run 100 @ What a bad soccer ball ...  I am happy with The glider run $50'

In [5]:
def pre_process(doc):
    # lower case
    lower_text = doc.lower()
    
    print(lower_text)
    
    # tokenize
    tokens = word_tokenize(lower_text)
    
    print(tokens)
    # remove numbers
    words = [word for word in tokens if word.isalpha()]
    
    print(words)
    # remove stop words
    filtered_words = []
 
    for w in words:
        if w not in stop_words:
            filtered_words.append(w)
    
    print(filtered_words)
    
    # remove duplicate words
    final_words = set(filtered_words)

    final_words = sorted(list(final_words))

    return final_words


In [7]:
unique_terms = pre_process(all_collection)

print(len(unique_terms))
print(unique_terms)

the glider is a great soccer ball run 100 @ what a bad soccer ball ...  i am happy with the glider run $50
['the', 'glider', 'is', 'a', 'great', 'soccer', 'ball', 'run', '100', '@', 'what', 'a', 'bad', 'soccer', 'ball', '...', 'i', 'am', 'happy', 'with', 'the', 'glider', 'run', '$', '50']
['the', 'glider', 'is', 'a', 'great', 'soccer', 'ball', 'run', 'what', 'a', 'bad', 'soccer', 'ball', 'i', 'am', 'happy', 'with', 'the', 'glider', 'run']
['glider', 'great', 'soccer', 'ball', 'run', 'bad', 'soccer', 'ball', 'happy', 'glider', 'run']
7
['bad', 'ball', 'glider', 'great', 'happy', 'run', 'soccer']


In [8]:
# Construct a term-document matrix
# here as a Python dictionary for ease of interpretability

doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []
    
    for doc in docs:
        if term in doc:
            doc_term_matrix[term].append(1)
        else: doc_term_matrix[term].append(0)

doc_term_matrix

{'bad': [0, 1, 0],
 'ball': [1, 1, 0],
 'glider': [0, 0, 1],
 'great': [1, 0, 0],
 'happy': [0, 0, 1],
 'run': [1, 0, 1],
 'soccer': [1, 1, 0]}

In [9]:
# The query to find all documents containing "glider" AND "run"
# Is just a bitwise AND:

docs_array = np.array(docs, dtype='object')

v1 = np.array(doc_term_matrix['glider'])    
v2 = np.array(doc_term_matrix['run'])

print(v1)
print(v2)
print('-------')
v3 = v1 & v2
print(v3)

[0 0 1]
[1 0 1]
-------
[0 0 1]


In [10]:
# The query to find all documents containing "glider" OR "run"
# Is just a bitwise AND:
v1 = np.array(doc_term_matrix['soccer'])    
v2 = np.array(doc_term_matrix['run'])

print(v1)
print(v2)
print('-------')
v3 = v1 | v2
print(v3)

[1 1 0]
[1 0 1]
-------
[1 1 1]
