# Setup

In [53]:
import pandas as pd
from sortedcontainers import SortedSet

In [54]:
# Toy corpus: four short pasta reviews used by the bag-of-words examples.
doc1, doc2, doc3, doc4 = (
    'This pasta is very tasty and affordable.',
    'This pasta is not tasty and is affordable.',
    'This pasta is delicious and cheap.',
    'Pasta is tasty and pasta tastes good.',
)

## Tokenization

In [55]:
# Regular expression matching any single character that is NOT an ASCII letter
# (a-z, A-Z) or a digit (0-9). tokenize() replaces every such character with a
# space, so punctuation acts as a word separator. The r prefix makes this a raw
# string literal, the usual convention for regex patterns.
tokenizer_re = r"[^a-zA-Z0-9]"

In [56]:
# Split the sentence into a list of its words (any non-alphanumeric character, not only spaces, separates words)

# This line imports Python's built-in re module, which provides support for regular expressions.
import re

def tokenize(doc: str) -> list:
    """Split a document into lowercase alphanumeric tokens.

    The text is lowercased, every character matched by ``tokenizer_re``
    (anything other than an ASCII letter or digit) is replaced with a space,
    and the result is split on whitespace.

    Args:
        doc: The raw document text.

    Returns:
        The tokens in order of appearance; an empty list if ``doc`` contains
        no alphanumeric characters.
    """
    return re.sub(tokenizer_re, " ", doc.lower()).split()

In [57]:
# Tokenize every document; l_docN holds the token list of docN.
l_doc1, l_doc2, l_doc3, l_doc4 = (
    tokenize(text) for text in (doc1, doc2, doc3, doc4)
)

In [58]:
# Show the tokens of the first document (the notebook displays the value of a cell's last expression)
l_doc1

['this', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable']

# BOW Option 1

## Feature Extraction

In [59]:
# --- Build the vocabulary: the unique words across all documents ---
# A SortedSet keeps its elements unique and in sorted order, so merging the
# token lists of all documents yields a de-duplicated, ordered vocabulary.
wordset = SortedSet()

# update() accepts several iterables at once; duplicates are discarded.
wordset.update(l_doc1, l_doc2, l_doc3, l_doc4)

SortedSet(['affordable', 'and', 'cheap', 'delicious', 'good', 'is', 'not', 'pasta', 'tastes', 'tasty', 'this', 'very'])

In [60]:
# --- Bag of Words Calculation ---
def calculate_bow(wordset, l_doc):
    """Count how often each word occurs in a tokenized document.

    Args:
        wordset: Iterable of vocabulary words; every one becomes a key of the
            result, with a count of 0 if it is absent from ``l_doc``.
        l_doc: The document as a sequence of tokens.

    Returns:
        A dict mapping each word to its number of occurrences in ``l_doc``.
        Keys follow the iteration order of ``wordset``; tokens that are not in
        ``wordset`` are appended after them in order of first appearance.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    tf_diz = dict.fromkeys(wordset, 0)
    # Count all tokens in a single pass instead of calling l_doc.count() once
    # per token, which was quadratic in the document length.
    tf_diz.update(Counter(l_doc))
    return tf_diz

In [61]:
# --- Calculating Bag of Words for Documents ---
# Each call returns a dictionary (bow1 ... bow4) whose keys are the words of wordset and whose values are their frequencies in the respective document.
bow1 = calculate_bow(wordset, l_doc1)
bow2 = calculate_bow(wordset, l_doc2)
bow3 = calculate_bow(wordset, l_doc3)
bow4 = calculate_bow(wordset, l_doc4)

In [62]:
# --- Creating a DataFrame from BoW Data ---
# One row per document, one column per word of the bag-of-words dictionaries.
df_bow = pd.DataFrame(data=[bow1, bow2, bow3, bow4])
df_bow.head(n=5)

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


# BOW Option 2 (sklearn)

In [63]:
# --- Building a CountVectorizer from the wordset vocabulary ---
# CountVectorizer is a popular tool in text analysis used to convert a collection of text documents into a matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer restricted to the hand-built vocabulary: the words in wordset become the feature columns.
vectorizer = CountVectorizer(vocabulary=wordset)
# The feature names can be listed before fitting because the vocabulary was supplied explicitly.
print(vectorizer.get_feature_names_out())

['affordable' 'and' 'cheap' 'delicious' 'good' 'is' 'not' 'pasta' 'tastes'
 'tasty' 'this' 'very']


In [71]:
# fit_transform is a method that performs two steps: fit and transform.
# 1. Fit: learn the vocabulary from the documents (when the vectorizer was built with an explicit vocabulary=, that vocabulary is used instead).
# 2. Transform: build the document-term matrix. Each row corresponds to one of the documents; each cell holds the number of times the column's word appears in that document.
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])

# Creating a DataFrame from the Matrix (More user-friendly)
# X is converted to a dense array with toarray(); the columns are labelled with the vectorizer's feature names.
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,cheap,delicious,good,pasta,tastes,tasty
0,1,0,0,0,1,0,1
1,1,0,0,0,1,0,1
2,0,1,1,0,1,0,0
3,0,0,0,1,2,1,1


In [72]:
from sklearn.feature_extraction.text import CountVectorizer
# Default CountVectorizer: no vocabulary is supplied, so it is learned from the documents during fit_transform.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])
# Dense table of token counts: one row per document, one column per learned word.
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [74]:
from sklearn.feature_extraction.text import CountVectorizer
# stop_words='english' makes the vectorizer drop scikit-learn's built-in English stop words before counting.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])
# Dense table of token counts: one row per document, one column per remaining word.
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,cheap,delicious,good,pasta,tastes,tasty
0,1,0,0,0,1,0,1
1,1,0,0,0,1,0,1
2,0,1,1,0,1,0,0
3,0,0,0,1,2,1,1


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1,2) extracts both single words (unigrams) and pairs of adjacent words (bigrams), formed after English stop words are removed.
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
# NOTE(review): only doc1-doc3 are vectorized here, unlike the earlier cells which also include doc4 — confirm this is intentional.
X = vectorizer.fit_transform([doc1,doc2,doc3])
# Dense table of n-gram counts: one row per document, one column per unigram/bigram.
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,cheap,delicious,delicious cheap,pasta,pasta delicious,pasta tasty,tasty,tasty affordable
0,1,0,0,0,1,0,1,1,1
1,1,0,0,0,1,0,1,1,1
2,0,1,1,1,1,1,0,0,0


# TFIDF

In [19]:
# Toy corpus for the TF-IDF example: three short movie reviews.
tfidf1, tfidf2, tfidf3 = (
    "This movie is very scary and long",
    "This movie is not scary and is slow",
    "This movie is spooky and good",
)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features (unigrams only, English stop words removed) for the three movie reviews.
vectorizer_tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
X = vectorizer_tfidf.fit_transform([tfidf1, tfidf2, tfidf3])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and matches the CountVectorizer cells in this notebook.
df_tfidf_sklearn = pd.DataFrame(X.toarray(), columns=vectorizer_tfidf.get_feature_names_out())
df_tfidf_sklearn

Unnamed: 0,good,long,movie,scary,slow,spooky
0,0.0,0.720333,0.425441,0.547832,0.0,0.0
1,0.0,0.0,0.425441,0.547832,0.720333,0.0
2,0.652491,0.0,0.385372,0.0,0.0,0.652491
