# Setup

In [4]:
import pandas as pd
from sortedcontainers import SortedSet

# Bag of Words (BoW)

In [5]:
# Toy corpus: four short review sentences used by the bag-of-words examples.
doc1, doc2, doc3, doc4 = (
    'This pasta is very tasty and affordable.',
    'This pasta is not tasty and is affordable.',
    'This pasta is delicious and cheap.',
    'Pasta is tasty and pasta tastes good.',
)

In [6]:
# Regex matching any single character that is NOT an ASCII letter or digit.
# tokenize() replaces every match with a space before splitting on whitespace.
tokenizer_re = r"[^a-zA-Z0-9]"

In [7]:
import re
def tokenize(doc: str, pattern=None) -> list:
    """Split a document into lowercase tokens.

    The text is lowercased, every character matched by ``pattern`` is replaced
    with a space, and the result is split on whitespace.

    Args:
        doc: Raw text of one document.
        pattern: Optional regex (string) of characters to treat as separators.
            When omitted, the notebook-level ``tokenizer_re`` is used, which
            keeps existing ``tokenize(doc)`` calls working unchanged.

    Returns:
        List of lowercase token strings; empty when ``doc`` contains no
        token characters.
    """
    # Resolve the default lazily so the function can be defined before (or
    # independently of) the module-level pattern.
    if pattern is None:
        pattern = tokenizer_re
    return re.sub(pattern, " ", doc.lower()).split()

In [8]:
# Tokenize each document of the corpus, keeping one token list per document.
l_doc1, l_doc2, l_doc3, l_doc4 = map(tokenize, (doc1, doc2, doc3, doc4))

In [9]:
# Inspect the token list produced for the first document.
l_doc1

['this', 'pasta', 'is', 'very', 'tasty', 'and', 'affordable']

In [10]:
def calculate_bow(wordset, l_doc):
    """Count how often each vocabulary word occurs in a tokenized document.

    Args:
        wordset: Iterable of vocabulary words. Its iteration order determines
            the key order of the returned dict.
        l_doc: List of tokens belonging to one document.

    Returns:
        dict mapping every word in ``wordset`` to its number of occurrences in
        ``l_doc`` (0 when the word does not occur). Tokens that are not in
        ``wordset`` are ignored, so every document yields exactly the same
        keys.
    """
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        # Single pass over the tokens instead of l_doc.count() per token, and
        # skip out-of-vocabulary tokens rather than adding stray keys.
        if word in tf_diz:
            tf_diz[word] += 1
    return tf_diz

In [11]:
# Build the vocabulary: the sorted set of distinct tokens across all documents.
wordset = SortedSet()
for doc_tokens in (l_doc1, l_doc2, l_doc3, l_doc4):
    wordset.update(doc_tokens)
# Trailing expression so the cell still displays the resulting vocabulary.
wordset

SortedSet(['affordable', 'and', 'cheap', 'delicious', 'good', 'is', 'not', 'pasta', 'tastes', 'tasty', 'this', 'very'])

In [12]:
# One bag-of-words dict per document, all keyed by the shared vocabulary.
bow1, bow2, bow3, bow4 = (
    calculate_bow(wordset, doc_tokens)
    for doc_tokens in (l_doc1, l_doc2, l_doc3, l_doc4)
)
# Stack the per-document counts into a table: one row per document.
bow_rows = [bow1, bow2, bow3, bow4]
df_bow = pd.DataFrame(bow_rows)

In [13]:
# Preview the hand-built bag-of-words table (rows: documents, columns: vocabulary words).
df_bow.head()

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Reuse the hand-built vocabulary so the columns match the manual BoW table.
vectorizer = CountVectorizer(vocabulary=wordset)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, so convert it to a list to keep the printed form unchanged.
print(list(vectorizer.get_feature_names_out()))

['affordable', 'and', 'cheap', 'delicious', 'good', 'is', 'not', 'pasta', 'tastes', 'tasty', 'this', 'very']


In [15]:
# Count tokens for all four documents with the current vectorizer.
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Default settings: the vocabulary is learned from the documents themselves.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,and,cheap,delicious,good,is,not,pasta,tastes,tasty,this,very
0,1,1,0,0,0,1,0,1,0,1,1,1
1,1,1,0,0,0,2,1,1,0,1,1,0
2,0,1,1,1,0,1,0,1,0,0,1,0
3,0,1,0,0,1,1,0,2,1,1,0,0


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Same as before, but drop scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([doc1,doc2,doc3,doc4])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,cheap,delicious,good,pasta,tastes,tasty
0,1,0,0,0,1,0,1
1,1,0,0,0,1,0,1
2,0,1,1,0,1,0,0
3,0,0,0,1,2,1,1


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop-word removal plus unigrams and bigrams (ngram_range=(1,2)).
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
# NOTE(review): doc4 is not included here, unlike the earlier cells; confirm this is intentional.
X = vectorizer.fit_transform([doc1,doc2,doc3])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

Unnamed: 0,affordable,cheap,delicious,delicious cheap,pasta,pasta delicious,pasta tasty,tasty,tasty affordable
0,1,0,0,0,1,0,1,1,1
1,1,0,0,0,1,0,1,1,1
2,0,1,1,1,1,1,0,0,0


# TF-IDF (Term Frequency–Inverse Document Frequency)

In [19]:
# Toy corpus for the TF-IDF example: three short movie-review sentences.
tfidf1, tfidf2, tfidf3 = (
    "This movie is very scary and long",
    "This movie is not scary and is slow",
    "This movie is spooky and good",
)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights over unigrams, with English stop words removed.
vectorizer_tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
X = vectorizer_tfidf.fit_transform([tfidf1, tfidf2, tfidf3])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_tfidf_sklearn = pd.DataFrame(X.toarray(), columns=vectorizer_tfidf.get_feature_names_out())
df_tfidf_sklearn

Unnamed: 0,good,long,movie,scary,slow,spooky
0,0.0,0.720333,0.425441,0.547832,0.0,0.0
1,0.0,0.0,0.425441,0.547832,0.720333,0.0
2,0.652491,0.0,0.385372,0.0,0.0,0.652491
