In [1]:
import re
import pandas as pd

In [2]:
# Source data: Amazon Fine Food Reviews (Kaggle).
# dataset link: https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/
# The path is relative, so it is resolved against the notebook's working directory.
amzn_food_reviews_path = "../data/reviews.csv"

# Read the Amazon food reviews CSV into a pandas DataFrame.
df = pd.read_csv(amzn_food_reviews_path)

In [3]:
print("Shape of dataset:", df.shape)

Shape of dataset: (568454, 10)


In [4]:
df.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Bag-of-words [Count vectorizer]

In [5]:
from tqdm.notebook import tqdm

In [6]:
class BagOfWords:
    """Minimal bag-of-words (count) vectorizer over space-separated tokens.

    Each sentence is optionally lower-cased and split on single spaces; only
    tokens for which ``str.isalnum()`` is true are kept. A word with attached
    punctuation (e.g. ``"food."``) is therefore dropped, not cleaned.

    Parameters
    ----------
    binary_bow : bool, default False
        When True, ``transform`` emits 1/0 presence flags instead of counts.
    lower : bool, default True
        When True, sentences are lower-cased before tokenizing.

    Attributes
    ----------
    X : the (optionally lower-cased) sentences given to the last ``fit``.
    corpus_tokens : an empty ``set`` before fitting; after ``fit`` a sorted
        ``list`` of the vocabulary, which is also the column order of
        ``transform``.
    unique_tokens_per_sentence : dict mapping ``str(sentence_index)`` to the
        list of kept tokens of that training sentence (kept for inspection).
    """

    def __init__(self, binary_bow = False, lower = True):
        self.lower = lower
        self.binary_bow = binary_bow

        # input data: updated in the fit() function
        self.X = None
        self.corpus_tokens = set()
        self.unique_tokens_per_sentence = dict()

    def fit(self, X):
        """Learn the vocabulary from an iterable of sentence strings.

        Any state left over from an earlier ``fit`` is discarded, so the
        object can be refitted safely. Returns None.
        """
        # update the input variables
        self.X = X

        # preprocessing the input
        if self.lower:
            self.X = self.do_lower(self.X)

        # Build into fresh containers: after a previous fit, corpus_tokens is
        # a list (no .add) and the per-sentence cache would be stale.
        vocabulary = set()
        tokens_per_sentence = dict()

        # computing all the unique tokens throughout the corpus
        for s_index, sentence in enumerate(tqdm(self.X, desc="Building Vocabulary")):
            local_tokens = self._tokenize(sentence)
            vocabulary.update(local_tokens)
            tokens_per_sentence[str(s_index)] = local_tokens

        self.unique_tokens_per_sentence = tokens_per_sentence

        # sorting the unique tokens gives a deterministic column order
        self.corpus_tokens = sorted(vocabulary)

    def transform(self, X):
        """Vectorize the sentences in ``X`` against the fitted vocabulary.

        Tokens that were not seen during ``fit`` are ignored.

        Returns
        -------
        pandas.DataFrame
            One row per sentence in ``X`` and one column per vocabulary token
            (in ``corpus_tokens`` order), holding counts, or 0/1 flags when
            ``binary_bow`` is True.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        # corpus_tokens is only a set in the un-fitted initial state
        if isinstance(self.corpus_tokens, set):
            raise RuntimeError("BagOfWords.transform() called before fit()")

        # Preprocess the documents that were passed in, not the training data.
        sentences = self.do_lower(X) if self.lower else X

        bows = []
        for sentence in tqdm(sentences, desc = "Transforming"):
            # count every kept token once instead of rescanning the sentence
            # for each vocabulary entry
            counts = dict()
            for token in self._tokenize(sentence):
                counts[token] = counts.get(token, 0) + 1

            # corpus tokens are already sorted
            vector = [counts.get(unique_token, 0) for unique_token in self.corpus_tokens]

            # for binary-BOW just insert 1
            if self.binary_bow:
                vector = [min(1, count) for count in vector]

            bows.append(vector)

        return pd.DataFrame(bows, columns = self.corpus_tokens)

    @staticmethod
    def _tokenize(sentence):
        """Split on single spaces and keep only alphanumeric tokens."""
        return [token for token in sentence.split(" ") if token.isalnum()]

    def do_lower(self, X):
        """Return a new list with every string in ``X`` lower-cased."""
        return list(map(str.lower, X))

In [7]:
# Pull the review bodies out of the frame as a plain Python list.
text_corpus = df["Text"].tolist()
print("Number of reviews", len(text_corpus))

Number of reviews 568454


In [8]:
# Count-based bag-of-words, fitted on the first 100 reviews only.
bow = BagOfWords(binary_bow=False, lower=True)
bow.fit(text_corpus[:100])

Building Vocabulary:   0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
bow_vectors = bow.transform(text_corpus[:100])

Transforming:   0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
bow_vectors

Unnamed: 0,1,100,1300watt,1845,2,2x,3,30,5,50,...,y,year,years,yet,you,young,your,yrs,yummy,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,3,0,1,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Binary bag-of-words (presence flags), fitted on the first 100 reviews only.
bow = BagOfWords(binary_bow=True)
bow.fit(text_corpus[:100])

Building Vocabulary:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
binary_bow_vecs = bow.transform(text_corpus[:100])

Transforming:   0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
binary_bow_vecs

Unnamed: 0,1,100,1300watt,1845,2,2x,3,30,5,50,...,y,year,years,yet,you,young,your,yrs,yummy,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Term Frequency-Inverse Document Frequency [TF-IDF]

In [17]:
from collections import Counter

In [None]:
class TFIDF:
    """Collects the per-document and corpus statistics used for TF-IDF.

    Only ``fit`` is implemented in this class; it gathers raw term counts per
    document and the document frequency of every alphanumeric token.

    Parameters
    ----------
    lower : bool, default True
        When True, sentences are lower-cased before tokenizing.

    Attributes
    ----------
    X : the (optionally lower-cased) sentences given to the last ``fit``.
    N : number of documents given to the last ``fit``.
    TF_dict : dict mapping ``str(document_index)`` to a ``Counter`` of every
        space-separated token in that document (including non-alphanumeric
        tokens and empty strings produced by repeated spaces).
    corpus_tokens : dict mapping each alphanumeric token to the number of
        documents that contain it, in first-occurrence order.
    """

    def __init__(self, lower = True):
        # honour the caller's choice (was previously hard-coded to True)
        self.lower = lower

        self.X = None
        self.N = None # number of documents
        self.TF_dict = dict()
        self.corpus_tokens = dict()

    def fit(self, X):
        """Compute term frequencies and document frequencies for ``X``.

        ``X`` must be a sized collection of strings. Statistics from any
        earlier ``fit`` call are discarded. Returns None.
        """
        self.X = X
        self.N = len(self.X)

        # lowercasing the sentences
        if self.lower:
            self.X = self.do_lower(self.X)

        # Fresh containers so refitting never mixes in old counts.
        term_frequencies = dict()
        document_frequency = dict()

        for index, sentence in enumerate(tqdm(self.X, "Building TFIDF")):
            # computing the term frequency
            term_frequency = Counter(sentence.split(" "))
            term_frequencies[str(index)] = term_frequency

            # Counter keys are this document's distinct tokens, so each
            # alphanumeric token adds exactly one to its document frequency.
            for token in term_frequency:
                if token.isalnum():
                    document_frequency[token] = document_frequency.get(token, 0) + 1

        self.TF_dict = term_frequencies
        self.corpus_tokens = document_frequency

    def do_lower(self, X):
        """Return a new list with every string in ``X`` lower-cased."""
        return list(map(str.lower, X))

Counter({' ': 2, 'o': 2, 'I': 1, 'a': 1, 'm': 1, 'g': 1, 'd': 1})