In [49]:
from math import log2
import pandas as pd

from tqdm.notebook import tqdm
from collections import Counter, OrderedDict

In [2]:
# Dataset: Amazon Fine Food Reviews (Kaggle)
# https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/
# The path is relative, so it resolves against the notebook's working directory.
amzn_food_reviews_path = "../data/reviews.csv"

# Load the reviews CSV into a DataFrame
df = pd.read_csv(amzn_food_reviews_path)

In [3]:
print("Shape of dataset:", df.shape)

Shape of dataset: (568454, 10)


In [4]:
df.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Bag-of-words [Count vectorizer]

In [5]:
class BOWVectorizer:
    """Bag-of-words (count) vectorizer built on plain Python containers.

    A document is tokenized with ``str.split(token_sep)`` and only tokens for
    which ``str.isalnum()`` is true are kept in the vocabulary.

    Parameters
    ----------
    binary_bow : bool, default False
        If True, each cell is 1 when the token occurs in the document and 0
        otherwise, instead of the occurrence count.
    lower : bool, default True
        If True, documents are lower-cased before tokenizing, in both
        ``fit`` and ``transform``.
    token_sep : str, default " "
        Separator handed to ``str.split`` to tokenize a document.

    Attributes (set by ``fit``)
    ---------------------------
    vocab : list of str, sorted unique tokens.
    vocab_dict : dict mapping token -> column index.
    vocab_length : int, ``len(vocab)``.
    """

    def __init__(self, binary_bow = False, lower = True, token_sep = " "):
        self.lower = lower
        self.token_sep = token_sep
        self.binary_bow = binary_bow

    def do_lower(self, data):
        """Return a list with every document in ``data`` lower-cased."""
        return list(map(str.lower, data))

    def fit(self, data):
        """Build the vocabulary from an iterable of document strings.

        Each document is tokenized on its own. Joining all documents with a
        fixed " " before splitting on ``token_sep`` would glue the last token
        of one document to the first token of the next whenever
        ``token_sep`` is not " ", and both tokens would then be rejected by
        the ``isalnum`` filter.
        """
        # preprocessing
        if self.lower:
            data = self.do_lower(data)

        # building the vocabulary
        print("Building vocabulary...")
        unique_tokens = set()
        for sentence in data:
            unique_tokens.update(filter(str.isalnum, sentence.split(self.token_sep)))
        self.vocab = sorted(unique_tokens)

        # vocabulary dict where the key: token and value: index of BOW vector
        self.vocab_dict = {token: index for index, token in enumerate(self.vocab)}

        # length of vocabulary
        self.vocab_length = len(self.vocab)

    def transform(self, data):
        """Turn documents into bag-of-words vectors.

        Parameters
        ----------
        data : iterable of str
            Documents to vectorize. Tokens that are not in the fitted
            vocabulary are ignored.

        Returns
        -------
        pandas.DataFrame
            One row per document, one column per vocabulary token (in sorted
            vocabulary order).

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, "vocab_dict"):
            raise AttributeError(
                "BOWVectorizer is not fitted yet; call fit() before transform()."
            )

        # same preprocessing as in fit()
        if self.lower:
            data = self.do_lower(data)

        bow_vectors = []
        for sentence in tqdm(data, desc = "Generating BOW vectors"):
            # start from an all-zero vector of vocabulary length
            vector = [0] * self.vocab_length

            # count every token of the document once
            token_counter = Counter(sentence.split(self.token_sep))

            for token, count in token_counter.items():
                # the vocabulary only holds alphanumeric tokens, so a
                # successful lookup already implies token.isalnum()
                index = self.vocab_dict.get(token)
                if index is not None:
                    # binary mode records presence only
                    vector[index] = 1 if self.binary_bow else count

            bow_vectors.append(vector)

        return pd.DataFrame(bow_vectors, columns = list(self.vocab))

In [6]:
# First 500 reviews; slice the Series before converting so the remaining
# rows are never turned into Python objects.
text_corpus = df.Text.iloc[:500].tolist()

In [7]:
bow = BOWVectorizer()
bow.fit(text_corpus)

Building vocabulary...


In [8]:
bow.vocab

['1',
 '10',
 '100',
 '1000',
 '100ml',
 '10lb',
 '11',
 '110',
 '12',
 '13',
 '1300watt',
 '15',
 '150',
 '150mg',
 '16',
 '1845',
 '1980s',
 '2',
 '20',
 '2009',
 '208f',
 '21',
 '24',
 '25',
 '2g',
 '2oz',
 '2x',
 '3',
 '30',
 '35',
 '360',
 '4',
 '40',
 '45',
 '4x',
 '5',
 '50',
 '500',
 '500g',
 '50th',
 '5lb',
 '5lbs',
 '6',
 '60',
 '7',
 '8',
 '80',
 '84',
 '8oz',
 '9',
 '90',
 'a',
 'abdominal',
 'able',
 'about',
 'absence',
 'absolute',
 'absolutely',
 'absorbs',
 'abt',
 'acai',
 'accept',
 'accompaniment',
 'according',
 'acid',
 'across',
 'activate',
 'active',
 'activity',
 'acts',
 'actual',
 'actually',
 'add',
 'added',
 'addicted',
 'addiction',
 'addicts',
 'adding',
 'addition',
 'additives',
 'addled',
 'adds',
 'adjustments',
 'admit',
 'admittedly',
 'adopted',
 'adores',
 'advertised',
 'advertising',
 'advised',
 'affairs',
 'affected',
 'affectionate',
 'afford',
 'affordable',
 'after',
 'afternoon',
 'afternoons',
 'aftertaste',
 'afterwards',
 'again',
 'a

In [9]:
bow.transform(text_corpus)

Generating BOW vectors:   0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,1,10,100,1000,100ml,10lb,11,110,12,13,...,young,younger,youngest,your,yrs,yucky,yummy,zen,zest,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
binary_bow = BOWVectorizer(binary_bow = True)
binary_bow.fit(text_corpus)


binary = binary_bow.transform(text_corpus)
binary

Building vocabulary...


Generating BOW vectors:   0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,1,10,100,1000,100ml,10lb,11,110,12,13,...,young,younger,youngest,your,yrs,yucky,yummy,zen,zest,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
binary_bow.transform(["This product is very good but it has some minor issues"])

Generating BOW vectors:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,1,10,100,1000,100ml,10lb,11,110,12,13,...,young,younger,youngest,your,yrs,yucky,yummy,zen,zest,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Term Frequency and Inverse Document Frequency [TF-IDF]

In [59]:
class TFIDFVectorizer:
    """TF-IDF vectorizer built on plain Python containers.

    Tokenization: split on ``token_sep``, lower-case every piece, keep only
    pieces for which ``str.isalnum()`` is true.

    * TF  = occurrences of the token in the document / number of kept tokens
      in the document
    * IDF = log2(number of fitted documents / number of fitted documents
      containing the token)

    Parameters
    ----------
    token_sep : str, default " "
        Separator handed to ``str.split`` to tokenize a document.

    Attributes (set by ``fit``)
    ---------------------------
    total_docs : int, number of fitted documents.
    term_frequencies : list of Counter, raw token counts per fitted document.
    token_corpus : dict mapping token -> column index (tokens in sorted order).
    num_tokens : int, vocabulary size.
    idf : dict mapping token -> inverse document frequency.
    """

    def __init__(self, token_sep = " "):
        self.token_sep = token_sep

    def do_lower(self, data):
        """Split one document on ``token_sep`` and lower-case each piece.

        Despite the name this also tokenizes: it takes a single string and
        returns a list of lower-cased tokens.
        """
        return [token.lower() for token in data.split(self.token_sep)]

    def clean_tokens(self, tokens):
        """Return only the alphanumeric tokens, preserving order."""
        return [token for token in tokens if token.isalnum()]

    def fit(self, data):
        """Learn the vocabulary and IDF weights from ``data``.

        Parameters
        ----------
        data : sequence of str
            Documents to fit on; ``len(data)`` is used as the document count.
        """
        self.total_docs = len(data) # number of documents in TFIDF
        self.term_frequencies = []

        # number of documents each token appears in, gathered in the same
        # pass as the term frequencies instead of rescanning every document
        # once per vocabulary token
        doc_freq = Counter()

        for sentence in tqdm(data, desc = 'Computing TF and building vocab'):
            # preprocessing: Lowercasing -> only alpha numeric
            tokens = self.clean_tokens(self.do_lower(sentence))

            # computing term frequencies
            tf = Counter(tokens)
            self.term_frequencies.append(tf)

            # every distinct token of this document counts once
            doc_freq.update(tf.keys())

        # sorted tokens -> column index
        self.token_corpus = {token: index for index, token in enumerate(sorted(doc_freq))}
        self.num_tokens = len(self.token_corpus)

        # computing inverse document frequency; every vocabulary token occurs
        # in at least one document, so the divisor is never zero
        self.idf = {}
        for unique_token in tqdm(self.token_corpus, desc = 'Computing IDF'):
            self.idf[unique_token] = log2(self.total_docs / doc_freq[unique_token])

    def transform(self, data):
        """Turn documents into TF-IDF vectors.

        Parameters
        ----------
        data : iterable of str
            Documents to vectorize. Tokens outside the fitted vocabulary get
            no column but still count towards the document length used for TF.

        Returns
        -------
        pandas.DataFrame
            One row per document, one column per vocabulary token (sorted).

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, "idf"):
            raise AttributeError(
                "TFIDFVectorizer is not fitted yet; call fit() before transform()."
            )

        vectors = []
        for sentence in data:
            # preprocessing
            tokens = self.clean_tokens(self.do_lower(sentence))

            term_freq = Counter(tokens)
            # kept as an instance attribute for backward compatibility; it
            # holds the token count of the most recently transformed document
            self.num_tokens_local = len(tokens)

            tfidf = [0] * self.num_tokens

            # one assignment per distinct token is enough
            for token, count in term_freq.items():
                if token in self.token_corpus:
                    tfidf[self.token_corpus[token]] = (count / self.num_tokens_local) * self.idf[token]

            vectors.append(tfidf)

        return pd.DataFrame(vectors, columns = list(self.token_corpus.keys()))

In [73]:
# First 100 reviews; slice the Series before converting so the remaining
# rows are never turned into Python objects.
text_corpus = df.Text.iloc[:100].tolist()

In [74]:
tfidf = TFIDFVectorizer()
tfidf.fit(text_corpus)

Computing TF and building vocab:   0%|          | 0/100 [00:00<?, ?it/s]

Computing IDF:   0%|          | 0/1248 [00:00<?, ?it/s]

In [75]:
tfidf.transform(text_corpus)

Unnamed: 0,1,100,1300watt,1845,2,2x,3,30,5,50,...,y,year,years,yet,you,young,your,yrs,yummy,zip
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.023851,0.0,0.000000,0.0,0.073297,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.051014,0.0,0.000000,0.0,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.120175,0.0,0.256539,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
96,0.043083,0.0,0.0,0.0,0.0,0.0,0.038618,0.0,0.0,0.0,...,0.0,0.038618,0.0,0.0,0.042057,0.0,0.020182,0.0,0.000000,0.0
97,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.052877,0.0,0.000000,0.0
98,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
