**TF-IDF example**

TF-idf from sklearn: 4>3>2>1>5

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

corpus = [
    "The duck loves to eat the worm",
    "The worm doesn’t like the early bird",
    "The bird loves to get up early to get the worm",
    "The bird gets the worm from the early duck",
    "The duck and the birds are so different from each other but one thing they have in common is that they both get the worm"
]
query = "The early bird gets the worm"

def tfIdfSklearn():
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    idf = vectorizer.idf_
    emb = vectorizer.transform([query])

    row_squares = X.multiply(X).sum(axis=1)
    row_norms = np.sqrt(row_squares)
    cos_sim = (X @ emb.T) / row_norms #omit emb norm, since it doesn/t affect retrieval

    print(cos_sim)
    
tfIdfSklearn()

<COOrdinate sparse matrix of dtype 'float64'
	with 5 stored elements and shape (5, 1)>
  Coords	Values
  (0, 0)	0.10214081712273161
  (1, 0)	0.43636683282285316
  (2, 0)	0.5789137387733785
  (3, 0)	0.9086568789254059
  (4, 0)	0.07204972456387403


**TF-IDF from scratch**
retrieval sequence: 2->5->3->1->4

In [101]:
import math
from typing import List
terms = "bird, duck, worm, early, get, love".split(', ') #6 terms

m = len(corpus)
n = len(terms)

def my_tf_idf():
    def get_tf(doc: str, terms: List[str]) -> float:
        min_len = min([len(t) for t in terms])
        max_len = max([len(t) for t in terms])
        total_count = 0
        term_count = {key:0 for key in terms}
        for i in range(len(doc)):
            for word_len in range(min_len, max_len+1):
                word = doc[i:i+word_len+1]
                if word in terms:
                    term_count[word] += 1
                    total_count += 1

        if total_count == 0:
            return [0] * n
        else:
            return [term_count[terms[i]] / total_count for i in range(n)]

    def get_idf(term:str) -> float: # docs = corpus
        count = 0
        for doc in corpus:
            if term in doc:
                count += 1
        # print(f"{term}: {count}")
        return math.log(m/(1+count))

    tf = [get_tf(doc, terms) for doc in corpus]
    idf = [get_idf(term) for term in terms]

    tf_idf = np.array(tf) * (np.ones(shape=(m,1)) @ np.array(idf).reshape(1,-1))

    query_tf = get_tf(query, terms)
    query_emb = np.array([query_tf[i] * idf[i] for i in range(n)]).reshape(-1,1) # query tf-idf

    cos_sim = (tf_idf @ query_emb) / np.linalg.norm(tf_idf, axis=1).reshape(-1,1)
    print(cos_sim)

my_tf_idf()

[[0.07661557]
 [0.10377449]
 [0.08936724]
 [0.04103002]
 [0.10128814]]


v2 with no preprocessing: 2->4->3->1->5

In [102]:
import math
from typing import List
terms = "bird, duck, worm, early, get, love".split(', ') #6 terms

m = len(corpus)
n = len(terms)

def my_tf_idf_v2():
    def get_tf(doc: str, terms: List[str]) -> float:                
        words = doc.split(' ')
        words_filtered = [w for w in words if w in terms]
        total_count = len(words_filtered)
        term_count = {key:0 for key in terms}

        for word in words_filtered:
            term_count[word] += 1

        if total_count == 0:
            return [0] * n
        else:
            return [term_count[terms[i]] / total_count for i in range(n)]

    def get_idf(term:str) -> float: # docs = corpus
        count = 0
        for doc in corpus:
            if term in doc.split(' '):
                count += 1
        # print(f"{term}: {count}")
        return math.log(m/(1+count))

    tf = [get_tf(doc, terms) for doc in corpus]
    idf = [get_idf(term) for term in terms]

    tf_idf = np.array(tf) * (np.ones(shape=(m,1)) @ np.array(idf).reshape(1,-1))

    query_tf = get_tf(query, terms)
    query_emb = np.array([query_tf[i] * idf[i] for i in range(n)]).reshape(-1,1) # query tf-idf

    cos_sim = (tf_idf @ query_emb) / np.linalg.norm(tf_idf, axis=1).reshape(-1,1)
    print(cos_sim)

my_tf_idf_v2()

[[0.03845265]
 [0.1214849 ]
 [0.040818  ]
 [0.10360755]
 [0.01889253]]
