## Lite Implementation of sklearn TfidfVectorizer

In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def l2_normalize(x):
    """
    L2 Normalization

    Scale a vector so that its Euclidean (L2) norm equals 1.

    Parameters
    ----------
    x : iterable of numbers
        The vector to normalize. It is materialized into a list once, so
        one-shot iterables (generators, iterators) are supported.

    Returns
    -------
    list
        The elements of ``x`` divided by the L2 norm of ``x``. If the norm is
        zero (empty or all-zero vector), the elements are divided by 1
        instead, so the vector comes back unscaled rather than raising
        ZeroDivisionError or producing NaN.
    """
    # Materialize once: the values are traversed twice (norm, then scaling)
    values = list(x)
    # Calculate the modulus of x
    x_norm = math.sqrt(sum(xi ** 2 for xi in values))
    # A zero norm cannot be divided by; fall back to a neutral scale of 1
    if x_norm == 0:
        x_norm = 1.0
    # Return the normalized vector
    return [xi / x_norm for xi in values]

def tfidf_vectorizer(corpus, smooth_idf=True):
    """
    Calculate the TFIDF value

    Builds a bag-of-words count matrix with CountVectorizer, weights each
    term frequency by the term's inverse document frequency, and
    L2-normalizes every document row with l2_normalize.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize; passed unchanged to
        CountVectorizer.fit_transform.
    smooth_idf : bool, default True
        If True, idf = ln((1 + n) / (1 + df)) + 1; otherwise
        idf = ln(n / df) + 1, where n is the number of documents and df is
        the number of documents that contain the word.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per word, in the column order of
        the CountVectorizer count matrix. A document that contains none of
        the words in the bag of words yields an all-zero row.
    """
    vectorizer = CountVectorizer()
    # Count the number of times each word appears in each document
    X = vectorizer.fit_transform(corpus)
    # Number of documents and number of distinct words in the bag of words
    n, n_words = X.shape

    # df and idf depend only on the word, not on the document, so they are
    # computed once here instead of once per (document, word) pair.
    idf_weights = []
    for j in range(n_words):
        # Calculate df value: number of documents containing word j
        df = X[:, j].count_nonzero()
        # Calculate idf value, with smooth_idf (default) or without smooth_idf
        if smooth_idf:
            idf_weights.append(math.log((1 + n) / (1 + df)) + 1)
        else:
            idf_weights.append(math.log(n / df) + 1)

    # Matrix for storing TFIDF results
    tfidf_matrix = []
    # Iterate each document and calculate the TFIDF value
    for i in range(n):
        # Get the vector representation of the i-th document
        row = X[i].toarray()[0]
        # Total number of counted words in the document (tf denominator)
        total = row.sum()
        if total == 0:
            # No counted word in this document: tf would be 0 / 0 and the row
            # would turn into NaN. Store zeros instead and skip normalization.
            tfidf_matrix.append([0.0] * n_words)
            continue
        # tf * idf for every word of the document
        tfidf_scores = [
            (count / total) * idf for count, idf in zip(row, idf_weights)
        ]
        # Store the L2-normalized TFIDF values of the document in the matrix
        tfidf_matrix.append(l2_normalize(tfidf_scores))
    return np.array(tfidf_matrix)

In [2]:
# Toy corpus used by both the lite implementation and the sklearn comparison
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The quick brown fox is very agile",
    "The quick brown fox is very smart",
    "The quick brown fox is very quick"
]

In [3]:
# Run the lite implementation on the toy corpus with the default smooth_idf;
# the result is a NumPy array with one row per document.
tfidf_scores = tfidf_vectorizer(documents)
print(tfidf_scores)

[[0.         0.21472541 0.41147631 0.21472541 0.         0.41147631
  0.41147631 0.41147631 0.21472541 0.         0.42945081 0.        ]
 [0.58680608 0.30621975 0.         0.30621975 0.37455072 0.
  0.         0.         0.30621975 0.         0.30621975 0.37455072]
 [0.         0.30621975 0.         0.30621975 0.37455072 0.
  0.         0.         0.30621975 0.58680608 0.30621975 0.37455072]
 [0.         0.3163518  0.         0.3163518  0.38694366 0.
  0.         0.         0.63270359 0.         0.3163518  0.38694366]]


## Comparison with sklearn results

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the reference TF-IDF vectorizer: sklearn's TfidfVectorizer, constructed
# with no arguments so that its default settings apply
vectorizer_tfidf = TfidfVectorizer()

In [5]:
# Vectorize the data set: fit the reference vectorizer on the corpus and
# transform the same corpus in one call
vectorized_documents = vectorizer_tfidf.fit_transform(documents)
# Convert the result with .toarray() so it can be displayed in full; despite the
# "_list" suffix, the value is shown below as a NumPy array, not a Python list
vectorized_documents_list = vectorized_documents.toarray()

In [6]:
# Check programmatically, not just by eye, that the lite implementation
# (tfidf_scores) reproduces the sklearn result; raises AssertionError otherwise.
np.testing.assert_allclose(tfidf_scores, vectorized_documents_list)
# Display the sklearn matrix for visual comparison with the output above
vectorized_documents_list

array([[0.        , 0.21472541, 0.41147631, 0.21472541, 0.        ,
        0.41147631, 0.41147631, 0.41147631, 0.21472541, 0.        ,
        0.42945081, 0.        ],
       [0.58680608, 0.30621975, 0.        , 0.30621975, 0.37455072,
        0.        , 0.        , 0.        , 0.30621975, 0.        ,
        0.30621975, 0.37455072],
       [0.        , 0.30621975, 0.        , 0.30621975, 0.37455072,
        0.        , 0.        , 0.        , 0.30621975, 0.58680608,
        0.30621975, 0.37455072],
       [0.        , 0.3163518 , 0.        , 0.3163518 , 0.38694366,
        0.        , 0.        , 0.        , 0.63270359, 0.        ,
        0.3163518 , 0.38694366]])