## CTWDoc2vec: TFIDF Weighted Doc2vec (Centred form, Scaled idf)

In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def l2_normalize(x):
    """
    L2 Normalization
    """
    # Calculate the modulus of x
    x_norm = math.sqrt(sum([xi**2 for xi in x]))
    # Return the normalized vector
    return [xi / x_norm for xi in x]

def tfidf_vectorizer(corpus, smooth_idf=True):
    """
    Calculate the TFIDF value
    """
    vectorizer = CountVectorizer()
    # Count the number of times each word appears in all documents
    X = vectorizer.fit_transform(corpus)
    # Get all text keywords in the bag of words
    words = vectorizer.get_feature_names_out()
    # Number of documents
    n = X.shape[0]
    # Matrix for storing TFIDF results
    tfidf_matrix = []
    # Iterate each document and calculate the TFIDF value
    for i in range(n):
        # Get the vector representation of the i-th document
        row = X[i].toarray()[0]
        # List of storing tf values
        tf_scores = []
        # List of storing idf values
        idf_scores = []
        # List of storing TFIDF values
        tfidf_scores = []
        # Iterate each keyword and calculate the TFIDF value
        for j, word in enumerate(words):
            # Calculate tf value
            tf = row[j] / sum(row)
            # Calculate df value
            df = X[:, j].count_nonzero()
            # Calculate idf value, with smooth_idf (default) or without smooth_idf
            if smooth_idf:
                idf = math.log((1 + n) / (1 + df)) + 1
            else:
                idf = math.log(n / df) + 1
            # Store tf value in a list
            tf_scores.append(tf)
            # Store idf value in a list
            idf_scores.append(idf)
        # Min-max normalize idf to (0, 1)
        for j, idf in enumerate(idf_scores):
            idf_scores[j] = (idf - min(idf_scores)) / (max(idf_scores) - min(idf_scores))
        # Calculate the TFIDF value
        for j in range(len(tf_scores)):
            tfidf_scores.append(tf_scores[j] * idf_scores[j])
        # L2 normalize the tfidf_scores vector: l2_normalize(tfidf_scores)
        # Store the list of TFIDF values for each document in the matrix
        tfidf_matrix.append(l2_normalize(tfidf_scores))
    return words, np.array(tfidf_matrix)

In [2]:
def tfidf_centre(corpus, smooth_idf=True):
    """
    Calculate the TFIDF_centre value
    """
    vectorizer = CountVectorizer()
    # Count the number of times each word appears in all documents
    X = vectorizer.fit_transform(corpus)
    # Get all text keywords in the bag of words
    words = vectorizer.get_feature_names_out()
    # Number of documents
    n = X.shape[0]
    # List of storing tf_centre values
    tf_centre_scores = []
    # List of storing idf values
    idf_scores = []
    # List of storing TFIDF_centre values
    tfidf_centre_list = []
    for j, word in enumerate(words):
        # Calculate tf_centre value
        tf_centre = X[:, j].sum() / X.sum()
        # Calculate df value
        df = X[:, j].count_nonzero()
        # Calculate idf value, with smooth_idf (default) or without smooth_idf
        if smooth_idf:
            idf = math.log((1 + n) / (1 + df)) + 1
        else:
            idf = math.log(n / df) + 1
        # Store tf_centre value in a list
        tf_centre_scores.append(tf_centre)
        # Store idf value in a list
        idf_scores.append(idf)
    # Min-max normalize idf to (0, 1)
    for j, idf in enumerate(idf_scores):
        idf_scores[j] = (idf - min(idf_scores)) / (max(idf_scores) - min(idf_scores))
    # Calculate the TFIDF_centre value
    for j in range(len(tf_centre_scores)):
        tfidf_centre_list.append(tf_centre_scores[j] * idf_scores[j])
    return np.array(tfidf_centre_list)

In [3]:
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The quick brown fox is very agile",
    "The quick brown fox is very smart",
    "The quick brown fox is very quick"
]

In [4]:
words, tfidf_scores = tfidf_vectorizer(documents)
print(words)
print(tfidf_scores)

['agile' 'brown' 'dog' 'fox' 'is' 'jumps' 'lazy' 'over' 'quick' 'smart'
 'the' 'very']
[[0.         0.         0.37220553 0.19423228 0.         0.37220553
  0.37220553 0.37220553 0.19423228 0.         0.60860481 0.        ]
 [0.52555488 0.         0.         0.27425634 0.33545487 0.
  0.         0.         0.27425634 0.         0.42967555 0.52555488]
 [0.         0.         0.         0.27425634 0.33545487 0.
  0.         0.         0.27425634 0.52555488 0.42967555 0.52555488]
 [0.         0.         0.         0.28146379 0.34427062 0.
  0.         0.         0.56292758 0.         0.44096741 0.53936645]]


In [5]:
tfidf_centre_scores = tfidf_centre(documents)
print(tfidf_centre_scores)

[0.03333333 0.         0.03333333 0.06957886 0.0638287  0.03333333
 0.03333333 0.03333333 0.08697358 0.03333333 0.13626092 0.1       ]


## Comparison with sklearn results

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Creating a TF-IDF vectorizer by TfidfVectorizer()
vectorizer_tfidf = TfidfVectorizer()

In [7]:
# Vectorize the data set
vectorized_documents = vectorizer_tfidf.fit_transform(documents)
vectorized_documents_list = vectorized_documents.toarray()

In [8]:
vectorized_documents_list

array([[0.        , 0.21472541, 0.41147631, 0.21472541, 0.        ,
        0.41147631, 0.41147631, 0.41147631, 0.21472541, 0.        ,
        0.42945081, 0.        ],
       [0.58680608, 0.30621975, 0.        , 0.30621975, 0.37455072,
        0.        , 0.        , 0.        , 0.30621975, 0.        ,
        0.30621975, 0.37455072],
       [0.        , 0.30621975, 0.        , 0.30621975, 0.37455072,
        0.        , 0.        , 0.        , 0.30621975, 0.58680608,
        0.30621975, 0.37455072],
       [0.        , 0.3163518 , 0.        , 0.3163518 , 0.38694366,
        0.        , 0.        , 0.        , 0.63270359, 0.        ,
        0.3163518 , 0.38694366]])