In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Render floats in DataFrame output with four digits after the decimal point.
pd.options.display.precision = 4

In [3]:
# Toy corpus: four short sentences that share most of their vocabulary.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# IDF weighting with smoothing, followed by L2 normalisation of each row.
tf_idf = TfidfVectorizer(use_idf=True, smooth_idf=True, norm="l2")

# Fit on the corpus and densify the result so it can be shown as a table.
xs = tf_idf.fit_transform(corpus).toarray()

# One row per sentence, one column per vocabulary term.
tf_idf_df = pd.DataFrame(
    data=xs,
    index=corpus,
    columns=tf_idf.get_feature_names_out(),
)
tf_idf_df

Unnamed: 0,and,document,first,is,one,second,the,third,this
This is the first document.,0.0,0.4698,0.5803,0.3841,0.0,0.0,0.3841,0.0,0.3841
This document is the second document.,0.0,0.6876,0.0,0.2811,0.0,0.5386,0.2811,0.0,0.2811
And this is the third one.,0.5118,0.0,0.0,0.2671,0.5118,0.0,0.2671,0.5118,0.2671
Is this the first document?,0.0,0.4698,0.5803,0.3841,0.0,0.0,0.3841,0.0,0.3841


In [4]:
# Re-derive the TF-IDF row of the first sentence by hand.

# Lower-case every sentence and strip punctuation from its two ends.
docs = [d.lower().strip(".,!?") for d in corpus]
doc = docs[0]
terms = [
    "and", "document", "first", "is", "one", "second", "the", "third", "this"
]
display(doc)
display(docs)

# Tokenise once so that term frequency and document frequency are both
# computed on the same whole-word tokens. A plain substring test
# (`term in sentence`) would wrongly count e.g. "is" inside "this".
doc_tokens = doc.split()
docs_tokens = [set(d.split()) for d in docs]

# Raw term counts. Dividing by the document length is not necessary because
# the vector is L2-normalised at the end, which cancels any constant factor.
tfs = np.array([doc_tokens.count(t) for t in terms])

# Document frequency: number of sentences containing the term as a token.
dfs = np.array([sum(t in tokens for tokens in docs_tokens) for t in terms])

# The 1s added to the numerator and denominator are for smoothing.
idfs = 1 + np.log((1 + len(docs)) / (1 + dfs))

tf_idfs = tfs * idfs

# L2-normalise; skip the division when the document contains none of the
# terms so an all-zero vector stays zero instead of turning into NaNs.
l2_norm = np.sqrt((tf_idfs ** 2).sum())
if l2_norm > 0:
    tf_idfs = tf_idfs / l2_norm

pd.DataFrame([tf_idfs], columns=terms, index=[doc])

'this is the first document'

['this is the first document',
 'this document is the second document',
 'and this is the third one',
 'is this the first document']

Unnamed: 0,and,document,first,is,one,second,the,third,this
this is the first document,0.0,0.4698,0.5803,0.3841,0.0,0.0,0.3841,0.0,0.3841


In [5]:
# Sanity check: the squared entries of the normalised vector should sum to 1.
squared_l2_norm = (tf_idfs ** 2).sum()
print(f"{squared_l2_norm:.4f}")

1.0000


---