In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [2]:
# Three short example sentences; each string is one document.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [3]:
def compute_tf(doc):
    """Return the relative frequency of every token in ``doc``.

    The document is tokenised on whitespace. Each distinct token maps to
    its occurrence count divided by the total number of tokens, and keys
    appear in order of first occurrence. A document with no tokens yields
    an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    # Tally raw occurrences first; normalisation happens in a second step.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    return {token: count / n_tokens for token, count in counts.items()}

In [4]:
def compute_idf(corpus):
    """Compute the unsmoothed inverse document frequency of every term.

    Each document is tokenised on whitespace. A term that occurs in ``df``
    of the ``N`` documents scores ``ln(N / df)``, so a term present in
    every document scores exactly 0.

    Args:
        corpus: Sized iterable of document strings.

    Returns:
        dict mapping each term to its IDF. Keys are in order of first
        appearance in the corpus, so the result is the same on every run.
        An empty corpus gives an empty dict.
    """
    N = len(corpus)

    # One pass over the corpus: each document is split once and every
    # distinct term in it bumps that term's document frequency by one.
    # dict.fromkeys de-duplicates while keeping first-seen order (a set
    # would make the key order depend on string hashing).
    doc_freq = {}
    for doc in corpus:
        for word in dict.fromkeys(doc.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Every key was seen in at least one document, so df >= 1 and the
    # division is safe. No smoothing is applied.
    return {word: math.log(N / df) for word, df in doc_freq.items()}

In [5]:
def compute_tfidf(corpus):
    """Build one TF-IDF mapping per document in ``corpus``.

    IDF is computed once over the whole corpus with ``compute_idf``. Each
    document's weights are its term frequencies from ``compute_tf``
    multiplied by the matching IDF value.

    Returns:
        list of ``{term: weight}`` dicts, one per document, in corpus order.
    """
    idf_by_term = compute_idf(corpus)

    def weigh(doc):
        # Only terms that occur in this document get an entry.
        return {
            term: freq * idf_by_term[term]
            for term, freq in compute_tf(doc).items()
        }

    return [weigh(doc) for doc in corpus]

In [6]:
# Hand-rolled TF-IDF: a list with one {term: weight} dict per document in `corpus`.
manual_tfidf = compute_tfidf(corpus)

In [7]:
# One row per document, one column per term. A term absent from a document
# has no entry in that document's dict, so the resulting NaN is set to 0.
df_manual = pd.DataFrame(manual_tfidf).fillna(value=0)

# Shown transposed: one row per term, one column per document.
print("\nManual TF-IDF:")
print(df_manual.transpose())


Manual TF-IDF:
                  0         1         2
the        0.000000  0.000000  0.000000
sun        0.081093  0.000000  0.057924
is         0.081093  0.081093  0.000000
a          0.081093  0.081093  0.000000
star       0.219722  0.000000  0.000000
moon       0.000000  0.081093  0.057924
satellite  0.000000  0.219722  0.000000
and        0.000000  0.000000  0.156945
are        0.000000  0.000000  0.156945
celestial  0.000000  0.000000  0.156945
bodies     0.000000  0.000000  0.156945


In [8]:
# Step 4: Compare with scikit-learn
# Raw term counts per document. With default settings CountVectorizer only
# keeps tokens of two or more characters, which is why "a" does not appear
# among the features.
count_vec = CountVectorizer()
X_count = count_vec.fit_transform(corpus)
df_count = pd.DataFrame(
    data=X_count.toarray(),
    columns=count_vec.get_feature_names_out(),
)
print("\nCountVectorizer:")
print(df_count.transpose())


CountVectorizer:
           0  1  2
and        0  0  1
are        0  0  1
bodies     0  0  1
celestial  0  0  1
is         1  1  0
moon       0  1  1
satellite  0  1  0
star       1  0  0
sun        1  0  1
the        1  1  1


In [9]:
# TfidfVectorizer
# With default settings these weights are not expected to equal the manual
# table: scikit-learn uses raw counts for tf, a smoothed idf of
# ln((1 + N) / (1 + df)) + 1 (so a term found in every document, such as
# "the", keeps a non-zero weight), L2-normalises each document vector, and
# its default token pattern drops one-character tokens such as "a".
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(corpus)
df_sklearn_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vec.get_feature_names_out(),
)
print("\nScikit-learn TF-IDF:")
print(df_sklearn_tfidf.transpose())


Scikit-learn TF-IDF:
                  0         1         2
and        0.000000  0.000000  0.426184
are        0.000000  0.000000  0.426184
bodies     0.000000  0.000000  0.426184
celestial  0.000000  0.000000  0.426184
is         0.480458  0.480458  0.000000
moon       0.000000  0.480458  0.324124
satellite  0.000000  0.631745  0.000000
star       0.631745  0.000000  0.000000
sun        0.480458  0.000000  0.324124
the        0.373119  0.373119  0.251711
