### 1. Define source dataset and preprocess

In [29]:
class Doc:
    """A whitespace-tokenised document with per-word TF and TF-IDF tables.

    Tokens are case-sensitive: ``"The"`` and ``"the"`` are distinct words.
    """

    def __init__(self, content: str):
        """Tokenise *content* and compute the term frequency of each word.

        Args:
            content: Raw text of the document; words are separated by
                whitespace.
        """
        # split() with no argument collapses runs of whitespace and ignores
        # leading/trailing whitespace, so no empty-string "words" are
        # produced (split(" ") would yield them, and would report one token
        # for an empty document).
        doc_arr = content.split()
        # Total number of tokens in the document.
        self.doc_len = len(doc_arr)
        # word -> number of occurrences, in first-occurrence order.
        self.doc_dict = {}
        for word in doc_arr:
            self.doc_dict[word] = self.doc_dict.get(word, 0) + 1
        # Term frequency (TF): occurrences divided by document length.
        # An empty document has no words, so no division takes place.
        self.tf = {
            word: count / self.doc_len
            for word, count in self.doc_dict.items()
        }
        # word -> TF-IDF weight; stays empty until calc_tfidf() is called.
        self.tfidf = {}

    def calc_tfidf(self, idf_dict: dict):
        """Fill ``self.tfidf`` with ``tf * idf`` for every word in the document.

        Args:
            idf_dict: Mapping from word to its inverse document frequency.
                It must contain every word of this document.

        Raises:
            KeyError: If a word of this document is missing from *idf_dict*.
        """
        for word, tf in self.tf.items():
            idf = idf_dict[word]
            self.tfidf[word] = tf * idf


# Toy corpus: three short sentences that share most of their words.
docA, docB, docC = (
    Doc("The cat sat on my bed"),
    Doc("The dog sat on my knees"),
    Doc("person sat on my head"),
)
docs = [docA, docB, docC]

In [30]:
from typing import List


# Calculate the smoothed inverse document frequency (IDF) of every word
def calc_idf(docs: List[Doc]) -> dict:
    """Return the inverse document frequency of every word in *docs*.

    For a corpus of ``n`` documents, a word that appears in ``df`` of them
    is scored ``log10((n + 1) / (df + 1))``; the ``+ 1`` terms smooth the
    ratio so a word present in every document scores exactly ``0.0``.

    Args:
        docs: Documents exposing a ``doc_dict`` mapping keyed by word.

    Returns:
        A dict mapping each distinct word to its IDF score.
    """
    import math

    # Collect the vocabulary across the whole corpus.
    vocabulary = set()
    for document in docs:
        vocabulary |= document.doc_dict.keys()

    corpus_size = len(docs)
    scores = {}
    for term in vocabulary:
        # Document frequency: how many documents contain the term.
        containing = sum(1 for document in docs if term in document.doc_dict)
        scores[term] = math.log10((corpus_size + 1) / (containing + 1))
    return scores


# Display the IDF score of every distinct word in the sample corpus.
print(calc_idf(docs))

{'sat': 0.0, 'head': 0.3010299956639812, 'dog': 0.3010299956639812, 'on': 0.0, 'my': 0.0, 'person': 0.3010299956639812, 'knees': 0.3010299956639812, 'cat': 0.3010299956639812, 'bed': 0.3010299956639812, 'The': 0.12493873660829993}


In [31]:
# Compute the corpus-wide IDF table once and reuse it for every document.
idf = calc_idf(docs)
# Populate each document's tfidf dict in place.
for doc in docs:
    doc.calc_tfidf(idf)

# Print one TF-IDF mapping per document, in corpus order.
for doc in docs:
    print(doc.tfidf)

{'The': 0.020823122768049988, 'cat': 0.050171665943996864, 'sat': 0.0, 'on': 0.0, 'my': 0.0, 'bed': 0.050171665943996864}
{'The': 0.020823122768049988, 'dog': 0.050171665943996864, 'sat': 0.0, 'on': 0.0, 'my': 0.0, 'knees': 0.050171665943996864}
{'person': 0.06020599913279624, 'sat': 0.0, 'on': 0.0, 'my': 0.0, 'head': 0.06020599913279624}
