In [16]:
import math

# Toy corpus: each string is one document, with terms separated by spaces.
documents = [
    'cat is cute',
    'dog ask questions',
    'cat does not ask questions',
]

In [17]:
# Step 1: Tokenization and document frequency (DF) calculation
def tokenize(document):
    """Split ``document`` into tokens on runs of whitespace.

    No lowercasing or punctuation stripping is performed; an empty or
    whitespace-only string yields an empty list.
    """
    tokens = document.split()
    return tokens

In [18]:
# Build the vocabulary (set of unique terms) and the document-frequency table.
# document_frequency[term] is the number of documents that contain the term at
# least once, so a term repeated inside a single document must count only once.
corpus = set()
document_frequency = {}
for document in documents:
    # De-duplicate per document before counting; iterating the raw token list
    # would count occurrences (collection frequency) rather than documents.
    unique_tokens = set(tokenize(document))
    corpus.update(unique_tokens)
    for token in unique_tokens:
        document_frequency[token] = document_frequency.get(token, 0) + 1
corpus

{'ask', 'cat', 'cute', 'does', 'dog', 'is', 'not', 'questions'}

In [19]:
# Step 2: Term frequency (TF) calculation
def calculate_tf(document):
    """Return the raw term counts for ``document``.

    The document is split with ``tokenize`` and each distinct token is mapped
    to the number of times it occurs. Counts are not normalised by document
    length, and keys appear in first-occurrence order.
    """
    counts = {}
    for term in tokenize(document):
        counts[term] = counts.get(term, 0) + 1
    return counts

In [20]:
# Step 3: Inverse Document Frequency (IDF) calculation
def calculate_idf(token):
    """Return the smoothed inverse document frequency of ``token``.

    Computes ``ln(N / df + 1)`` where ``N`` is ``len(documents)`` and ``df``
    is ``document_frequency[token]``; both are read from module-level state.
    The ``+ 1`` is applied inside the logarithm, so the result is always
    positive for a known token. Tokens absent from ``document_frequency``
    get 0.
    """
    if token not in document_frequency:
        return 0  # Token not found in any document
    ratio = len(documents) / document_frequency[token]
    return math.log(ratio + 1)

In [21]:
# Step 4: Calculate TF-IDF scores
# One dict per document (same order as `documents`), mapping each token in
# that document to its term count multiplied by its IDF weight.
tfidf_scores = [
    {
        token: count * calculate_idf(token)
        for token, count in calculate_tf(document).items()
    }
    for document in documents
]

tfidf_scores

[{'cat': 0.9162907318741551,
  'is': 1.3862943611198906,
  'cute': 1.3862943611198906},
 {'dog': 1.3862943611198906,
  'ask': 0.9162907318741551,
  'questions': 0.9162907318741551},
 {'cat': 0.9162907318741551,
  'does': 1.3862943611198906,
  'not': 1.3862943611198906,
  'ask': 0.9162907318741551,
  'questions': 0.9162907318741551}]

In [22]:
# Report the scores document by document, numbering documents from 1.
for i, doc_scores in enumerate(tfidf_scores):
    print(f"Document {i+1} TF-IDF Scores:")
    for token, score in doc_scores.items():
        print(f"{token}: {score}")
    # print("\n") emits two line breaks, leaving a blank gap between documents.
    print("\n")

Document 1 TF-IDF Scores:
cat: 0.9162907318741551
is: 1.3862943611198906
cute: 1.3862943611198906


Document 2 TF-IDF Scores:
dog: 1.3862943611198906
ask: 0.9162907318741551
questions: 0.9162907318741551


Document 3 TF-IDF Scores:
cat: 0.9162907318741551
does: 1.3862943611198906
not: 1.3862943611198906
ask: 0.9162907318741551
questions: 0.9162907318741551


