In [30]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import math
import os

In [31]:
def display(vocab, tfidfs):
    """Print one line per vocabulary term with its score in every document.

    Args:
        vocab: Iterable of terms to report, printed in iteration order.
        tfidfs: Sequence of per-document mappings from term to score.

    Each output line has the form ``term: [score_doc1, score_doc2, ...]``.
    A term missing from a document's mapping is reported as 0.
    """
    for word in vocab:
        row = []
        for doc_scores in tfidfs:
            # .get() so a missing term reads as 0 instead of raising.
            row.append(doc_scores.get(word, 0))
        print(f"{word}: {row}")


In [None]:
def stopword_removal(filename, folder):
    """Tokenize a text file and drop punctuation and English stopwords.

    Args:
        filename: Name of the file inside ``folder``.
        folder: Directory that contains the file.

    Returns:
        A list of lowercased alphanumeric tokens that are not in NLTK's
        English stopword list. Returns an empty list (after printing an
        error) when the path does not point to a regular file.
    """
    file_path = os.path.join(folder, filename)

    # Validate the path before doing any NLTK work: a missing file should
    # report "not found" without first loading the stopword corpus.
    # isfile() (rather than exists()) also rejects directories, which would
    # otherwise fail later inside open().
    if not os.path.isfile(file_path):
        print(f"Error: {filename} not found!")
        return []

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    stopword_set = set(stopwords.words("english"))

    # Keep only purely alphanumeric tokens (this discards punctuation),
    # lowercase them, then filter against the stopword set.
    words = (token.lower() for token in word_tokenize(text) if token.isalnum())
    return [word for word in words if word not in stopword_set]


In [38]:
def get_tf(tokens):
    """Count how many times each token occurs (raw term frequency).

    Args:
        tokens: Iterable of hashable tokens for a single document.

    Returns:
        A ``defaultdict(int)`` mapping each token to its occurrence count.
    """
    counts = defaultdict(int)
    for tok in tokens:
        counts[tok] = counts[tok] + 1
    return counts


In [None]:
def get_idf(doc_tokens, n):
    """Compute inverse document frequency for every term in the corpus.

    Args:
        doc_tokens: Iterable of token lists, one per document.
        n: Number of documents used as the numerator of the IDF ratio.

    Returns:
        A dict mapping each term to ``log10(n / df)`` rounded to two
        decimals, where ``df`` is the number of documents containing it.
    """
    doc_freq = defaultdict(int)
    for tokens in doc_tokens:
        # De-duplicate so each document contributes at most 1 per term.
        unique_terms = set(tokens)
        for term in unique_terms:
            doc_freq[term] += 1

    return {
        term: round(math.log10(n / freq), 2)
        for term, freq in doc_freq.items()
    }


In [40]:
def get_tf_idf(tf, idf):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: Mapping of term to its frequency in one document.
        idf: Mapping of term to its inverse document frequency.

    Returns:
        A dict mapping every term in ``tf`` to ``tf[term] * idf[term]``;
        terms without an IDF entry are weighted by 0.
    """
    return {word: count * idf.get(word, 0) for word, count in tf.items()}


In [None]:
def main():
    """Run the TF-IDF pipeline over the numbered files in the corpus folder.

    Steps: verify the input files exist, tokenize each one with stopwords
    removed, compute TF, IDF and TF-IDF, print the per-term score table,
    then print each document's total TF-IDF weight.
    """
    # NOTE(review): corpus_size (the IDF numerator) is 4 while only
    # num_docs = 3 files are read -- confirm this difference is intended.
    corpus_size = 4
    num_docs = 3
    folder = "corpus2"

    docs = [f"file{idx}.txt" for idx in range(1, num_docs + 1)]

    # Stop at the first missing input file instead of processing a
    # partial corpus.
    missing = next(
        (name for name in docs if not os.path.exists(os.path.join(folder, name))),
        None,
    )
    if missing is not None:
        print(f"Error: {missing} not found!")
        return

    all_tokens = []
    for name in docs:
        all_tokens.append(stopword_removal(name, folder))
    print("Checkpoint 1: Tokenization Done")

    idfs = get_idf(all_tokens, corpus_size)
    tfidfs = []
    for tokens in all_tokens:
        tfidfs.append(get_tf_idf(get_tf(tokens), idfs))

    print("Checkpoint 2: TF-IDF Computed")

    # Union of every term that received a score in any document.
    vocab_set = set()
    for scores in tfidfs:
        vocab_set.update(scores.keys())

    display(list(vocab_set), tfidfs)

    # A document's weight is the sum of its TF-IDF scores, to 2 decimals.
    weights = [round(sum(scores.values()), 2) for scores in tfidfs]

    for name, total in zip(docs, weights):
        print(f"{name}: {total}")

# Run the pipeline only when this code is executed as the top-level script
# (or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Checkpoint 1: Tokenization Done
Checkpoint 2: TF-IDF Computed
part: [0, 0.6, 0]
data: [0.12, 0.12, 0.12]
scienc: [0, 0, 0.6]
used: [0, 0, 0.6]
science: [0.3, 0.3, 0]
learning: [0, 0.6, 0]
ai: [0, 0.3, 0.3]
amazing: [0.6, 0, 0]
machine: [0, 0.6, 0]
python: [0, 0, 0.6]
file1.txt: 1.02
file2.txt: 2.52
file3.txt: 2.22
