# In [None]:
import numpy as np
import pandas as pd
import math
from collections import Counter

# Toy corpus to search; each string is treated as one document.
documents = [
    "The cat sat on the mat",
    "The quick brown fox jumped over the lazy dog",
    "Amazon is a big e-commerce company",
]

# Free-text query that every document is scored against.
query = "fox jumped"

# Tokenization is lowercase + whitespace split; punctuation stays attached
# to its token (e.g. "e-commerce" remains a single term).
tokenized_documents = [text.lower().split() for text in documents]
tokenized_query = query.lower().split()

# Alphabetically sorted list of the distinct terms seen in the corpus, plus a
# reverse lookup from each term to its column position.
vocabulary = sorted({token for tokens in tokenized_documents for token in tokens})
vocab_index = dict(zip(vocabulary, range(len(vocabulary))))

def calculate_tf(tokenized_docs, vocab):
    """Build length-normalized term-frequency vectors for a set of documents.

    Args:
        tokenized_docs: Iterable of documents, each a list of tokens.
        vocab: Sequence of terms; position ``j`` in ``vocab`` becomes column
            ``j`` of every returned row.

    Returns:
        A NumPy array with one row per document, where each entry is the
        number of occurrences of the term divided by the document's total
        token count. A document with no tokens produces an all-zero row.
    """
    tf_vectors = []
    for doc in tokenized_docs:
        total_terms = len(doc)
        if total_terms == 0:
            # An empty document contains no terms; emit zeros rather than
            # dividing by a zero length.
            tf_vectors.append([0.0 for _ in vocab])
            continue
        doc_counts = Counter(doc)
        # Counter yields 0 for terms that are absent from the document.
        tf_vectors.append([doc_counts[term] / total_terms for term in vocab])
    return np.array(tf_vectors)

# Term-frequency matrix: one row per document, one column per vocabulary term.
tf_matrix = calculate_tf(tokenized_documents, vocabulary)

# The query gets the same length-normalized weighting over the corpus
# vocabulary; query terms that are not in the vocabulary are ignored.
query_tf_counts = Counter(tokenized_query)
query_tf = np.array(
    [query_tf_counts[term] / len(tokenized_query) for term in vocabulary]
)


def calculate_idf(tokenized_docs, vocab):
    """Compute an inverse-document-frequency weight for every vocabulary term.

    The weight for a term is ``log(N / (df + 1)) + 1``, where ``N`` is the
    number of documents and ``df`` is the number of documents that contain
    the term at least once.

    Args:
        tokenized_docs: Sequence of documents, each a list of hashable tokens.
        vocab: Iterable of terms; the output follows its order.

    Returns:
        A 1-D NumPy array of IDF weights aligned with ``vocab``.

    Raises:
        ValueError: If ``tokenized_docs`` is empty while ``vocab`` is not,
            because the logarithm of zero is undefined.
    """
    num_documents = len(tokenized_docs)

    # Count each document once per distinct term. Deduplicating up front
    # avoids a linear list scan for every (term, document) pair.
    doc_freq = Counter()
    for doc in tokenized_docs:
        doc_freq.update(set(doc))

    # The +1 in the denominator keeps terms with zero document frequency
    # from dividing by zero.
    idf_vector = [
        math.log(num_documents / (doc_freq[term] + 1)) + 1 for term in vocab
    ]
    return np.array(idf_vector)

# Inverse-document-frequency weight for each vocabulary term.
idf_vector = calculate_idf(tokenized_documents, vocabulary)

# Scale the term frequencies by the IDF weights element-wise; the 1-D IDF
# vector is applied to every row of the document matrix.
tfidf_matrix = np.multiply(tf_matrix, idf_vector)
query_tfidf = np.multiply(query_tf, idf_vector)

def cosine_similarity_scratch(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    The result is the dot product divided by the product of the two
    Euclidean norms. If either vector has zero norm, ``0.0`` is returned
    instead of dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    magnitude_a, magnitude_b = np.linalg.norm(vec1), np.linalg.norm(vec2)
    if not (magnitude_a != 0 and magnitude_b != 0):
        # A zero-length vector has no direction, so treat it as dissimilar.
        return 0.0
    denominator = magnitude_a * magnitude_b
    return numerator / denominator

# Score every document row against the query vector.
cosine_sim = [cosine_similarity_scratch(query_tfidf, doc_tfidf) for doc_tfidf in tfidf_matrix]

# Sorting the negated scores ascending gives a descending ranking; the slice
# keeps at most five entries and fewer when the corpus is smaller.
ranked_indices = np.argsort(-np.array(cosine_sim))[:5]

# The rank column is sized from the number of hits actually returned so all
# columns have equal length even when there are fewer than five documents.
results = pd.DataFrame({
    "Rank": range(1, len(ranked_indices) + 1),
    "Document": [documents[i] for i in ranked_indices],
    "Score": [cosine_sim[i] for i in ranked_indices]
})

print("Query:", query)
print("\nTop 5 Matching Documents:")
print(results.to_string(index=False))