# This code implementation is a"Basic Document Similarity Search". It demonstrates how the Vector Space Model with TF-IDF can be used to calculate the similarity between a query and a set of documents. 


By comparing the TF-IDF vectors using cosine similarity, the code identifies the most similar document to the query from the given collection.

# Implementation without library

In [16]:
import math

In [17]:
# Function to calculate term frequency (TF)
def calculate_tf(term, document):
    term_frequency = document.count(term)
    return term_frequency


In [18]:
# Function to calculate inverse document frequency (IDF)
def calculate_idf(term, documents):
    total_documents = len(documents)
    documents_with_term = sum(1 for doc in documents if term in doc)
    if documents_with_term > 0:
        inverse_document_frequency = math.log(total_documents / documents_with_term)
    else:
        inverse_document_frequency = 0
    return inverse_document_frequency

In [19]:
# Function to calculate TF-IDF for a term in a document
def calculate_tfidf(term, document, documents):
    tf = calculate_tf(term, document)
    idf = calculate_idf(term, documents)
    tfidf = tf * idf
    return tfidf



In [37]:
# Sample documents
documents = [
    "This is my first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Sample query
query = "This is the first document."


In [38]:
# Calculate TF-IDF for each term in the query
query_terms = query.lower().split()
query_tfidf = []
for term in query_terms:
    tfidf = calculate_tfidf(term, query, documents)
    query_tfidf.append(tfidf)


In [40]:
query_tfidf

[0.0, 0.0, 0.28768207245178085, 0.6931471805599453, 0.6931471805599453]

In [41]:
# Calculate TF-IDF for each term in the documents
documents_tfidf = []
for doc in documents:
    doc_terms = doc.lower().split()
    doc_tfidf = []
    for term in query_terms:
        tfidf = calculate_tfidf(term, doc, documents)
        doc_tfidf.append(tfidf)
    documents_tfidf.append(doc_tfidf)



In [42]:
# Compare the query TF-IDF with document TF-IDF using cosine similarity
similarities = []
for doc_tfidf in documents_tfidf:
    dot_product = sum(x * y for x, y in zip(query_tfidf, doc_tfidf))
    query_norm = math.sqrt(sum(x ** 2 for x in query_tfidf))
    doc_norm = math.sqrt(sum(x ** 2 for x in doc_tfidf))
    cosine_similarity = dot_product / (query_norm * doc_norm)
    similarities.append(cosine_similarity)



In [44]:
# Retrieve the most similar document
most_similar_index = similarities.index(max(similarities))
most_similar_document = documents[most_similar_index]

print("Query:", query)
print("Most Similar Document:", most_similar_document)

Query: This is the first document.
Most Similar Document: This is my first document.


# Implementation with library

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [46]:
# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]



In [47]:
# Sample query
query = "This is the first document."


In [49]:
# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix for the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Compute the TF-IDF matrix for the query
query_tfidf = vectorizer.transform([query])

# Calculate cosine similarity between the query and documents
similarities = cosine_similarity(query_tfidf, tfidf_matrix)

# Retrieve the most similar document
most_similar_index = similarities.argmax()
most_similar_document = documents[most_similar_index]

print("Query:", query)
print("Most Similar Document:", most_similar_document)

Query: This is the first document.
Most Similar Document: This is the first document.


## Doing with and without Normalisation

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents
documents = [
    "The sun is shining brightly.",
    "The sun is a star at the center of the solar system.",
    "The moon reflects the light of the sun.",
    "Stars are distant suns in the universe."
]

# Sample query
query = "The bright sun is shining."

# Create an instance of TfidfVectorizer without normalization
vectorizer_no_norm = TfidfVectorizer(norm=None)

# Compute the TF-IDF matrix without normalization
tfidf_matrix_no_norm = vectorizer_no_norm.fit_transform(documents)

# Compute the TF-IDF matrix for the query without normalization
query_tfidf_no_norm = vectorizer_no_norm.transform([query])

# Calculate cosine similarity between the query and documents without normalization
similarities_no_norm = cosine_similarity(query_tfidf_no_norm, tfidf_matrix_no_norm)

# Create an instance of TfidfVectorizer with log normalization
vectorizer_log_norm = TfidfVectorizer(norm='l2', sublinear_tf=True)

# Compute the TF-IDF matrix with log normalization
tfidf_matrix_log_norm = vectorizer_log_norm.fit_transform(documents)

# Compute the TF-IDF matrix for the query with log normalization
query_tfidf_log_norm = vectorizer_log_norm.transform([query])

# Calculate cosine similarity between the query and documents with log normalization
similarities_log_norm = cosine_similarity(query_tfidf_log_norm, tfidf_matrix_log_norm)

print("Without Normalization:")
print("Query:", query)
print("Similarities:", similarities_no_norm)
print()

print("With Log Normalization:")
print("Query:", query)
print("Similarities:", similarities_log_norm)


Without Normalization:
Query: The bright sun is shining.
Similarities: [[0.83491976 0.40334597 0.31705854 0.07167612]]

With Log Normalization:
Query: The bright sun is shining.
Similarities: [[0.83491976 0.37655846 0.28220736 0.07167612]]


##  ------------------------------------------------------------ THANK YOU --------------------------------------------------