In [88]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
doc1 = "This is document 1. It contains some text for testing."
doc2 = "Document 2 has different content for testing purposes."
doc3 = "The third document is here with unique words."

In [None]:
#let's make documents list
documents = [doc1,doc2,doc3]

In [None]:
documents

['This is document 1. It contains some text for testing.',
 'Document 2 has different content for testing purposes.',
 'The third document is here with unique words.']

In [None]:
#sample query
query = "This is a test query for information retrieval."

In [None]:
#tokenize the documents and query
query_tokens = word_tokenize(query.lower())
print(query_tokens)

document_tokens = [word_tokenize(doc.lower()) for doc in documents]
print(document_tokens)

['this', 'is', 'a', 'test', 'query', 'for', 'information', 'retrieval', '.']
[['this', 'is', 'document', '1.', 'it', 'contains', 'some', 'text', 'for', 'testing', '.'], ['document', '2', 'has', 'different', 'content', 'for', 'testing', 'purposes', '.'], ['the', 'third', 'document', 'is', 'here', 'with', 'unique', 'words', '.']]


In [None]:
#create tf-idf vectorize
vecs = TfidfVectorizer()

In [72]:
#calculate tf-idf scores for query and documents
tfidf_matrix = vecs.fit_transform([" ".join(doc) for doc in document_tokens])

query_tfidf = vecs.transform([" ".join(query_tokens)])   #to get tfidf only for matching words in the query and the document using only transorm command

print(query_tfidf)



  (0, 15)	0.680918560398684
  (0, 7)	0.5178561161676974
  (0, 4)	0.5178561161676974


In [70]:
query_tfidf.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.51785612,
        0.        , 0.        , 0.51785612, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.68091856, 0.        , 0.        , 0.        ]])

In [None]:
tfidf_matrix.toarray()

array([[0.37571621, 0.        , 0.        , 0.22190405, 0.28574186,
        0.        , 0.        , 0.28574186, 0.37571621, 0.        ,
        0.37571621, 0.28574186, 0.37571621, 0.        , 0.        ,
        0.37571621, 0.        , 0.        , 0.        ],
       [0.        , 0.4261835 , 0.4261835 , 0.25171084, 0.32412354,
        0.4261835 , 0.        , 0.        , 0.        , 0.4261835 ,
        0.        , 0.32412354, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.22440141, 0.        ,
        0.        , 0.37994462, 0.28895767, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.37994462, 0.37994462,
        0.        , 0.37994462, 0.37994462, 0.37994462]])

In [87]:
 #to calculate cosine similarity between query and documents
cosine_similarities = cosine_similarity(tfidf_matrix,query_tfidf)   #this cosine similarity score shows only for matching values
print(cosine_similarities)

[[0.55177848]
 [0.16784936]
 [0.1496385 ]]


# Champion List

Find what documents are the highest occurence in that particular term ,and ranked them according to the desending order

In [92]:
#create a champion list for each query term
champion_lists = {}

for i, term in enumerate(query_tokens):
  term_documents = tfidf_matrix[:,i].toarray().flatten()
  champion_list = np.argsort(term_documents)[::-1][:3]  # Adjsut the number of champions as needeede
  champion_lists[term] = champion_list

print(champion_lists)


{'this': array([0, 2, 1]), 'is': array([1, 2, 0]), 'a': array([1, 2, 0]), 'test': array([1, 2, 0]), 'query': array([1, 0, 2]), 'for': array([1, 2, 0]), 'information': array([2, 1, 0]), 'retrieval': array([2, 0, 1]), '.': array([0, 2, 1])}


In [94]:
#Initialize a set to store the selected documents
selected_documents = set()

#Add documenta from champion lists to the selected set
for term in query_tokens:
  selected_documents.update(champion_lists.get(term,[]))

#sort the selected documents by cosine similarity
selected_documents = list(selected_documents)
selected_documents.sort(key=lambda idx: -cosine_similarities[idx])

#print ranked documents
print("Ranked documents : ")
for idx in selected_documents:
  print(f"Documents {idx + 1}: {documents[idx]}")

Ranked documents : 
Documents 1: This is document 1. It contains some text for testing.
Documents 2: Document 2 has different content for testing purposes.
Documents 3: The third document is here with unique words.
