In [None]:
import os
import nltk
import math
from decimal import Decimal
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive  #to get the Research paper folder stored in drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/InfoRetrive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/InfoRetrive


In [None]:
#read stop words from the file
def read_stop_words(stop_words_file):
    with open(stop_words_file, 'r') as f: #opens the file
        stop_words = set(f.read().splitlines()) #reats the contents nad then splits them in lines
    return stop_words

stop_words_file = "/content/drive/MyDrive/InfoRetrive/Stopword-List.txt" #path for the stop words file

stop_words = read_stop_words(stop_words_file)
print(stop_words)

{'at', 'are', '', 'has', 'we', 'am', 'do', 'once ', 'to', 'for', 'in', 'of ', 'as', 'have', 'and ', 'had', 'no', 'can', 'up', 'his', 'her', 'the', 'be', 'a', 'is ', 'on', 'all'}


In [None]:
#loads all the files into a dictonary called document
def load_documents(folder_path):
    documents = {} #an empty disctinoary is created to store text with filenames as keys
    for filename in os.listdir(folder_path): # Iterate over each document
        with open(os.path.join(folder_path, filename), 'r', encoding='cp1252') as file: # cp1252 is windows text file encoding
            documents[filename] = file.read() #reads the text and store it in dictionary with filenames as key
    return documents


documents_folder = "/content/drive/MyDrive/InfoRetrive/ResearchPapers"#path of the folder with documnets
documents = load_documents(documents_folder)
print(documents)



In [None]:
# preprocessing the document dictionary

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')


def preprocess_document(text):
    url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    special_chars = r'[^\w\s]'
    number = r'\b\d+\b'

    words = re.sub(url, '', text) # URLs
    words = re.sub(special_chars, '', text)# special characters
    tokens = word_tokenize(text.lower())  # Tokenising the word and case folding which makes them all lower case
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]  # # Lemmatization
    tokens = [token for token in tokens if token not in stop_words]  # R removing stop words
    return tokens




preprocessed_documents = {}  #empty dictionary is created to store preprocessed documents
for doc_id, doc_text in documents.items(): #itterate through each documnt in documnt dictionary
    preprocessed_text = preprocess_document(doc_text) #call the function y
    preprocessed_documents[doc_id] = preprocessed_text #store the preprocessed tokens in the dicionary

print(preprocessed_documents)





[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [None]:
# calculating the TF-IDF value

def calculate_tf_idf(documents):
    term_frequency = defaultdict(int)  #term_frequency dictionary to store terms frequency (tf)
    document_frequency = defaultdict(int) #document_frequency dictionary to store documnt frequency (df)
    num_documents = len(documents) #total no. of documnts

    # Compute term frequency and document frequency
    for doc_id, tokens in documents.items(): #itterates through each doc and its tokens
        term_count = defaultdict(int) #term_count dictionary to store count of each term (cf)
        for token in tokens: #itterates through each token in the documnet
            term_count[token] += 1 # increments the count of that term

        for term, count in term_count.items():  #iterates through each term and its count
            term_frequency[(doc_id, term)] =  count / len(tokens) #calculates term frequeny
            document_frequency[term] += 1 #increments documnt frequency of that term


    inverse_document_frequency = {}  # Create an empty dictionary to store IDF values
    for term in document_frequency: # iterate throguht each term in the document_frequency dictionary
        idf = (math.log(num_documents / (document_frequency[term] + 1))) #calculate idf for the term
        inverse_document_frequency[term] = idf #store the idf value in inverse_document_frequency dictionary


    tf_idf = {}  # Create an empty dictionary to store TF_IDF values
    for (doc_id, term), tf in term_frequency.items():# iterate throught each (doc_id, term) pair and its corresponding term frequency in the term_frequency dictionary
        idf = inverse_document_frequency[term] #retrive idf vale of the term from inverse_document_frequency dictionary
        tfidf = (tf) * idf #calculate tdidf
        tf_idf[(doc_id, term)] = tfidf #store in td_idf dictionary with the (doc_id, term) pair as the key



    return tf_idf


tf_idf = calculate_tf_idf(preprocessed_documents)

print(tf_idf)




In [None]:
# represeningt documents and queries as vectors
def vectorize(tf_idf, terms):
    vector = [] #a vector is created to represent terms based off the idf values

    for term in terms:  # Iterate through each term
            found = False
            for key, value in tf_idf.items():  # Iterate through TF-IDF dictionary
                if key[1] == term:  # Check if the term matches the current term in the query
                    vector.append(value)  # Add the retrieved TF-IDF value to the vector list
                    found = True
                    break

            if not found:
                vector.append(0)  # If term isn't found, append 0 to the vector

    return vector



# compute cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    dot_product = sum(Decimal(a) * Decimal(b) for a, b in zip(vector1, vector2)) #computing the dot product
    magnitude1 = Decimal(sum(Decimal(a) ** 2 for a in vector1)).sqrt()
    magnitude2 = Decimal(sum(Decimal(b) ** 2 for b in vector2)).sqrt()
    if magnitude1 == 0 or magnitude2 == 0: #checking for zero division
        return 0
    return dot_product / (magnitude1 * magnitude2) #cosine similarity returned


In [None]:
#for the query input by user

def process_query(query, documents, tf_idf, threshold):
    relevant_documents = defaultdict(float) #relevant_documents dictionary that store rel docs using cosine similarity
    document_vectors = {}  # Store document vectors for the relevant documents


    preprocessed_query = preprocess_document(query)    # preprocess the query and get tokens
    print("Query Tokens:", preprocessed_query)


    query_vector = vectorize(tf_idf, preprocessed_query) # vectorise the query
    print("Query Vector:", query_vector)

    count=0
    for doc_id, doc_text in documents.items(): #itterate through each documnt
        doc_vector = vectorize(tf_idf, preprocess_document(doc_text)) #vectorizing each document
        count += 1  # Increment count
        # print(count,".Document ID:", doc_id)
        # print("Document Vector:", doc_vector)  # Debug: Print the document vector


        similarity = cosine_similarity(query_vector, doc_vector) #calculating cosine score between query and document vector
        print(f"{count}.Document ID: {doc_id}, Similarity: {similarity:.12f}")

        if similarity >= threshold: #if the similarity greater or equal to threshold
            relevant_documents[doc_id] = similarity #the documnet is considered relavent
            document_vectors[doc_id] = doc_vector  # store the document vector

    return relevant_documents,document_vectors

#running code
threshold = 0.005
query = input("Enter your query: ")
relevant_documents, document_vectors = process_query(query, documents, tf_idf, threshold)  # processing the query

# Print relevant documents
if not relevant_documents:
    print("No relevant documents found.")
else:
    print("Relevant Documents:")
    for doc_id, similarity in relevant_documents.items():
        print(f"Document ID: {doc_id}, Similarity: {similarity:.12f}")




Enter your query: machine learning
Query Tokens: ['machine', 'learning']
Query Vector: [0.0005461229662459459, 0.0008346407597343701]
1.Document ID: 1.txt, Similarity: 0.002694076620
2.Document ID: 2.txt, Similarity: 0.010296286953
3.Document ID: 3.txt, Similarity: 0.006214357535
4.Document ID: 16.txt, Similarity: 0.007514136564
5.Document ID: 17.txt, Similarity: 0.028385708210
6.Document ID: 18.txt, Similarity: -0.014663693642
7.Document ID: 24.txt, Similarity: 0.043393999288
8.Document ID: 25.txt, Similarity: 0.004139847999
9.Document ID: 26.txt, Similarity: 0.000404869097
10.Document ID: 21.txt, Similarity: 0.000345625669
11.Document ID: 22.txt, Similarity: -0.000222459122
12.Document ID: 23.txt, Similarity: 0.016336513237
13.Document ID: 7.txt, Similarity: 0.010368790607
14.Document ID: 8.txt, Similarity: 0.001473287372
15.Document ID: 9.txt, Similarity: 0.002001722275
16.Document ID: 11.txt, Similarity: -0.010051231594
17.Document ID: 12.txt, Similarity: -0.008419761918
18.Documen