In [233]:
%pip install nltk

import os
import math
from collections import defaultdict
import re
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: C:\Users\Ishan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]  

In [234]:
# Fetch the individual NLTK resources by name. The first cell already calls
# nltk.download('all'), so these calls only re-check packages that are present.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# The last expression's return value is what the cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [235]:
def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The pipeline runs in this order: lowercase, delete every character that
    is neither a word character nor whitespace, delete digit runs, tokenize
    with ``word_tokenize``, drop English stop words, lemmatize with
    ``WordNetLemmatizer`` and finally stem with ``PorterStemmer``.

    Args:
        text: The string to normalise.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    # Character-level clean-up happens before tokenisation.
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    tokens = word_tokenize(without_digits)

    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = nltk.stem.PorterStemmer()

    # Filter, lemmatize and stem in a single pass over the tokens.
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in english_stops
    ]

In [236]:
# Read documents from a directory
def read_documents(directory):
    """Read and preprocess every regular file directly inside ``directory``.

    Entries are visited in sorted filename order so the resulting mapping is
    built in the same order on every run. Sub-directories and other
    non-file entries are skipped. A file that cannot be opened or decoded as
    UTF-8 is reported and skipped on its own, so one bad file no longer
    aborts indexing of the remaining files.

    Args:
        directory: Path of the folder holding the corpus files.

    Returns:
        A dict mapping each filename to the token list produced by
        ``preprocess``. If the directory itself cannot be listed, the error
        is printed and an empty dict is returned.
    """
    documents = {}

    try:
        filenames = sorted(os.listdir(directory))
    except OSError as e:
        print('An error occurred:', e)
        return documents

    for filename in filenames:
        path = os.path.join(directory, filename)
        # open() on a directory would raise; only regular files are documents.
        if not os.path.isfile(path):
            continue
        try:
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f'An error occurred while reading {filename}:', e)
            continue
        documents[filename] = preprocess(text)

    return documents

In [237]:
# Creating the dictionary of all words in the documents
def create_dictionary(documents):
    """Collect the vocabulary of the corpus.

    Args:
        documents: Mapping of document name to its iterable of tokens.

    Returns:
        A set containing every distinct token found in any document.
    """
    return {token for tokens in documents.values() for token in tokens}

In [238]:
# Computing the term frequency

def term_frequency(documents):
    """Count raw token occurrences per document.

    Args:
        documents: Mapping of document name to its list of tokens.

    Returns:
        A nested defaultdict ``counts[filename][token] -> int``. A document
        with no tokens gets no entry.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for filename, tokens in documents.items():
        for token in tokens:
            # The per-document table is created on first access by the defaultdict.
            per_document = counts[filename]
            per_document[token] = per_document[token] + 1
    return counts

# Weighted term frequency

def weighted_term_frequency(tf):
    """Return the log-scaled weight ``1 + log10(tf)`` for a raw count.

    Args:
        tf: Raw occurrence count of a term.

    Returns:
        ``1 + log10(tf)`` when ``tf`` is positive, otherwise ``0``.
    """
    if tf > 0:
        return 1 + math.log10(tf)
    return 0

In [239]:
def calculate_document_frequencies(posting_list):
    """Derive each term's document frequency from its postings.

    Args:
        posting_list: Mapping of term to its list of postings.

    Returns:
        A dict mapping each term to the number of entries in its postings
        list.
    """
    return {term: len(postings) for term, postings in posting_list.items()}


In [240]:
# Building the postings list: term -> [(document, weighted term frequency), ...]
def postings_list(documents, unique_words):
    """Build the inverted index with log-weighted term frequencies.

    Each document's distinct terms are visited once, instead of testing every
    vocabulary word against every document's token list, so the cost is
    proportional to the corpus size rather than vocabulary x documents x
    document length.

    Args:
        documents: Mapping of document name to its list of tokens.
        unique_words: Iterable of terms to index. Terms that occur in no
            document get no entry.

    Returns:
        A ``defaultdict(list)`` mapping each indexed term to a list of
        ``(filename, weighted_tf)`` tuples. Terms are inserted in the
        iteration order of ``unique_words`` and each list follows the
        iteration order of ``documents``.
    """
    # Raw counts per document; only documents with at least one token appear.
    tf = term_frequency(documents)

    # Pass 1: group postings by term, walking every document's counts once.
    by_term = defaultdict(list)
    for filename, counts in tf.items():
        for word, count in counts.items():
            by_term[word].append((filename, weighted_term_frequency(count)))

    # Pass 2: keep only the requested terms, in the caller's iteration order.
    postings = defaultdict(list)
    for word in unique_words:
        if word in by_term:
            postings[word] = by_term[word]

    return postings

In [241]:
# Computing the document length

def doc_length(tf):
    """Compute the Euclidean length of each document's log-tf vector.

    Every positive count contributes ``(1 + log10(count)) ** 2``. Counts that
    are zero or negative contribute nothing: a term with no occurrences has
    weight 0, and ``log10`` is undefined there. Zero entries appear easily
    because looking up a missing term in a ``defaultdict(int)`` inserts one.

    Args:
        tf: Mapping ``filename -> {term: raw count}``.

    Returns:
        A ``defaultdict(float)`` mapping each filename to the square root of
        the summed squared weights (``0.0`` for a document with no terms).
    """
    doc_lengths = defaultdict(float)
    for filename, terms in tf.items():
        squared_sum = sum(
            (1 + math.log10(freq)) ** 2
            for freq in terms.values()
            if freq > 0
        )
        doc_lengths[filename] = math.sqrt(squared_sum)
    return doc_lengths

In [243]:
# Calculate cosine similarity for the given document
def cosine_similarity(query_wt, doc_wt, doc_len, doc_id):
    """Score one document against the query.

    The score is the dot product of the two weight vectors divided by the
    product of the query vector's magnitude and the document length looked
    up in ``doc_len``.

    Args:
        query_wt: Mapping of term to query weight.
        doc_wt: Mapping of term to document weight.
        doc_len: Mapping of document id to its precomputed vector length.
        doc_id: Identifier of the document being scored.

    Returns:
        A single-entry dict ``{doc_id: score}``. The score is ``0.0`` when
        either the query magnitude or the document length is zero (for
        example a query with no indexed terms, or an empty document) instead
        of raising ``ZeroDivisionError``.
    """
    # Only terms present in both vectors contribute to the dot product.
    dot_product = 0
    for term, weight in query_wt.items():
        if term in doc_wt:
            dot_product += weight * doc_wt[term]

    query_magnitude = math.sqrt(sum(weight ** 2 for weight in query_wt.values()))

    denominator = query_magnitude * doc_len[doc_id]
    if denominator == 0:
        # A zero-length vector matches nothing; define the score as 0.
        return {doc_id: 0.0}

    return {doc_id: dot_product / denominator}


In [244]:
def calculate_tf(freq):
    """Log-normalise a raw term count.

    Args:
        freq: Raw occurrence count of a term.

    Returns:
        ``1 + log10(freq)`` for a positive count, otherwise ``0``.
    """
    if freq > 0:
        return 1 + math.log10(freq)
    return 0

def calculate_idf(df, N):
    """Return the inverse document frequency ``log10(N / df)``.

    Args:
        df: Number of documents containing the term.
        N: Total number of documents.

    Returns:
        The base-10 logarithm of ``N / df``.
    """
    ratio = N / df
    return math.log10(ratio)

def rank_documents(documents, query, posting_list, document_frequencies, unique_words):
    """Rank every document against ``query`` by cosine similarity.

    Query terms are weighted with log-tf times idf; document terms use the
    log-tf weights stored in ``posting_list``. Only terms that actually occur
    in the query are weighted and looked up. Every other vocabulary term has
    query weight zero and cannot affect the score, so the vocabulary is no
    longer scanned per document.

    Args:
        documents: Mapping of document name to its list of tokens.
        query: Raw query string; it is passed through ``preprocess``.
        posting_list: Mapping of term to a list of ``(doc, log_tf)`` tuples.
        document_frequencies: Mapping of term to its document frequency.
        unique_words: Corpus vocabulary. Accepted for backward compatibility;
            it is not consulted because ``document_frequencies`` already
            determines which query terms are indexed.

    Returns:
        A list of ``(doc_name, score)`` tuples covering every document,
        sorted by descending score and then by document name. A document
        scores ``0.0`` when the query has no weighted terms or the document
        has zero length.
    """
    N = len(documents)  # Total number of documents
    query_tokens = preprocess(query)

    # Raw term counts of the query.
    query_counts = defaultdict(int)
    for token in query_tokens:
        query_counts[token] += 1

    # Query tf-idf weights. Sorted iteration keeps the floating-point
    # summation order independent of string-hash randomisation.
    query_vector = {}
    for word in sorted(query_counts):
        df = document_frequencies.get(word, 0)
        if df > 0:
            query_vector[word] = calculate_tf(query_counts[word]) * calculate_idf(df, N)

    # Document lengths over the full postings (for cosine normalisation).
    doc_lengths = defaultdict(float)
    for postings in posting_list.values():
        for doc, log_tf in postings:
            doc_lengths[doc] += log_tf ** 2
    for doc in doc_lengths:
        doc_lengths[doc] = math.sqrt(doc_lengths[doc])

    # Per-document weights, restricted to the weighted query terms.
    doc_vectors = defaultdict(dict)
    for word in query_vector:
        for doc, log_tf in posting_list.get(word, []):
            doc_vectors[doc][word] = log_tf

    query_norm = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))

    similarities = {}
    for doc_name in documents:
        # Avoid dividing by zero for an unmatched query or an empty document.
        if query_norm == 0 or doc_lengths.get(doc_name, 0.0) == 0:
            similarities[doc_name] = 0.0
            continue
        doc_vector = doc_vectors.get(doc_name, {})
        similarities.update(
            cosine_similarity(query_vector, doc_vector, doc_lengths, doc_name)
        )

    # Highest score first; ties are broken alphabetically by document name.
    ranked_docs = sorted(similarities.items(), key=lambda item: (-item[1], item[0]))
    return ranked_docs

In [247]:
def index_corpus(corpus_path):
    """Build every index structure needed to search a corpus directory.

    Args:
        corpus_path: Directory containing the corpus files.

    Returns:
        A 4-tuple ``(documents, posting_list, document_frequencies,
        unique_words)``.
    """
    docs = read_documents(corpus_path)
    vocabulary = create_dictionary(docs)
    postings = postings_list(docs, vocabulary)
    frequencies = calculate_document_frequencies(postings)
    return docs, postings, frequencies, vocabulary

def search(query, documents, posting_list, document_frequencies, unique_words, top_k=10):
    """Run a query and format the best-matching documents.

    Args:
        query: Raw query string.
        documents: Mapping of document name to its list of tokens.
        posting_list: Mapping of term to a list of ``(doc, log_tf)`` tuples.
        document_frequencies: Mapping of term to its document frequency.
        unique_words: Corpus vocabulary.
        top_k: Maximum number of results to return. Defaults to 10, the
            previously hard-coded cutoff. ``None`` returns every document.

    Returns:
        A list of strings of the form ``"<filename>: <score>"`` with the
        score rendered to four decimal places, best match first.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    ranked_docs = rank_documents(documents, query, posting_list, document_frequencies, unique_words)

    # Slicing with None keeps the full ranking.
    return [f"{filename}: {score:.4f}" for filename, score in ranked_docs[:top_k]]

# Example usage: index the files under the relative 'Corpus' directory once,
# then run two queries against the same index.
corpus_path = 'Corpus'
documents, posting_list, document_frequencies, unique_words = index_corpus(corpus_path)

# search() returns pre-formatted "<filename>: <score>" strings (top 10 results).
query1 = 'Developing your Zomato business account and profile is a great way to boost your restaurant\'s online reputation'
ranked_docs = search(query1, documents, posting_list, document_frequencies, unique_words)

query2 = "Warwickshire, came from an ancient family and was the heiress to some land"
ranked_docs2 = search(query2, documents, posting_list, document_frequencies, unique_words)

# Print the results, one formatted line per ranked document.
print("Ranked document by relevance to query 1:")
for result in ranked_docs:
    print(result)
    
# Separator between the two result lists.
print("\n")

print("Ranked document by relevance to query 2:")
for result in ranked_docs2:
    print(result)

Ranked document by relevance to query 1:
zomato.txt: 0.2163
swiggy.txt: 0.1350
messenger.txt: 0.0608
instagram.txt: 0.0598
Discord.txt: 0.0548
bing.txt: 0.0524
youtube.txt: 0.0494
paypal.txt: 0.0481
reddit.txt: 0.0454
flipkart.txt: 0.0416


Ranked document by relevance to query 2:
shakespeare.txt: 0.1235
levis.txt: 0.0260
google.txt: 0.0216
nike.txt: 0.0194
zomato.txt: 0.0178
Adobe.txt: 0.0167
huawei.txt: 0.0145
skype.txt: 0.0126
blackberry.txt: 0.0119
reliance.txt: 0.0111
