In [174]:
%pip install nltk

import os
import math
from collections import defaultdict
import re
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: C:\Users\Ishan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]  

In [175]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [176]:
def preprocess(text):
    # Lowercase
    text = text.lower()
    
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    text = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    
    # Stemming
    stemmer = nltk.stem.PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    
    return text

In [177]:
def read_documents(directory):
    documents = {}
    try:
        for filename in os.listdir(directory):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                tokenized = preprocess(text)
                documents[filename] = tokenized
    except Exception as e:
        print('An error occurred:', e)
    return documents

In [178]:
def create_dictionary(documents):
    dictionary = set()
    for document in documents.values():
        dictionary.update(document)
    return dictionary

In [179]:
# Computing the term frequency

def term_frequency(documents):
    tf = defaultdict(lambda: defaultdict(int))
    for filename, tokens in documents.items():
        for token in tokens:
            tf[filename][token] += 1
    return tf

# Weighted term frequency

def weighted_term_frequency(tf):
    return 1 + math.log10(tf) if tf > 0 else 0

In [180]:
# Compute inverse-document frequency

def inverse_document_frequency(documents):
    idf = defaultdict(int)
    for doc_id, doc in documents.items():
        for term in set(doc):
            idf[term] += 1
    for term, freq in idf.items():
        idf[term] = math.log10(len(documents) / freq)
    return idf

# Compute tf-idf
def tf_idf(documents):
    tf = term_frequency(documents)
    weighted_tf = weighted_term_frequency(tf)
    idf = inverse_document_frequency(documents)
    tf_idf = defaultdict(dict)
    for doc_id, doc in documents.items():
        for term in doc:
            tf_idf[doc_id][term] = weighted_tf[doc_id][term] * idf[term]
    return tf_idf


In [181]:
def postings_list(documents, unique_words):
    postings = defaultdict(list)
    tf = term_frequency(documents)
    
    for word in unique_words:
        for filename, tokens in documents.items():
            if word in tokens:
                try:
                    wt_tf = weighted_term_frequency(tf[filename][word])
                except KeyError:
                    print(f"Warning: Term '{word}' not found in document {filename}")
                    wt_tf = 0
                postings[word].append((filename, wt_tf))
    
    return postings

In [182]:
def doc_length(documents, tf):
    doc_lengths = defaultdict(float)
    for filename, terms in tf.items():
        length = 0
        for term, freq in terms.items():
            length += (1 + math.log10(freq)) ** 2
        doc_lengths[filename] = math.sqrt(length)
    return doc_lengths

In [183]:
def query_weight_terms(query_terms, posting_list, N):
    query_weight = {}
    for term in query_terms:
        if term in posting_list:
            query_weight[term] = (1 + math.log10(query_terms.count(term))) * math.log10(N / len(posting_list[term]))
    return query_weight

In [184]:
# Calculate cosine similarity for the given document
def cosine_similarity(query_wt, doc_wt, doc_len, doc_id):
    similarity = {}

    # Calculate dot product of query and document weights
    dot_product = 0
    for term in query_wt:
        if term in doc_wt:
            dot_product += query_wt[term] * doc_wt[term]
    
    # Calculate magnitude of the query vector
    query_magnitude = 0
    for weight in query_wt.values():
        query_magnitude += weight ** 2
    query_magnitude = math.sqrt(query_magnitude)
    
    # Calculate cosine similarity
    similarity[doc_id] = dot_product / (query_magnitude * doc_len[doc_id])
    
    return similarity


In [185]:
def calculate_tf(freq):
    return 1 + math.log10(freq) if freq > 0 else 0

def rank_documents(query_weights, inverted_index, doc_lengths):
    scores = defaultdict(float)
    
    for term, query_weight in query_weights.items():
        if term in inverted_index:
            postings = inverted_index[term]
            for filename, freq in postings:
                doc_term_weight = calculate_tf(freq)
                scores[filename] += query_weight * doc_term_weight
    
    for filename in scores:
        if doc_lengths[filename] != 0:
            scores[filename] /= doc_lengths[filename]
        else:
            scores[filename] = 0
    
    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    
    return ranked_docs

In [187]:
def index_corpus(corpus_path):
    documents = read_documents(corpus_path)
    unique_words = create_dictionary(documents)
    inverted_index = postings_list(documents, unique_words)
    tf = term_frequency(documents)
    doc_lengths = doc_length(documents, tf)
    N = len(documents)
    
    return inverted_index, doc_lengths, N


def search(query, inverted_index, doc_lengths, N):
    processed_query = preprocess(query)
    query_weights = query_weight_terms(processed_query, inverted_index, N)
    ranked_docs = rank_documents(query_weights, inverted_index, doc_lengths)
    
    # Format the output
    formatted_results = []
    for filename, score in ranked_docs[:10]:  # Get top 10 results
        formatted_results.append(f"{filename}: {score:.4f}")
    
    return formatted_results

# Example usage
corpus_path = 'Corpus'
inverted_index, doc_lengths, N = index_corpus(corpus_path)
query = 'Developing your Zomato business account and profile is a great way to boost your restaurant\'s online reputation'
ranked_docs = search(query, inverted_index, doc_lengths, N)

# Print the results
for result in ranked_docs:
    print(result)

zomato.txt: 0.3561
swiggy.txt: 0.2728
bing.txt: 0.1343
Discord.txt: 0.1201
messenger.txt: 0.1198
youtube.txt: 0.1155
instagram.txt: 0.1031
paypal.txt: 0.1014
reddit.txt: 0.0973
shakespeare.txt: 0.0922
