In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')

In [2]:
import math
import os
import re
import difflib
from collections import Counter, defaultdict
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))

Loading the Corpus...

This cell reads all the text files from the Corpus folder and stores them in a dictionary.

The key is the filename (ex: zomato.txt). And, the value is the text content of the corresponding document.

It also prints the total number of documents loaded to confirm that the dataset is ready for processing.

In [3]:
# Initializing an empty dictionary to store the documents
corpus = {}
# Defining the folder path
folder_path = 'Corpus'

# Looping through each file in the Corpus directory, and reading the content of the files
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus[filename] = file.read()

# Printing the total number of documents loaded
print(f"Total documents loaded: {len(corpus)}")

Total documents loaded: 41


Preprocessing text documents...

In [4]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    return tokens

Mapping filenames...

In [5]:
filenames = sorted(corpus.keys())   # ensuring consistent ordering
file2id = {fname: i+1 for i, fname in enumerate(filenames)}   # starting the docIDs from 1
id2file = {i+1: fname for i, fname in enumerate(filenames)}

Setting up Inverted Index and Document Lengths...

In [6]:
inverted_index = defaultdict(list)
doc_lengths = defaultdict(float)
for fname, content in corpus.items():
    doc_id = file2id[fname]   
    tokens = preprocess(content)
    tf = Counter(tokens)
    for term, freq in tf.items():
        inverted_index[term].append((doc_id, freq))
    doc_lengths[doc_id] = sum((1 + math.log10(f))**2 for f in tf.values())**0.5

Computing Document Frequency and IDF...

In [7]:
N = len(corpus)
idf = {}
for term, postings in inverted_index.items():
    df = len(postings)
    idf[term] = math.log10(N / df)

Implementing the Soundex Algorithm...

In [8]:
def soundex(name):
    name = name.upper()
    mapping = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
    sound = name[0]
    for char in name[1:]:
        for key in mapping:
            if char in key:
                code = mapping[key]
                if code != sound[-1]:
                    sound += code
                break
        else:
            sound += '0'
    sound = sound[:4].ljust(4, '0')
    return sound    

In [9]:
# Building a Soundex dictionary from terms in the inverted index
soundex_dict = defaultdict(set)
for term in inverted_index.keys():
    code = soundex(term)
    soundex_dict[code].add(term)
# Printing some soundex codes and terms (some examples)
print("Some Soundex entries:")
for code, terms in list(soundex_dict.items())[:5]:
    print(f"{code}: {list(terms)[:5]}")

Some Soundex entries:
W003: ['whatsapp', 'waiting', 'weather', 'whether', 'what']
I200: ['ishihara', 'ig', 'igoogle', 'issues', 'ico']
L010: ['level', 'labour', 'lobbyist', 'living', 'lifewire']
W043: ['weltsek', 'wilderness', 'walter', 'walt', 'wild']
A000: ['a', 'away', 'ahead', 'ai']


Carrying out Query Processing with Soundex Integration...

In [10]:
def process_query(query):
    tokens = preprocess(query)
    tf = Counter(tokens)
    query_vec = defaultdict(float)
    for term, freq in tf.items():
        if term in idf:
            # Calculating the TF-IDF score for the term if it exists in the index
            query_vec[term] += (1 + math.log10(freq)) * idf[term]
        else:
            # If the term is not found, check Soundex matches
            code = soundex(term)
            similar_terms = soundex_dict.get(code, [])
            if similar_terms:
                weight = (1 + math.log10(freq))
                # Spreading the weight equally among similar terms
                for sim_term in similar_terms:
                    query_vec[sim_term] += weight * idf.get(sim_term, 0)
            else:
                pass	
    # Normalizing the query vector
    norm = sum(v**2 for v in query_vec.values())**0.5
    if norm > 0:
        for term in query_vec:
            query_vec[term] /= norm
    return query_vec

Computing Cosine Similarity between the query vector and each document vector...

In [11]:
def cosine_similarity(query_vec):
    scores = defaultdict(float)
    for term, q_weight in query_vec.items():
        for doc_id, freq in inverted_index.get(term, []):
            d_weight = 1 + math.log10(freq)
            scores[doc_id] += q_weight * d_weight
    for doc_id in scores:
        scores[doc_id] /= doc_lengths[doc_id]
    return sorted(scores.items(), key=lambda x: (-x[1], x[0]))

Searching and Ranking Documents...

In [12]:
def search(query):
    query_vec = process_query(query)
    results = cosine_similarity(query_vec)
    print("The Top results:")
    for i, (doc_id, score) in enumerate(results[:10], start=1):
        print(f"{i}. {id2file[doc_id]}: {score:.4f}") 

In [13]:
query1 = "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation"
query2 = "Warwickshire, came from an ancient family and was the heiress to some land"
print("Query 1 Results -")
search(query1)
print("\nQuery 2 Results -")
search(query2)

Query 1 Results -
The Top results:
1. zomato.txt: 0.1825
2. swiggy.txt: 0.0919
3. messenger.txt: 0.0630
4. instagram.txt: 0.0612
5. reddit.txt: 0.0518
6. skype.txt: 0.0468
7. youtube.txt: 0.0467
8. bing.txt: 0.0463
9. yahoo.txt: 0.0372
10. Uber.txt: 0.0354

Query 2 Results -
The Top results:
1. shakespeare.txt: 0.1081
2. levis.txt: 0.0275
3. nike.txt: 0.0204
4. huawei.txt: 0.0170
5. zomato.txt: 0.0160
6. Adobe.txt: 0.0158
7. blackberry.txt: 0.0158
8. reliance.txt: 0.0143
9. skype.txt: 0.0136
10. Uber.txt: 0.0117


## Additional Modifications

- Stopword Removal

In [14]:
# Loading English stopwords
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    # Removing stopwords
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

- Lemmatization

In [15]:
lemmatizer = WordNetLemmatizer()
# Mapping POS tag to WordNet POS
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    pos_tags = nltk.pos_tag(tokens) 
    tokens = [lemmatizer.lemmatize(t, get_wordnet_pos(tag)) for t, tag in pos_tags]
    return tokens

- Synonym Expansion (WordNet)

In [16]:
def expand_with_synonyms(term):
    synonyms = set()
    for syn in wordnet.synsets(term):
        for lemma in syn.lemmas():
            word = lemma.name().lower().replace("_", " ")
            if word != term:
                synonyms.add(word)
    return list(synonyms)
# Printing examples
print(expand_with_synonyms("car"))

['railway car', 'cable car', 'auto', 'automobile', 'machine', 'railcar', 'elevator car', 'railroad car', 'motorcar', 'gondola']


- Spell Correction

In [17]:
def suggest_term(term):
    candidates = inverted_index.keys()
    closest = difflib.get_close_matches(term, candidates, n=1, cutoff=0.7)
    return closest[0] if closest else None
# Printing an example
print(suggest_term("instagraam")) 

instagram


- Highlighting Results

In [18]:
def highlight_snippet(doc_id, query_terms, max_len=120):
    text = corpus[id2file[doc_id]]  # Original text
    snippet = text[:max_len]
    for term in query_terms:
        snippet = re.sub(f"\\b{term}\\b", f"[{term.upper()}]", snippet, flags=re.IGNORECASE)
    return snippet + "..."
# Modified search with highlighting
def search(query):
    query_vec = process_query(query)
    results = cosine_similarity(query_vec)
    tokens = preprocess(query)
    print("The Top results:")
    for i, (doc_id, score) in enumerate(results[:10], start=1):
        fname = id2file.get(doc_id, doc_id)   
        snippet = highlight_snippet(doc_id, tokens)
        print(f"{i}. {fname} (score={score:.4f})")
        print(f"   Snippet: {snippet}\n")

In [None]:
# Testing queries
# 1: Regular query (should work as before, but with stopwords removed & stemming applied)
print("Query 1: Zomato business account:")
search("Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation")
 
# 2: Shakespeare query (historical style, synonym expansion could be applied)
print("\nQuery 2: Warwickshire heritage:")
search("Warwickshire, came from an ancient family and was the heiress to some land")

# 3: Misspelled terms (Soundex + spell correction should help in fixing these)
print("\nQuery 3: Misspelled words:")
search("Instagraam messanger zomato restraunt profle")

# 4: Synonym expansion (WordNet should expand 'car')
print("\nQuery 4: Synonym test:")
search("car company technology")

# 5: Stopwords removal check (searching with filler words should still give good results)
print("\nQuery 5: Stopwords check:")
search("the and of in zomato business the restaurant profile")

Query 1: Zomato business account:
The Top results:
1. zomato.txt (score=0.1734)
   Snippet: If you are [A] [RESTAURANT] owner or marketing manager for [A] [RESTAURANT], you’ll love [ZOMATO]. But what is [ZOMATO], exactly, an...

2. swiggy.txt (score=0.1030)
   Snippet: What Is Swiggy [AND] How It’s Working?

Swiggy is one of the topmost players in the [ONLINE] food ordering & delivery indust...

3. messenger.txt (score=0.0646)
   Snippet: what is messenger?

Users simply download the app [TO] their mobile/tablet device; the app can also [BE] used on desktop. Us...

4. instagram.txt (score=0.0622)
   Snippet: What is instagram?

With over [A] billion registered accounts, Instagram, which was bought by Facebook in 2012, has become...

5. reddit.txt (score=0.0534)
   Snippet: What is reddit?

If you spend [A] lot of time [ONLINE], chances are you’ve heard of Reddit. The site bills itself as the “fr...

6. skype.txt (score=0.0490)
   Snippet: What is skype?

Skype is [A] VoIP service that 