<a href="https://colab.research.google.com/github/iami0npkr/Story/blob/main/Indexing_and_Boolean_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
# Step 1: Setting Up Google Colab and Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
# Step 2: Loading the Text Data
import os

base_path = "/content/drive/My Drive/IR_Assignment2"

# Paths for English and Hindi documents
english_path = os.path.join(base_path, "English")
hindi_path = os.path.join(base_path, "Hindi")

print("English Documents:", os.listdir(english_path))
print("Hindi Documents:", os.listdir(hindi_path))


English Documents: ['1050106_opinion_story_4215839.utf8', '1050123_opinion_story_4288590.utf8', '1050115_opinion_story_4244786.utf8', '1050112_opinion_story_4225105.utf8', '1050118_opinion_story_4173999.utf8', '1050102_opinion_story_4189087.utf8', '1050121_opinion_story_4278457.utf8', '1050120_opinion_story_4273779.utf8', '1050123_opinion_story_4288675.utf8', '1050124_opinion_story_4289352.utf8', '1050112_opinion_story_4240758.utf8', '1050116_opinion_story_4257986.utf8', '1050107_opinion_story_4221153.utf8', '1050118_opinion_story_4253867.utf8', '1050114_opinion_story_4249175.utf8', '1050107_opinion_story_4224619.utf8', '1050116_opinion_index.utf8', '1050122_opinion_story_4274525.utf8', '1050106_opinion_story_4216857.utf8', '1050126_opinion_story_4297794.utf8', '1050120_opinion_story_4258179.utf8', '1050107_opinion_story_4215937.utf8', '1050112_opinion_story_4240756.utf8', '1050111_opinion_story_4229804.utf8', '1050125_opinion_story_4244823.utf8', '1050110_opinion_story_4215355.utf8', 

In [63]:
# Step 3: Installing and Importing Required Libraries
!pip install nltk
!pip install indic-nlp-library
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [64]:
import re
import nltk
import string
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize import indic_normalize

# Downloading NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
# Step 4: Define Preprocessing Functions
def preprocess_english(text):

    text = text.lower()

    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens


In [66]:
import re
import sys

# Install dependencies (Run once)
!pip install indic-nlp-library
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

# Add Indic NLP resource path
sys.path.append("/content/indic_nlp_resources")

# Now import correctly
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

# Manually define Hindi stopwords (since NLTK lacks them)
hindi_stopwords = set(["का", "की", "के", "है", "हैं", "को", "से", "में", "पर", "और", "कि", "तो", "इस", "उस",
                       "थे", "था", "अपने", "नहीं", "भी", "यह", "हो", "गया", "कर", "रहे", "किया", "एक"])  # Add more as needed

def preprocess_hindi(text):
    # Normalize Unicode text (important for Hindi)
    normalizer = IndicNormalizerFactory().get_normalizer("hi")
    text = normalizer.normalize(text)

    # Remove punctuation and numbers
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    # Tokenize using indicnlp
    tokens = indic_tokenize.trivial_tokenize(text)

    # Remove stopwords and ensure the token is not empty
    tokens = [word for word in tokens if word not in hindi_stopwords and word.strip() != '']

    return tokens


fatal: destination path 'indic_nlp_resources' already exists and is not an empty directory.


In [80]:
# Step 5: Loading and Processing the Documents
def process_documents(folder_path, language):
    processed_docs = {}
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

            if language == "english":
                tokens = preprocess_english(text)
            else:
                tokens = preprocess_hindi(text)

            processed_docs[filename] = tokens

    return processed_docs


In [68]:
# Processing both English and Hindi documents
english_docs = process_documents(english_path, "english")
hindi_docs = process_documents(hindi_path, "hindi")

print("Sample English Processed:", list(english_docs.items())[0])
print("Sample Hindi Processed:", list(hindi_docs.items())[0])


Sample English Processed: ('1050106_opinion_story_4215839.utf8', ['doc', 'docnoopinionstoryutfdocno', 'text', 'telegraph', 'calcutta', 'opinion', 'thursday', 'january', 'local', 'monitor', 'prudent', 'government', 'india', 'reacted', 'strongly', 'united', 'state', 'america', 'naval', 'presence', 'sri', 'lanka', 'u', 'navy', 'reportedly', 'directed', 'provide', 'humanitarian', 'assistance', 'tsunami', 'victim', 'island', 'state', 'u', 'presence', 'need', 'watched', 'monitored', 'carefully', 'new', 'delhi', 'latest', 'american', 'engagement', 'south', 'asia', 'may', 'also', 'serve', 'way', 'test', 'depth', 'bilateral', 'indous', 'relationship', 'believed', 'many', 'marine', 'assault', 'ship', 'sent', 'washington', 'provide', 'help', 'sri', 'lankan', 'government', 'well', 'known', 'nearly', 'sri', 'lankans', 'killed', 'tsunami', 'disaster', 'thousand', 'others', 'displaced', 'conservative', 'thinking', 'india', 'naturally', 'alarmed', 'apparent', 'u', 'intrusion', 'india', 'backwater', 'i

In [94]:
# Step 6: Create an Inverted Index for posting lists
import pandas as pd
from collections import defaultdict

def build_index(processed_docs):

    index = defaultdict(lambda: {"doc_freq": 0, "posting_list": {}})

    for doc_id, tokens in processed_docs.items():
        term_counts = defaultdict(int)


        for term in tokens:
            term_counts[term] += 1


        for term, freq in term_counts.items():

            if term not in index:
                index[term]["doc_freq"] = 1
            else:
                index[term]["doc_freq"] += 1


            index[term]["posting_list"][doc_id] = freq

    return index

import pprint

def print_full_index(index):
    pprint.pprint(index)

def print_index(index):
    for term, details in index.items():
        print(f"Term: {term}, Doc Frequency: {details['doc_freq']}, Posting List: {details['posting_list']}")

english_docs = process_documents(english_path, "english")
hindi_docs = process_documents(hindi_path, "hindi")

english_index = build_index(english_docs)
hindi_index = build_index(hindi_docs)




In [95]:
print("\nFull English Index Structure:")
print_full_index(english_index)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                      '1050128_opinion_story_4303240.utf8': 3}},
             'twoandahalf': {'doc_freq': 2,
                             'posting_list': {'1050120_opinion_story_4206267.utf8': 1,
                                              '1050123_opinion_story_4285879.utf8': 1}},
             'twoberth': {'doc_freq': 1,
                          'posting_list': {'1050123_opinion_story_4269858.utf8': 1}},
             'twocoalition': {'doc_freq': 1,
                              'posting_list': {'1050116_opinion_story_4260633.utf8': 1}},
             'twofacedness': {'doc_freq': 1,
                              'posting_list': {'1050128_opinion_story_4298217.utf8': 1}},
             'twofifths': {'doc_freq': 1,
                           'posting_list': {'1050125_opinion_story_4292623.utf8': 1}},
             'twohour': {'doc_freq': 1,
                         'posting_list': {'1050102_opinion_story_420

In [92]:
print("\nFull Hindi Index Structure:")
print_full_index(hindi_index)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                     'default_cur_1_date_1_7_2005.utf8': 2,
                                     'default_cur_1_date_1_7_2006.utf8': 3,
                                     'default_cur_1_date_1_7_2007.utf8': 1,
                                     'default_cur_1_date_1_8_2004.utf8': 1,
                                     'default_cur_1_date_1_8_2005.utf8': 1,
                                     'default_cur_1_date_1_8_2006.utf8': 1,
                                     'default_cur_1_date_1_9_2004.utf8': 1,
                                     'default_cur_1_date_1_9_2007.utf8': 2,
                                     'default_cur_1_date_2_10_2004.utf8': 1,
                                     'default_cur_1_date_2_10_2005.utf8': 1,
                                     'default_cur_1_date_2_10_2006.utf8': 1,
                                     'default_cur_1_date_2_10_2007.utf8': 1,
                   

In [70]:
# Create index for both languages
english_index = build_index(english_docs)
hindi_index = build_index(hindi_docs)

# Print sample index
print("English Index Sample:", list(english_index.items())[:5])
print("Hindi Index Sample:", list(hindi_index.items())[:5])


English Index Sample: [('doc', {'doc_freq': 200, 'posting_list': {'1050106_opinion_story_4215839.utf8': 2, '1050123_opinion_story_4288590.utf8': 2, '1050115_opinion_story_4244786.utf8': 2, '1050112_opinion_story_4225105.utf8': 2, '1050118_opinion_story_4173999.utf8': 2, '1050102_opinion_story_4189087.utf8': 2, '1050121_opinion_story_4278457.utf8': 2, '1050120_opinion_story_4273779.utf8': 2, '1050123_opinion_story_4288675.utf8': 2, '1050124_opinion_story_4289352.utf8': 2, '1050112_opinion_story_4240758.utf8': 2, '1050116_opinion_story_4257986.utf8': 2, '1050107_opinion_story_4221153.utf8': 2, '1050118_opinion_story_4253867.utf8': 2, '1050114_opinion_story_4249175.utf8': 2, '1050107_opinion_story_4224619.utf8': 2, '1050116_opinion_index.utf8': 2, '1050122_opinion_story_4274525.utf8': 2, '1050106_opinion_story_4216857.utf8': 2, '1050126_opinion_story_4297794.utf8': 2, '1050120_opinion_story_4258179.utf8': 2, '1050107_opinion_story_4215937.utf8': 2, '1050112_opinion_story_4240756.utf8': 2,

In [72]:
# Step 7: Implementing Boolean Retrieval
def boolean_retrieval(query, index, operation="AND"):
    query_terms = query.lower().split()

    result_sets = []
    for term in query_terms:
        if term in index:
            result_sets.append(set(index[term]["posting_list"].keys()))
        else:
            result_sets.append(set())

    if operation == "AND":
        return set.intersection(*result_sets) if result_sets else set()

    elif operation == "OR":
        return set.union(*result_sets) if result_sets else set()

    elif operation == "NOT":
        all_docs = set()
        for term in index:
            all_docs.update(index[term]["posting_list"].keys())
        return all_docs - result_sets[0] if result_sets else all_docs

    else:
        return set()


In [73]:

print("AND Query (English):", boolean_retrieval("data analysis", english_index, "AND"))
print("OR Query (English):", boolean_retrieval("data mining", english_index, "OR"))
print("NOT Query (English):", boolean_retrieval("information", english_index, "NOT"))


AND Query (English): {'1050102_opinion_story_4201526.utf8', '1050110_opinion_story_4225685.utf8'}
OR Query (English): {'1050102_opinion_story_4201526.utf8', '1050125_opinion_index.utf8', '1050125_opinion_story_4293105.utf8', '1050101_opinion_story_4193684.utf8', '1050114_opinion_story_4249773.utf8', '1050111_opinion_story_4236139.utf8', '1050110_opinion_story_4225685.utf8', '1050111_opinion_story_4229804.utf8', '1050128_opinion_story_4303240.utf8', '1050126_opinion_story_4286238.utf8', '1050125_opinion_story_4293326.utf8', '1050123_opinion_story_4288675.utf8', '1050117_opinion_story_4244797.utf8', '1050110_opinion_story_4123345.utf8'}
NOT Query (English): {'1050124_opinion_story_4289352.utf8', '1050114_opinion_story_4244828.utf8', '1050111_opinion_story_4236139.utf8', '1050121_opinion_story_4173980.utf8', '1050126_opinion_story_4297209.utf8', '1050109_opinion_story_4229811.utf8', '1050118_opinion_index.utf8', '1050125_opinion_story_4244823.utf8', '1050125_opinion_index.utf8', '1050123_

In [93]:

print("AND Query (Hindi):", boolean_retrieval("मन कहर", hindi_index, "AND"))
print("OR Query (Hindi):", boolean_retrieval("ढह गई", hindi_index, "OR"))
print("NOT Query (Hindi):", boolean_retrieval("ओर", hindi_index, "NOT"))



AND Query (Hindi): {'default_cur_1_date_1_1_2006.utf8'}
OR Query (Hindi): {'default_cur_1_date_3_5_2007.utf8', 'default_cur_1_date_3_7_2005.utf8', 'default_cur_1_date_1_7_2005.utf8', 'default_cur_1_date_3_6_2004.utf8', 'default_cur_1_date_2_1_2006.utf8', 'default_cur_1_date_6_2_2005.utf8', 'default_cur_1_date_1_10_2004.utf8', 'default_cur_1_date_4_3_2005.utf8', 'default_cur_1_date_1_4_2006.utf8', 'default_cur_1_date_5_7_2004.utf8', 'default_cur_1_date_1_5_2005.utf8', 'default_cur_1_date_1_3_2006.utf8', 'default_cur_1_date_2_10_2005.utf8', 'default_cur_1_date_5_12_2006.utf8', 'default_cur_1_date_2_9_2006.utf8', 'default_cur_1_date_5_2_2007.utf8', 'default_cur_1_date_1_5_2007.utf8', 'default_cur_1_date_1_6_2006.utf8', 'default_cur_1_date_3_12_2006.utf8', 'default_cur_1_date_2_6_2005.utf8', 'default_cur_1_date_3_12_2005.utf8', 'default_cur_1_date_3_6_2007.utf8', 'default_cur_1_date_2_3_2006.utf8', 'default_cur_1_date_4_8_2007.utf8', 'default_cur_1_date_3_2_2005.utf8', 'default_cur_1_date_

In [None]:
1. This workflow cleans, tokenizes, removes stopwords, stems/lemmatizes, and builds an index.
2. It supports Boolean retrieval for AND, OR, NOT queries.
3. Works for both English and Hindi documents.