# Information Retrieval Assignment

## 1. Importing Libraries:

In [147]:
from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import tkinter as tk
from tkinter import messagebox
import tqdm
import warnings
from tkinter import scrolledtext
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# The corpus is loaded in section 3 ("Importing Corpus"), before its first
# use, so it is not fetched a second time in this import cell.

## 2. Downloading Necessary NLTK Data Files:

In [149]:
# Download the NLTK data packages this notebook relies on. Each call reports
# "already up-to-date" when the package is already present locally.
nltk.download('punkt')    # Punkt tokenizer models (for nltk.tokenize)
nltk.download('wordnet')  # WordNet data required by WordNetLemmatizer
nltk.download('omw-1.4')  # Open Multilingual Wordnet data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\96279\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\96279\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\96279\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 3. Importing Corpus:

In [151]:
# Load the 20 Newsgroups dataset with scikit-learn's default arguments.
# The raw document texts are read later from `corpus.data`.
corpus = fetch_20newsgroups()

# **`Phase 1`**

## 4. Removing Punctuation From Documents:

In [154]:
def remove_punctuations(text):
    """Return *text* lower-cased with punctuation neutralised.

    Every separator character (brackets, quotes, operators, newlines,
    tabs, ...) is replaced by a single space. A period is dropped when it
    is the last character or is followed by whitespace; any other period
    (e.g. inside ``3.14`` or ``a.b``) is kept as is.
    """
    separators = '<>#!$%^&*()+-/*\n\t:,][}{"\''
    whitespace = '\n\t '
    last_position = len(text) - 1

    pieces = []
    for position, char in enumerate(text):
        if char in separators:
            pieces.append(' ')
            continue

        # A sentence-ending period is removed outright (no space inserted).
        if char == '.' and (position == last_position
                            or text[position + 1] in whitespace):
            continue

        pieces.append(char)

    return ''.join(pieces).lower()

## 5. Stemming Words Function:

In [156]:
def stemming(corpus):
    """Return a Porter-stemmed version of every document in *corpus*.

    Each document is cleaned with ``remove_punctuations``, split on
    whitespace, stemmed token by token, and re-joined with single spaces.
    The result is a list with one string per input document.
    """
    porter = PorterStemmer()

    return [
        ' '.join(
            porter.stem(token)
            for token in remove_punctuations(document).split()
        )
        for document in corpus
    ]

## 6. Lemmatizing Words Function:

In [158]:
def lemmatizing(corpus):
    """Return a WordNet-lemmatized version of every document in *corpus*.

    Each document is cleaned with ``remove_punctuations``, split on
    whitespace, lemmatized token by token, and re-joined with single
    spaces. The result is a list with one string per input document.
    """
    wordnet = WordNetLemmatizer()

    def lemmatize_document(document):
        # Normalise one document: clean, tokenize, lemmatize, re-join.
        tokens = remove_punctuations(document).split()
        return ' '.join(map(wordnet.lemmatize, tokens))

    return [lemmatize_document(document) for document in corpus]

## 7. Constructing Inverted Index:

In [160]:
def indexing(corpus):
    """Build an inverted index for *corpus*.

    *corpus* is an iterable of document strings. The returned dict maps
    each whitespace-separated term to the set of positions (0-based) of
    the documents containing it, with keys inserted in sorted order.
    """
    postings = {}

    for doc_id, document in enumerate(corpus):
        for term in document.split():
            # Create the posting set on first sight of the term.
            postings.setdefault(term, set()).add(doc_id)

    return {term: postings[term] for term in sorted(postings)}

## 8. Applying Previous Steps:

In [162]:
# Normalise the corpus and build the inverted index used by the search phase.
# Alternative pipeline (stemming instead of lemmatization), kept for reference:
#stemmed_corpus = stemming(corpus.data)
#index = indexing(stemmed_corpus)
## -------------------------------------
# Active pipeline: lemmatize every document, then index the result.
lemmatizing_corpus = lemmatizing(corpus.data)
index = indexing(lemmatizing_corpus)

# **`Phase 2`**

## 1. Pre-processing Query:

In [165]:
def preprocess_query(query):
    """Normalise a raw query string into the list of terms to look up.

    The query is pushed through the same ``lemmatizing`` pipeline that is
    applied to the indexed documents, so query terms are produced exactly
    the way index terms are.

    Args:
        query: Raw query text, e.g. as typed into the GUI entry field.

    Returns:
        A list of normalised query terms; empty when the query contains
        no terms.
    """
    # Work on the argument itself rather than on global GUI state, so the
    # function gives the right answer for any caller.
    #
    # Treat the query as a one-document corpus: punctuation is stripped a
    # single time over the whole text (as for indexed documents) instead of
    # once on the query and again on each word, which could yield terms
    # that never occur in the index (e.g. "a.." -> "a" vs. indexed "a.").
    # If the index is built with `stemming`, use `stemming` here as well.
    return lemmatizing([query])[0].split()

## 2. Searching Inverted Index:

In [167]:
def search_index(query):
    """Collect the posting set of every term in *query*.

    Looks each term up in the module-level ``index``. Returns a list with
    one posting set per query term, in query order. As soon as a term is
    missing from the index an empty list is returned instead.
    """
    postings = []

    for term in query:
        if term not in index:
            # One unknown term means no document can match all terms.
            return []
        postings.append(index[term])

    return postings

## 3. Merging Algorithm:

In [169]:
def merge(files):
    """Intersect the posting collections of all query terms.

    Args:
        files: A list of collections of document IDs, one per query term
            (as returned by ``search_index``).

    Returns:
        ``None`` when *files* is empty; otherwise a sorted list of the
        document IDs that occur in every collection.
    """
    if not files:
        return None

    # Only the shortest posting collection can bound the result, so it is
    # the only one that needs to be sorted and scanned.
    candidates = sorted(min(files, key=len))

    # Membership tests against sets are O(1); testing against lists made
    # the intersection quadratic for frequent terms.
    lookups = [set(postings) for postings in files]

    return [
        doc_id for doc_id in candidates
        if all(doc_id in lookup for lookup in lookups)
    ]

## 4. Graphical User Interface (GUI):

In [171]:
def GUI_result(docs):
    """Format the search result for display in the GUI.

    Args:
        docs: List of matching document IDs, or ``None`` / an empty list
            when nothing matched.

    Returns:
        The text to show: a "no results" message, or the comma-separated
        document IDs followed by a closing line.
    """
    if not docs:
        return "No Relevant Documents.\n"

    # Join the IDs so that no dangling ", " is left after the last one.
    doc_ids = ', '.join(f'{doc_id}' for doc_id in docs)

    return f'Document ID: {doc_ids}\nThank You For Using Our System.'

In [172]:
class BasicIRSystemGUI:
    """Tkinter window for querying the inverted index.

    The window stacks, top to bottom: a prompt label, a query entry, a
    "Search" button and a scrollable text area that shows the result.
    """

    def __init__(self, root):
        """Create the widgets inside *root* and lay them out vertically."""
        self.root = root
        root.title("Basic Information Retrieval System")

        # Widgets are created in display order ...
        self.label = tk.Label(root, text="Enter your query:")
        self.query_entry = tk.Entry(root, width=50)
        self.search_button = tk.Button(root, text="Search", command=self.search)
        self.result_text = scrolledtext.ScrolledText(
            root, wrap=tk.WORD, width=70, height=40
        )

        # ... and packed in that same order with identical padding.
        for widget in (self.label, self.query_entry,
                       self.search_button, self.result_text):
            widget.pack(pady=10)

    def search(self):
        """Run the entered query through the pipeline and show the result."""
        self.user_query = self.query_entry.get()

        # Clear previous results before computing the new ones.
        self.result_text.delete(1.0, tk.END)

        terms = preprocess_query(self.user_query)
        postings = search_index(terms)
        matching_docs = merge(postings)
        self.result_text.insert(tk.INSERT, GUI_result(matching_docs))

# Create the main window, attach the GUI to it, and start the Tk event loop.
root = tk.Tk()
app = BasicIRSystemGUI(root)
# The return value is bound to `_`, presumably to keep the notebook from
# echoing it as cell output.
_ = root.mainloop()