## Data Preprocessing

In [38]:
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [39]:

# Download the 'punkt' and 'stopwords' NLTK packages (presumably the data behind
# the word_tokenize and stopwords imports above — TODO confirm).
# The value of the last download call is what the cell displays.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Directory holding the syllabus description documents; the indexing cells below
# list this directory and read every file whose name ends in ".txt".
# NOTE(review): absolute path — it only resolves where this exact folder layout exists.
directory = '/content/drive/MyDrive/MscIT/3rd Trimester/6th Session/AIM 600 Information Retreival - Siman Giri/Week 4 Assignment/IR Assignment/Syllabus Description Dataset'

In [41]:
# English stopword list, stored as a set because preprocess_text tests every token for membership
stop_words = set(stopwords.words('english'))

In [42]:
# Text normalisation shared by every indexing and query step in this notebook
def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens.

    Steps, in order:
      1. delete every character that is neither a word character nor whitespace
         (punctuation is removed, not replaced by a space, so "pre-processing"
         becomes "preprocessing");
      2. collapse whitespace runs to a single space and trim both ends;
      3. delete stand-alone runs of digits;
      4. lowercase and tokenize with ``word_tokenize``;
      5. drop tokens found in ``stop_words`` and tokens made only of digits.

    Single-letter tokens are kept unless they are stopwords.

    Args:
        text: the raw document or query string.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    cleaned = re.sub(r'[^\w\s]', '', text)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'\b\d+\b', '', cleaned)

    kept = []
    for word in word_tokenize(cleaned.lower()):
        if word in stop_words or word.isdigit():
            continue
        kept.append(word)
    return kept

In [43]:
# Preview the first 50 preprocessed tokens of every ".txt" document.
# sorted() gives a reproducible output order (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    # Explicit encoding so the text read does not depend on the platform default
    # (assumes the documents are UTF-8 — TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # The file is already closed here; only the in-memory text is processed.
    preprocessed_tokens = preprocess_text(text)
    # Print 50 tokens for each document
    preprocessed_words = preprocessed_tokens[:50]
    print(f"Tokens for {filename}: {preprocessed_words}")

Tokens for 1.txt: ['course', 'name', 'semantic', 'web', 'technology', 'course', 'provides', 'comprehensive', 'exploration', 'fundamental', 'principles', 'concepts', 'applications', 'semantic', 'web', 'technology', 'encompassing', 'elevated', 'concepts', 'within', 'field', 'including', 'foundational', 'principles', 'semantic', 'web', 'technology', 'intricacies', 'rdf', 'resource', 'description', 'framework', 'rdf', 'schema', 'rdf', 'plus', 'sparql', 'query', 'techniques', 'linked', 'data', 'nuances', 'owl', 'web', 'ontology', 'language', 'conceptualization', 'application', 'ontologies', 'students']
Tokens for 2.txt: ['course', 'name', 'information', 'retrieval', 'course', 'provides', 'concepts', 'procedures', 'methods', 'required', 'theory', 'implementation', 'evaluation', 'information', 'retrieval', 'ir', 'systems', 'aims', 'develop', 'broad', 'understanding', 'means', 'information', 'access', 'using', 'information', 'retrieval', 'topics', 'covered', 'include', 'basics', 'information',

## Term-Document Incidence Matrix

In [None]:

from collections import defaultdict


In [None]:
# Term-document incidence matrix: term -> {filename -> count}.
# Nested defaultdicts let the indexing loop write term_doc_matrix[token][filename]
# without creating entries first; note that reading a missing key also inserts it.
term_doc_matrix = defaultdict(lambda: defaultdict(int))


In [None]:
# Fill the term-document incidence matrix from every ".txt" file in the directory.
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    # Explicit encoding so the text read does not depend on the platform default
    # (assumes the documents are UTF-8 — TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Preprocess the text
    preprocessed_tokens = preprocess_text(text)
    # set() visits each distinct term once per document.
    for token in set(preprocessed_tokens):
        # Assign instead of incrementing: an incidence entry is binary, and
        # re-running this cell must not push the stored value above 1.
        term_doc_matrix[token][filename] = 1


In [None]:
# Dump every term followed by its per-document incidence mapping
print("Term-Document Incidence Matrix:")
for entry in term_doc_matrix.items():
    # entry is a (term, doc_counts) pair; unpacking prints both, space-separated
    print(*entry)


Term-Document Incidence Matrix:
principles defaultdict(<class 'int'>, {'1.txt': 1, '10.txt': 1, '3.txt': 1})
conceptualization defaultdict(<class 'int'>, {'1.txt': 1})
linked defaultdict(<class 'int'>, {'1.txt': 1})
demonstrations defaultdict(<class 'int'>, {'1.txt': 1})
within defaultdict(<class 'int'>, {'1.txt': 1})
schemas defaultdict(<class 'int'>, {'1.txt': 1})
processes defaultdict(<class 'int'>, {'1.txt': 1, '9.txt': 1, '10.txt': 1, '4.txt': 1})
conversant defaultdict(<class 'int'>, {'1.txt': 1})
name defaultdict(<class 'int'>, {'1.txt': 1, '2.txt': 1, '9.txt': 1, '11.txt': 1, '12.txt': 1, '10.txt': 1, '8.txt': 1, '7.txt': 1, '6.txt': 1, '5.txt': 1, '4.txt': 1, '3.txt': 1})
support defaultdict(<class 'int'>, {'1.txt': 1})
course defaultdict(<class 'int'>, {'1.txt': 1, '2.txt': 1, '9.txt': 1, '11.txt': 1, '12.txt': 1, '10.txt': 1, '8.txt': 1, '7.txt': 1, '6.txt': 1, '5.txt': 1, '4.txt': 1, '3.txt': 1})
exploration defaultdict(<class 'int'>, {'1.txt': 1})
including defaultdict(<cl

In [None]:
# Prints each term with its document mapping — the same listing as the previous
# cell, minus the heading line.
for term, frequencies in term_doc_matrix.items():
  print(term,frequencies)

In [None]:
# Boolean (OR) retrieval over the term-document incidence matrix
def retrieve_documents(query):
    """Return the documents that contain at least one term of the query.

    The query goes through ``preprocess_text`` (the same normalisation used to
    build the matrix). Query terms absent from ``term_doc_matrix`` are skipped;
    the membership test runs before the lookup, so no entry is inserted into
    the defaultdict.

    Args:
        query: free-text query string.

    Returns:
        set: filenames that have an entry for any of the query terms (unordered).
    """
    postings = (
        term_doc_matrix[term]
        for term in preprocess_text(query)
        if term in term_doc_matrix
    )
    # Union of the key sets (filenames) of every matching term
    return set().union(*postings)


In [None]:
# Example query, answered with the boolean retrieve_documents defined above
query = "semantic web"
relevant_docs = retrieve_documents(query)

# Heading first, then one document per line
print("Relevant documents for the query:", *relevant_docs, sep="\n")

Relevant documents for the query:
2.txt
12.txt
1.txt


## Inverted Index Information Retrieval

In [None]:
# inverted_index: term -> list of filenames containing it (the posting list)
# doc_freq:       term -> number of documents containing it
# Both are filled by the indexing loop in the next cell.
inverted_index = defaultdict(list)
doc_freq = defaultdict(int)

In [None]:
# Build the inverted index and the document frequencies from every ".txt" file.
# Both structures are emptied first so that re-running this cell does not
# append each filename to its posting lists again or double the frequencies.
inverted_index.clear()
doc_freq.clear()
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    # Explicit encoding so the text read does not depend on the platform default
    # (assumes the documents are UTF-8 — TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Preprocess the text
    preprocessed_tokens = preprocess_text(text)
    # set() visits each distinct term once per document
    for token in set(preprocessed_tokens):
        inverted_index[token].append(filename)
        doc_freq[token] += 1

In [None]:
# Show each term with its document frequency and posting list, one blank line between terms
for term in inverted_index:
    print(f"Term: {term}")
    print(f"Document Frequency: {doc_freq[term]}")
    print("Posting List:")
    for posting in inverted_index[term]:
        print(f" - {posting}")
    print()

Term: principles
Document Frequency: 3
Posting List:
 - 1.txt
 - 10.txt
 - 3.txt

Term: conceptualization
Document Frequency: 1
Posting List:
 - 1.txt

Term: linked
Document Frequency: 1
Posting List:
 - 1.txt

Term: demonstrations
Document Frequency: 1
Posting List:
 - 1.txt

Term: within
Document Frequency: 1
Posting List:
 - 1.txt

Term: schemas
Document Frequency: 1
Posting List:
 - 1.txt

Term: processes
Document Frequency: 4
Posting List:
 - 1.txt
 - 9.txt
 - 10.txt
 - 4.txt

Term: conversant
Document Frequency: 1
Posting List:
 - 1.txt

Term: name
Document Frequency: 12
Posting List:
 - 1.txt
 - 2.txt
 - 9.txt
 - 11.txt
 - 12.txt
 - 10.txt
 - 8.txt
 - 7.txt
 - 6.txt
 - 5.txt
 - 4.txt
 - 3.txt

Term: support
Document Frequency: 1
Posting List:
 - 1.txt

Term: course
Document Frequency: 12
Posting List:
 - 1.txt
 - 2.txt
 - 9.txt
 - 11.txt
 - 12.txt
 - 10.txt
 - 8.txt
 - 7.txt
 - 6.txt
 - 5.txt
 - 4.txt
 - 3.txt

Term: exploration
Document Frequency: 1
Posting List:
 - 1.txt

Term

## Vector Space Model

## BM25 Model Implementation

### BM25Okapi Understanding

In [44]:
!pip install rank_bm25



In [45]:
from rank_bm25 import BM25Okapi

In [46]:
# Three-sentence toy corpus used only to try out the BM25Okapi API
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]

In [47]:
# Whitespace tokenization of the toy corpus (split on single spaces).
# Note: tokenized_corpus is rebuilt from scratch two cells below.
tokenized_corpus = [doc.split(" ") for doc in corpus]

In [48]:
# Same tokenization with split() (any whitespace run); the displayed result shows
# that punctuation stays attached to tokens ('man!', 'today?').
tokenized_doc = [doc.split() for doc in corpus]
tokenized_doc

[['Hello', 'there', 'good', 'man!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['How', 'is', 'the', 'weather', 'today?']]

In [49]:
# Explicit-loop version of the tokenization: echo each sentence, then store its tokens.
# Rebinds tokenized_doc to the token list of the current sentence on every iteration.
tokenized_corpus = []
for doc in corpus:
  print(doc)
  tokenized_doc = doc.split()
  tokenized_corpus.append(tokenized_doc)

Hello there good man!
It is quite windy in London
How is the weather today?


In [50]:
# Display the token lists built by the loop above
tokenized_corpus

[['Hello', 'there', 'good', 'man!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['How', 'is', 'the', 'weather', 'today?']]

In [51]:
# Fit a BM25Okapi model on the tokenized toy corpus
bm25 = BM25Okapi(tokenized_corpus)

In [52]:
# Tokenize the query by whitespace, like the corpus, and score every document against it
query = "windy London"
tokenized_query = query.split(" ")

# One score per corpus document, in corpus order; displayed as the cell output
doc_scores = bm25.get_scores(tokenized_query)
doc_scores

array([0.        , 0.93729472, 0.        ])

In [53]:
# Ask for the single best match (n=1); the result is taken from the original `corpus` sentences
bm25.get_top_n(tokenized_query, corpus, n=1)

['It is quite windy in London']

In [54]:
## Implementation starts from here

### Model Implementation

In [55]:
!pip install rank_bm25



In [56]:
from rank_bm25 import BM25Okapi

In [57]:
# Token lists of the syllabus documents, one entry per ".txt" file, appended by the
# loop in the next cell; the position in this list is the document index used by BM25 below.
preprocessed_docs = []

In [58]:
# Preprocess every ".txt" file into the BM25 corpus.
# The list is emptied first so that re-running this cell does not append the whole
# corpus a second time (which would shift document indices past the file list).
preprocessed_docs.clear()
# Listing order is deliberately left as os.listdir returns it: document indices
# are later mapped back to filenames through a fresh os.listdir(directory) call.
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    # Explicit encoding so the text read does not depend on the platform default
    # (assumes the documents are UTF-8 — TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    preprocessed_docs.append(preprocess_text(text))

In [59]:
# Fit BM25 on the preprocessed syllabus documents.
# This rebinds `bm25`, replacing the toy-corpus model created in the demo section above.
bm25 = BM25Okapi(preprocessed_docs)

In [60]:
# BM25 retrieval. This definition replaces the boolean retrieve_documents
# defined in the term-document incidence section of this notebook.
def retrieve_documents(query, top_n=3):
    """Rank the indexed documents against a query with BM25.

    The query is normalised with ``preprocess_text`` (the same function used on
    the documents) and scored by the module-level ``bm25`` model.

    Args:
        query: free-text query string.
        top_n: how many of the best-scoring documents to return. Defaults to 3,
            the cut-off that used to be hard-coded; ``None`` returns every document.

    Returns:
        list[tuple[int, float]]: ``(document_index, score)`` pairs sorted by
        descending score. ``document_index`` is the position of the document in
        the corpus the model was fitted on. Ties keep their corpus order
        (``sorted`` is stable).
    """
    preprocessed_query = preprocess_text(query)
    scores = bm25.get_scores(preprocessed_query)
    ranked_results = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return ranked_results[:top_n]

In [61]:
# Ask the user to input a query
query = input("Enter your query: ")

# Retrieve documents using BM25
ranked_documents = retrieve_documents(query)

# BM25 document indices count only the ".txt" files, in os.listdir order, because
# that is how the corpus was built. Apply the same filter here, once, so that an
# index maps to the right file even when the directory holds other entries.
indexed_filenames = [name for name in os.listdir(directory) if name.endswith(".txt")]

# Print the ranked documents with filenames and the first line of content
print("Ranked Documents:")
for idx, score in ranked_documents:
    filename = indexed_filenames[idx]  # Filename of the document at this corpus index
    # Explicit encoding so the text read does not depend on the platform default
    # (assumes the documents are UTF-8 — TODO confirm).
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        first_line = file.readline()  # Only the first line is read
    print(f"Document: {filename}, Score = {score}")
    print("Document Title:")
    print(first_line)
    print()


Enter your query: Information retrieval
Ranked Documents:
Document: 2.txt, Score = 5.0262308127905095
Document Title:
Course Name: Information Retrieval


Document: 9.txt, Score = 0.6585459046837796
Document Title:
Course Name: Managing Information Systems & Technology


Document: 10.txt, Score = 0.6239200766356894
Document Title:
Course Name: Information Technology Project Management 




### Relevance feedback and query expansion for BM25 Okapi

## Evaluate the performance of the Okapi BM25 Model

In [None]:
import pandas as pd
import numpy as np

In [None]:
def precision_at_k(relevant_docs, retrieved_docs, k):
    """Fraction of the first ``k`` ranks occupied by relevant documents.

    Args:
        relevant_docs: set of relevant document ids.
        retrieved_docs: ranked list of retrieved document ids.
        k: cut-off rank. The denominator is always ``k``, even when fewer than
            ``k`` documents were retrieved.

    Returns:
        Number of distinct relevant documents among ``retrieved_docs[:k]``
        divided by ``k``; 0 when ``k`` is not positive.
    """
    top_k = retrieved_docs[:k]
    hits = relevant_docs.intersection(top_k)
    if k > 0:
        return len(hits) / k
    return 0

def recall_at_k(relevant_docs, retrieved_docs, k):
    """Share of the relevant documents that appear in the first ``k`` ranks.

    Args:
        relevant_docs: set of relevant document ids.
        retrieved_docs: ranked list of retrieved document ids.
        k: cut-off rank.

    Returns:
        Number of distinct relevant documents among ``retrieved_docs[:k]``
        divided by ``len(relevant_docs)``; 0 when there are no relevant documents.
    """
    found = relevant_docs.intersection(retrieved_docs[:k])
    total_relevant = len(relevant_docs)
    if total_relevant > 0:
        return len(found) / total_relevant
    return 0

def average_precision(relevant_docs, retrieved_docs):
    """Average precision of one ranked list.

    Precision is taken at every rank where a relevant document first appears;
    the sum is divided by ``min(len(relevant_docs), len(retrieved_docs))``.

    A document that occurs more than once in ``retrieved_docs`` is credited only
    at its first occurrence, which keeps the result within [0, 1]. For rankings
    without repeats the value is the same as summing ``precision_at_k`` at each
    relevant rank, computed here with a running hit count in a single pass.

    Args:
        relevant_docs: set of relevant document ids.
        retrieved_docs: ranked list of retrieved document ids.

    Returns:
        The average precision, or 0 when either argument is empty.
    """
    num_relevant_docs = len(relevant_docs)
    num_retrieved_docs = len(retrieved_docs)
    if num_relevant_docs == 0 or num_retrieved_docs == 0:
        return 0  # Avoid division by zero

    ap = 0
    hits = 0
    credited = set()  # relevant documents already counted
    for rank, doc in enumerate(retrieved_docs, start=1):
        if doc in relevant_docs and doc not in credited:
            credited.add(doc)
            hits += 1
            ap += hits / rank  # precision at this rank
    return ap / min(num_relevant_docs, num_retrieved_docs)


def mean_average_precision(relevance_data):
    """Mean of the per-query average precision over a relevance table.

    Args:
        relevance_data: DataFrame with ``query``, ``document`` and ``relevance``
            columns. For each query, the ``document`` column in row order is
            used as the ranked list, and the rows with ``relevance == 1``
            define the relevant set.

    Returns:
        ``np.mean`` of the per-query ``average_precision`` values.
    """
    per_query_ap = [
        average_precision(
            set(rows.loc[rows['relevance'] == 1, 'document']),
            list(rows['document']),
        )
        for _, rows in relevance_data.groupby('query')
    ]
    return np.mean(per_query_ap)

def dcg(relevance, k):
    """Discounted cumulative gain over the first ``k`` positions.

    Position ``i`` (0-based) contributes ``(2 ** relevance[i] - 1) / log2(i + 2)``.

    Args:
        relevance: sequence of relevance grades in ranked order.
        k: cut-off; at most ``len(relevance)`` positions are used.

    Returns:
        The summed gain; 0 when no position is scored.
    """
    depth = min(k, len(relevance))
    return sum(
        (2 ** relevance[pos] - 1) / np.log2(pos + 2)
        for pos in range(depth)
    )

def ndcg_at_k(relevance, k):
    """DCG at ``k`` normalised by the DCG of the ideal ordering.

    The ideal ordering is ``relevance`` sorted in descending order.

    Args:
        relevance: sequence of relevance grades in ranked order.
        k: cut-off rank passed to ``dcg``.

    Returns:
        ``actual / ideal``, or 0 when the ideal DCG is not positive.
    """
    best_possible = dcg(sorted(relevance, reverse=True), k)
    achieved = dcg(relevance, k)
    if best_possible > 0:
        return achieved / best_possible
    return 0


In [None]:
# Load ground truth data (query / document / relevance judgments) from CSV.
# NOTE(review): absolute path — it only resolves where this exact folder layout exists.
# This also rebinds `file_path`, the name the indexing loops use for document paths.
file_path = "/content/drive/MyDrive/MscIT/3rd Trimester/6th Session/AIM 600 Information Retreival - Siman Giri/Week 4 Assignment/IR Assignment/Relevance Data/Relevance Document - Relevance document.csv"
ground_truth_df = pd.read_csv(file_path)

In [None]:
# Remove the 'Not needed' column. Reassigning instead of inplace=True, together
# with errors='ignore', keeps this cell re-runnable: a second run finds the
# column already gone and leaves the frame unchanged instead of raising KeyError.
ground_truth_df = ground_truth_df.drop(columns=['Not needed'], errors='ignore')

In [None]:
# Display the ground-truth table after the column drop
ground_truth_df

Unnamed: 0,query,document,relevance
0,Semantic Web Technolog,1.txt,1
1,Information retrieval System,1.txt,0
2,"RDF (Resource Description Framework), RDF Sche...",1.txt,1
3,ontologies using Web Ontology Language (OWL),1.txt,1
4,Convolution neural network,1.txt,0
5,Information Retrieval,2.txt,1
6,document pre-processing and identification of ...,2.txt,1
7,Convolution neural network,2.txt,0
8,derivative and integration,2.txt,0
9,Artificial Neural Network and Deep Learning,3.txt,1


In [None]:
# Compute evaluation metrics
# precision_5 / recall_5 / ndcg_5 collect one value per query and are filled by
# the evaluation loop in the next cell; MAP is computed here, over the whole table.
precision_5 = []
recall_5 = []
map_score = mean_average_precision(ground_truth_df)
ndcg_5 = []

In [None]:
# Score every query at cut-off 5, then average each metric over the queries.
# NOTE(review): the "retrieved" list is the document column of the ground-truth
# rows for the query, in file order — it is not a ranking produced by the BM25
# model. Confirm that this is the intended evaluation.
for query_id, group in ground_truth_df.groupby('query'):
    is_relevant = group['relevance'] == 1
    relevant_docs = set(group.loc[is_relevant, 'document'])
    retrieved_docs = list(group['document'])

    precision_5.append(precision_at_k(relevant_docs, retrieved_docs, 5))
    recall_5.append(recall_at_k(relevant_docs, retrieved_docs, 5))

    # Binary gains for nDCG: 1 for a judgment equal to 1, otherwise 0
    relevance_scores = list(group['relevance'])
    relevance_scores_individual = [1 if score == 1 else 0 for score in relevance_scores]
    ndcg_5.append(ndcg_at_k(relevance_scores_individual, 5))

precision_5_mean, recall_5_mean, ndcg_5_mean = (
    np.mean(values) for values in (precision_5, recall_5, ndcg_5)
)

In [None]:
# Report the four aggregate metrics, one "label value" line each
for label, value in (
    ("Mean Average Precision (MAP):", map_score),
    ("Mean Precision@5:", precision_5_mean),
    ("Mean Recall@5:", recall_5_mean),
    ("Mean nDCG@5:", ndcg_5_mean),
):
    print(label, value)

Mean Average Precision (MAP): 0.5882352941176471
Mean Precision@5: 0.11764705882352941
Mean Recall@5: 0.5882352941176471
Mean nDCG@5: 0.5882352941176471
