# Import Required Libraries
Import the necessary libraries, including os for file handling, nltk for text processing, and pandas for data manipulation.

In [2]:

# Importing required libraries

# os library for file handling
import os

# nltk library for text processing
import nltk

# pandas library for data manipulation
import pandas as pd


# Load and Preprocess Documents
Load all documents from the data folder and preprocess them by removing stop words, stemming, and tokenization.

In [4]:

# Importing additional required libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


# Defining the stemmer
stemmer = PorterStemmer()

# Defining the set of stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stop words and stemming
    processed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    
    return processed_tokens

# Loading and preprocessing documents
documents = {}
for filename in os.listdir('data'):
    filepath = os.path.join('data', filename)
    if os.path.isfile(filepath):  # Check if the path is a file
        with open(filepath, 'r') as file:
            text = file.read()
            documents[filename] = preprocess_text(text)

# Converting the dictionary to a pandas DataFrame
documents_df = pd.DataFrame(list(documents.items()), columns=['Filename', 'Processed Text'])

# Create Inverted Index
Create an inverted index that maps each word to the documents that contain it.

In [5]:
# Importing defaultdict from collections
from collections import defaultdict

# Function to create inverted index
def create_inverted_index(documents_df):
    # Initializing the inverted index as a defaultdict of sets
    inverted_index = defaultdict(set)
    
    # Iterating over each document
    for index, row in documents_df.iterrows():
        # Iterating over each word in the document
        for word in row['Processed Text']:
            # Adding the document to the set of documents containing the word
            inverted_index[word].add(row['Filename'])
    
    return inverted_index

# Creating the inverted index
inverted_index = create_inverted_index(documents_df)

# Define Scoring Algorithm
Define a scoring algorithm, such as TF-IDF or BM25, to rank the documents based on their relevance to the search query.

In [6]:
# Importing additional required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate TF-IDF scores
def calculate_tfidf_scores(documents_df):
    # Initializing the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Calculating TF-IDF scores
    tfidf_scores = vectorizer.fit_transform([' '.join(text) for text in documents_df['Processed Text']])

    return tfidf_scores, vectorizer

# Calculating TF-IDF scores
tfidf_scores, vectorizer = calculate_tfidf_scores(documents_df)

# Function to rank documents based on their relevance to a query
def rank_documents(query, documents_df, tfidf_scores, vectorizer):
    # Preprocessing the query
    processed_query = preprocess_text(query)

    # Calculating the TF-IDF scores for the query
    query_tfidf = vectorizer.transform([' '.join(processed_query)])

    # Calculating the cosine similarity between the query and the documents
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_scores).flatten()

    # Getting the indices of the documents sorted by their cosine similarity to the query
    ranked_indices = cosine_similarities.argsort()[::-1]

    # Getting the filenames of the documents sorted by their cosine similarity to the query
    ranked_documents = documents_df['Filename'].iloc[ranked_indices]

    return ranked_documents

# Example usage:
# ranked_documents = rank_documents('example query', documents_df, tfidf_scores, vectorizer)
# print(ranked_documents)

In [10]:
ranked_documents = rank_documents('burial site', documents_df, tfidf_scores, vectorizer)
print(ranked_documents)

48         mcgill_manfredi.txt
31            daily_catlin.txt
37                   mm_01.txt
33                  cbc_02.txt
12                   mm_05.txt
21              ctv_lofaro.txt
1                   cbc_01.txt
29        gazette_petition.txt
17           the_challenge.txt
27        the_stakeholders.txt
14                globe_01.txt
20         city_madocjones.txt
7             globe_amador.txt
13        global_jelowicki.txt
0        tribune_grewal_01.txt
39          gazette_magder.txt
24        global_carpenter.txt
30          tribune_wexler.txt
8                 the_site.txt
22        aptn_fournier_01.txt
19             the_mission.txt
11            cbc_lapierre.txt
32             ed_cable_01.txt
44                   mm_06.txt
41                     faq.txt
10                  cbc_03.txt
4                mm_sep_12.txt
43                timeline.txt
38          tribune_grewal.txt
40                   mm_03.txt
46        aptn_fournier_03.txt
25             mm_petition.txt
15      

# Implement Search Function
Implement a search function that takes a query as input and returns a list of documents sorted by their scores.

In [7]:

# Function to implement search engine
def search(query):
    # Rank documents based on their relevance to the query
    ranked_documents = rank_documents(query, documents_df, tfidf_scores, vectorizer)
    
    # Return the list of documents sorted by their scores
    return list(ranked_documents)

# Example usage:
# search_results = search('example query')
# print(search_results)


In [11]:
search_results = search("burial site")
print(search_results)

['mcgill_manfredi.txt', 'daily_catlin.txt', 'mm_01.txt', 'cbc_02.txt', 'mm_05.txt', 'ctv_lofaro.txt', 'cbc_01.txt', 'gazette_petition.txt', 'the_challenge.txt', 'the_stakeholders.txt', 'globe_01.txt', 'city_madocjones.txt', 'globe_amador.txt', 'global_jelowicki.txt', 'tribune_grewal_01.txt', 'gazette_magder.txt', 'global_carpenter.txt', 'tribune_wexler.txt', 'the_site.txt', 'aptn_fournier_01.txt', 'the_mission.txt', 'cbc_lapierre.txt', 'ed_cable_01.txt', 'mm_06.txt', 'faq.txt', 'cbc_03.txt', 'mm_sep_12.txt', 'timeline.txt', 'tribune_grewal.txt', 'mm_03.txt', 'aptn_fournier_03.txt', 'mm_petition.txt', 'aptn_fournier_02.txt', 'the_site_01.txt', 'gazette_dunlevy.txt', 'city_rubertucci.txt', 'tribune_cason.txt', 'ed_cable.txt', 'global_omalley.txt', 'guiding_principles.txt', 'mm_02.txt', 'gazette_tomesco.txt', 'ed_bonspiel.txt', 'cultural_survival.txt', 'city_henriques.txt', 'the_vision.txt', 'innovation_ecosystem.txt', 'mm_04.txt', 'ctv_harold.txt']


# Display Search Results
Display the search results in a readable format, including the document name and its score.

In [20]:
import yaml

def display_search_results(search_results):
    for rank, filename in enumerate(search_results, start=1):
        # Load document
        with open(os.path.join("data", filename), 'r') as file:
            lines = file.readlines()
        
        # Parse YAML frontmatter
        frontmatter, content_lines = lines[:7], lines[7:]
        metadata = yaml.safe_load("\n".join(frontmatter))
        
        # Join content lines
        content = "\n".join(content_lines)
        
        # Print information
        print(f"Rank: {rank}")
        print(f"Document: {content}")
        print(f"Author: {metadata['author']}")
        print(f"Publisher: {metadata['publisher']}")
        print(f"URL: {metadata['url']}")
        print("--------------------")

In [23]:
import yaml
def display_search_results(search_results):
    for rank, filename in enumerate(search_results, start=1):
        # Load document
        with open(os.path.join("data", filename), 'r') as file:
            lines = file.readlines()
        
        # Remove YAML frontmatter
        content_lines = lines[7:]
        
        # Join content lines
        content = "\n".join(content_lines)
        
        # Print information
        print(f"Rank: {rank}")
        print(f"Document: {content}")
        print(f"Author: {metadata['author']}")
        print(f"Publisher: {metadata['publisher']}")
        print(f"URL: {metadata['url']}")
        print("--------------------")
        
display_search_results(search_results)


Rank: 1
Document: Following up on the last update I provided about the work being conducted on the former Royal Victoria Hospital site, ground penetrating radar work was performed in priority areas of the site, as established  by the Settlement Agreement, a legally binding agreement we entered into last April with the Kanien’keha:ka Kahnistensera, also known as the Mohawk Mothers, and the Société québécoise des infrastructures (SQI), which is responsible for redeveloping the site.

The company conducting the work stated in a report summarizing its findings that no “likely” grave type features was identified across the site.

However, “[i]n total, nine (9) geophysical signatures were identified across the site that display attributes allowing us to categorize them as “potential” grave type features.” 

These nine signatures are all outside the McGill project zone, which accounts for about 15 per cent of the entire site.

The SQI indicated it will follow the recommendation of the Panel o

NameError: name 'metadata' is not defined