In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Part 1: Creating a Database of Documents

In [2]:
#Use Beautifulsoup and requests to scrape the documents from different websites
#I decided to scrape information from wikipedia about different school subjects
urls = ["https://en.wikipedia.org/wiki/Mathematics",
        "https://en.wikipedia.org/wiki/Chemistry",
        "https://en.wikipedia.org/wiki/Physics",
        "https://en.wikipedia.org/wiki/Biology",
        "https://en.wikipedia.org/wiki/Geography",
        "https://en.wikipedia.org/wiki/Geology",
        "https://en.wikipedia.org/wiki/Astronomy",
        "https://en.wikipedia.org/wiki/Archaeology",
        "https://en.wikipedia.org/wiki/Ecology",
        "https://en.wikipedia.org/wiki/Climatology",
        "https://en.wikipedia.org/wiki/Physical_education",
        "https://en.wikipedia.org/wiki/Art",
        "https://en.wikipedia.org/wiki/Drama",
        "https://en.wikipedia.org/wiki/Music",
        "https://en.wikipedia.org/wiki/Language",
        "https://en.wikipedia.org/wiki/Social_studies",
        "https://en.wikipedia.org/wiki/Foreign_language"]

In [3]:
def fetch(url):
    response = requests.get(url)
    return response.text

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = text.translate(str.maketrans("", "", string.punctuation + string.digits))

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

def scrape_website(url):
    html = fetch(url)
    soup = BeautifulSoup(html, "html.parser")
    # Parse and extract data from the website using BeautifulSoup
    title = soup.title.string.strip()
    content = soup.find("div", {"id": "mw-content-text"}).get_text()

    # Preprocess the content
    content = preprocess_text(content)

    # Save the scraped data to a file
    subject_name = url.split("/")[-1]
    file_name = f"{subject_name}.txt"

    with open(file_name, "w", encoding="utf-8") as file:
        file.write(f"URL: {url}\nTitle: {title}\n\n{content}")

def main():
    for url in urls:
        scrape_website(url)

if __name__ == "__main__":
    main()

# Part 2: Creating a Search Engine

### Using Inverted Indexing

In [4]:
# Directory where the documents are saved
documents_dir = "C:\\Users\\Lenovo\\Desktop\\MSBA\\3 Summer 2023\\Text Analytics and Natural Language Processing MSBA316\\Assignments\\Assignment 3 - IR and Search Engines\\Documents"

In [5]:
def create_inverted_index(documents_dir):
    inverted_index = {}
    document_ids = {}

    for root, _, files in os.walk(documents_dir):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            document_id = file_path.split("/")[-1].split(".")[0]
            document_ids[document_id] = file_path

            preprocessed_content = preprocess_text(content)
            terms = word_tokenize(preprocessed_content)

            # Build the inverted index
            term_frequency = {}
            for term in terms:
                if term in term_frequency:
                    term_frequency[term] += 1
                else:
                    term_frequency[term] = 1

            for term, freq in term_frequency.items():
                if term in inverted_index:
                    inverted_index[term].append((document_id, freq))
                else:
                    inverted_index[term] = [(document_id, freq)]

    return inverted_index, document_ids

In [6]:
inverted_index, document_ids = create_inverted_index(documents_dir)

In [7]:
def inverted_index_to_dataframe(inverted_index):
    rows = []
    for term, posting_list in inverted_index.items():
        for doc_id, freq in posting_list:
            rows.append([term, doc_id, freq])
    
    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows, columns=["Term", "Document ID", "Term Frequency"])
    return df

# Convert the inverted index to a DataFrame
inverted_index_df = inverted_index_to_dataframe(inverted_index)

inverted_index_df

Unnamed: 0,Term,Document ID,Term Frequency
0,url,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
1,url,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
2,url,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,2
3,url,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,2
4,url,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
...,...,...,...
39300,holston,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
39301,wwwteachingcertificationcom,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
39302,gabby,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1
39303,civicedorg,C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Tex...,1


In [8]:
def intersect_posting(ii1: list, ii2: list):

    a1 = [x[0] for x in ii1]
    a2 = [x[0] for x in ii2]
    answers = []
    i = 0
    j = 0
    while i < len(a1) and j < len(a2):
        if a1[i] == a2[j]:
            answers.append((a1[i], sum([ii1[i][1], ii2[j][1]])))
            i += 1
            j += 1
            i += 1
        else:
            j += 1
    return answers

def conjunctive_search(query_terms):
    if not query_terms:
        return []

    # Sort the query terms by document frequency (descending order)
    query_terms.sort(key=lambda term: len(inverted_index.get(term, [])), reverse=True)

    # Initialize the result with the posting list of the first query term
    results = inverted_index.get(query_terms[0], [])

    # Intersect the posting lists of all query terms
    for term in query_terms[1:]:
        posting_list = inverted_index.get(term, [])
        results = intersect_posting(results, posting_list)

    # Sort the results by term frequency (descending order)
    results.sort(key=lambda entry: entry[1], reverse=True)

    # Map document IDs to their paths for display purposes
    sorted_results = [(document_ids[doc_id], freq) for doc_id, freq in results]
    return sorted_results

In [9]:
# Example query terms
query_terms = ["fitness"]
search_results = conjunctive_search(query_terms)

for result in search_results:
    print("Document Path:", result[0])
    print("Term Frequency:", result[1])
    print("=" * 120)

Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Physical_education.txt
Term Frequency: 9
Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Ecology.txt
Term Frequency: 6
Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Mathematics.txt
Term Frequency: 2
Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Art.txt
Term Frequency: 1


In [10]:
# Example query terms
query_terms = ["art", "music"]
search_results = conjunctive_search(query_terms)

for result in search_results:
    print("Document Path:", result[0])
    print("Term Frequency:", result[1])
    print("=" * 120)

Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Archaeology.txt
Term Frequency: 21


In [20]:
# Example query terms
query_terms = ["math", "theorem", "finite", "infinite", "einstein"]
search_results = conjunctive_search(query_terms)

for result in search_results:
    print("Document Path:", result[0])
    print("Term Frequency:", result[1])
    print("=" * 120)

Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Mathematics.txt
Term Frequency: 60


In [21]:
# Example query terms
query_terms = ["telescope", "solar", "galaxy", "hole"]
search_results = conjunctive_search(query_terms)

for result in search_results:
    print("Document Path:", result[0])
    print("Term Frequency:", result[1])
    print("=" * 120)

Document Path: C:\Users\Lenovo\Desktop\MSBA\3 Summer 2023\Text Analytics and Natural Language Processing MSBA316\Assignments\Assignment 3 - IR and Search Engines\Documents\Astronomy.txt
Term Frequency: 135


### Using TF-IDF and Cosine Similarity

In [22]:
# Create a tfidf matrix
def build_tfidf_matrix(docs):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs)
    return tfidf_matrix, vectorizer

# Calculate cosine similarity between the query and each document using tfidf weights
def search(query, tfidf_matrix, vectorizer):
    preprocessed_query = preprocess_text(query)
    query_vector = vectorizer.transform([preprocessed_query])

    # Calculate cosine similarity between the query and all documents
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

    # Get the indices of documents in descending order of similarity
    ranked_indices = similarity_scores.argsort()[0][::-1]

    return ranked_indices

In [24]:
def main():
    # Directory containing the preprocessed documents
    documents_dir = "C:\\Users\\Lenovo\\Desktop\\MSBA\\3 Summer 2023\\Text Analytics and Natural Language Processing MSBA316\\Assignments\\Assignment 3 - IR and Search Engines\\Documents"

    # Get all files in the directory
    text_files = os.listdir(documents_dir)
    
    # Load the preprocessed documents into a list
    documents = []
    for file in text_files:
        file_path = os.path.join(documents_dir, file)
        with open(file_path, "r", encoding="utf-8") as file:
            documents.append(file.read())

    # Build the TF-IDF matrix and vectorizer
    tfidf_matrix, vectorizer = build_tfidf_matrix(documents)
    
    while True:
        search_query = input("Enter your search query (type 'exit' to quit): ")
        if search_query.lower() == "exit":
            break

        ranked_indices = search(search_query, tfidf_matrix, vectorizer)

        print("\nSearch Results:")
        for idx in ranked_indices:
            file_name = text_files[idx]
            file_path = os.path.join(documents_dir, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                content_lines = file.readlines()
                title = content_lines[1].replace('Title:', '').strip()
                url = content_lines[0].replace('URL:', '').strip()
                print(f"Title: {title}")
                print(f"URL: {url}")

                # Get the cosine similarity between the search query vector and the document vector
                query_vector = vectorizer.transform([search_query])
                doc_vector = tfidf_matrix[idx]
                similarity = cosine_similarity(query_vector, doc_vector)[0][0]

                print(f"Cosine Similarity: {similarity:.4f}")
                print("=" * 80)
        print("\n\n")

if __name__ == "__main__":
    main()

Enter your search query (type 'exit' to quit): newton, apple, gravity

Search Results:
Title: Physics - Wikipedia
URL: https://en.wikipedia.org/wiki/Physics
Cosine Similarity: 0.0274
Title: Mathematics - Wikipedia
URL: https://en.wikipedia.org/wiki/Mathematics
Cosine Similarity: 0.0105
Title: Geology - Wikipedia
URL: https://en.wikipedia.org/wiki/Geology
Cosine Similarity: 0.0073
Title: Archaeology - Wikipedia
URL: https://en.wikipedia.org/wiki/Archaeology
Cosine Similarity: 0.0056
Title: Astronomy - Wikipedia
URL: https://en.wikipedia.org/wiki/Astronomy
Cosine Similarity: 0.0052
Title: Ecology - Wikipedia
URL: https://en.wikipedia.org/wiki/Ecology
Cosine Similarity: 0.0035
Title: Art - Wikipedia
URL: https://en.wikipedia.org/wiki/Art
Cosine Similarity: 0.0022
Title: Language - Wikipedia
URL: https://en.wikipedia.org/wiki/Language
Cosine Similarity: 0.0015
Title: Chemistry - Wikipedia
URL: https://en.wikipedia.org/wiki/Chemistry
Cosine Similarity: 0.0015
Title: Climatology - Wikipedia


Enter your search query (type 'exit' to quit): second language, english, arabic

Search Results:
Title: Language - Wikipedia
URL: https://en.wikipedia.org/wiki/Language
Cosine Similarity: 0.4299
Title: Foreign language - Wikipedia
URL: https://en.wikipedia.org/wiki/Foreign_language
Cosine Similarity: 0.4275
Title: Drama - Wikipedia
URL: https://en.wikipedia.org/wiki/Drama
Cosine Similarity: 0.0359
Title: Chemistry - Wikipedia
URL: https://en.wikipedia.org/wiki/Chemistry
Cosine Similarity: 0.0187
Title: Mathematics - Wikipedia
URL: https://en.wikipedia.org/wiki/Mathematics
Cosine Similarity: 0.0186
Title: Astronomy - Wikipedia
URL: https://en.wikipedia.org/wiki/Astronomy
Cosine Similarity: 0.0143
Title: Archaeology - Wikipedia
URL: https://en.wikipedia.org/wiki/Archaeology
Cosine Similarity: 0.0142
Title: Art - Wikipedia
URL: https://en.wikipedia.org/wiki/Art
Cosine Similarity: 0.0127
Title: Geography - Wikipedia
URL: https://en.wikipedia.org/wiki/Geography
Cosine Similarity: 0.0122
Tit

Enter your search query (type 'exit' to quit): notes, song, instrument

Search Results:
Title: Music - Wikipedia
URL: https://en.wikipedia.org/wiki/Music
Cosine Similarity: 0.1336
Title: Archaeology - Wikipedia
URL: https://en.wikipedia.org/wiki/Archaeology
Cosine Similarity: 0.0157
Title: Foreign language - Wikipedia
URL: https://en.wikipedia.org/wiki/Foreign_language
Cosine Similarity: 0.0058
Title: Drama - Wikipedia
URL: https://en.wikipedia.org/wiki/Drama
Cosine Similarity: 0.0123
Title: Art - Wikipedia
URL: https://en.wikipedia.org/wiki/Art
Cosine Similarity: 0.0061
Title: Language - Wikipedia
URL: https://en.wikipedia.org/wiki/Language
Cosine Similarity: 0.0040
Title: Mathematics - Wikipedia
URL: https://en.wikipedia.org/wiki/Mathematics
Cosine Similarity: 0.0000
Title: Astronomy - Wikipedia
URL: https://en.wikipedia.org/wiki/Astronomy
Cosine Similarity: 0.0036
Title: Chemistry - Wikipedia
URL: https://en.wikipedia.org/wiki/Chemistry
Cosine Similarity: 0.0000
Title: Geography - W

Enter your search query (type 'exit' to quit): telescope, space, solar system, stars

Search Results:
Title: Astronomy - Wikipedia
URL: https://en.wikipedia.org/wiki/Astronomy
Cosine Similarity: 0.1678
Title: Physics - Wikipedia
URL: https://en.wikipedia.org/wiki/Physics
Cosine Similarity: 0.0342
Title: Geography - Wikipedia
URL: https://en.wikipedia.org/wiki/Geography
Cosine Similarity: 0.0313
Title: Geology - Wikipedia
URL: https://en.wikipedia.org/wiki/Geology
Cosine Similarity: 0.0260
Title: Ecology - Wikipedia
URL: https://en.wikipedia.org/wiki/Ecology
Cosine Similarity: 0.0211
Title: Climatology - Wikipedia
URL: https://en.wikipedia.org/wiki/Climatology
Cosine Similarity: 0.0179
Title: Language - Wikipedia
URL: https://en.wikipedia.org/wiki/Language
Cosine Similarity: 0.0168
Title: Chemistry - Wikipedia
URL: https://en.wikipedia.org/wiki/Chemistry
Cosine Similarity: 0.0099
Title: Biology - Wikipedia
URL: https://en.wikipedia.org/wiki/Biology
Cosine Similarity: 0.0098
Title: Socia