In [None]:
!pip install gradio

In [None]:
!pip install whoosh

In [None]:
!pip install spellchecker

In [None]:
!pip install flashtext

In [None]:
!pip install transformers

In [None]:
!pip install nltk

In [None]:
!pip install pyspellchecker

In [None]:
import numpy as np
import os
import re
import shutil
import requests
from bs4 import BeautifulSoup
from spellchecker import SpellChecker
from whoosh import index
from whoosh import scoring
from whoosh.query import Phrase
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import MultifieldParser, PhrasePlugin, FuzzyTermPlugin, QueryParser, OrGroup
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from math import log
import gradio as gr

In [None]:
import nltk
# Fetch the NLTK data packages that the text helpers in this notebook rely on.
nltk.download('stopwords')  # word lists read through stopwords.words("english")
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize
nltk.download('punkt_tab')  # presumably the table-based punkt data newer NLTK releases expect - TODO confirm
nltk.download('wordnet')  # presumably the lexical database behind WordNetLemmatizer

In [None]:
# NOTE(review): the pipeline object built here is not bound to a name, and no
# other cell visible in this notebook refers to it - confirm whether this cell
# is still needed.
pipeline("text2text-generation", model="t5-small")

In [None]:
# Analyzer shared by the `title` and `content` fields of the index schema.
analyzer = StemmingAnalyzer()

In [None]:
# Layout of the search index: one document per article.
schema = Schema(
    # Searchable, stemmed article title; stored so it can be shown in results.
    title=TEXT(stored=True, analyzer=analyzer),
    # A URL is an opaque identifier, not prose: ID indexes it as one single
    # term instead of tokenizing it, and `unique` marks it as the document key.
    link=ID(stored=True, unique=True),
    # Searchable, stemmed page text; stored because the TF-IDF re-ranking
    # step reads it back from the search hits.
    content=TEXT(stored=True, analyzer=analyzer)
)

In [None]:
def preprocess_text(text):
    """Normalize free text for matching.

    The text is lower-cased, split into tokens with ``word_tokenize`` and each
    token is passed through a ``WordNetLemmatizer``.

    Args:
        text: The string to normalize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(lowered)
    return " ".join(map(wnl.lemmatize, tokens))

# Documents

In [None]:
# Start from a clean slate: if an index directory from an earlier run is still
# on disk, delete it (and everything in it) before the index is built again.
index_dir = "indexdir"

stale_index_present = os.path.exists(index_dir)
if stale_index_present:
    shutil.rmtree(index_dir)

# Wikipedia pages that make up the search corpus, listed topic by topic.
# The `articles` list built further down covers these same pages in this same
# order, pairing each one with a display title.
urls = [
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
    "https://en.wikipedia.org/wiki/Artificial_intelligence_in_video_games",
    "https://en.wikipedia.org/wiki/Artificial_intelligence_in_healthcare",
    "https://en.wikipedia.org/wiki/Artificial_intelligence_arms_race",
    "https://en.wikipedia.org/wiki/Generative_artificial_intelligence",
    "https://en.wikipedia.org/wiki/Ethics_of_artificial_intelligence",
    "https://en.wikipedia.org/wiki/Knowledge_representation_and_reasoning",
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/Robotics",
    "https://en.wikipedia.org/wiki/AI_safety",
    "https://en.wikipedia.org/wiki/Cloud_computing",
    "https://en.wikipedia.org/wiki/Cloud-native_computing",
    "https://en.wikipedia.org/wiki/Cloud_computing_security",
    "https://en.wikipedia.org/wiki/History_of_cloud_computing",
    "https://en.wikipedia.org/wiki/Cloud_computing_research",
    "https://en.wikipedia.org/wiki/Cloud-computing_comparison",
    "https://en.wikipedia.org/wiki/Bioinformatics",
    "https://en.wikipedia.org/wiki/Bioinformatics_discovery_of_non-coding_RNAs",
    "https://en.wikipedia.org/wiki/Structural_bioinformatics",
    "https://en.wikipedia.org/wiki/Data_mining",
    "https://en.wikipedia.org/wiki/Educational_data_mining",
    "https://en.wikipedia.org/wiki/Data_stream_mining",
    "https://en.wikipedia.org/wiki/Relational_data_mining",
    "https://en.wikipedia.org/wiki/Text_mining",
    "https://en.wikipedia.org/wiki/Big_data",
    "https://en.wikipedia.org/wiki/Big_data_ethics",
    "https://en.wikipedia.org/wiki/Big_data_maturity_model",
    "https://en.wikipedia.org/wiki/Big_Data_Scoring",
    "https://en.wikipedia.org/wiki/Machine_learning",
    "https://en.wikipedia.org/wiki/Neural_network_(machine_learning)",
    "https://en.wikipedia.org/wiki/Active_learning_(machine_learning)",
    "https://en.wikipedia.org/wiki/Quantum_machine_learning",
    "https://en.wikipedia.org/wiki/Feature_(machine_learning)",
    "https://en.wikipedia.org/wiki/Automated_machine_learning",
    "https://en.wikipedia.org/wiki/Boosting_(machine_learning)",
    "https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)",
    "https://en.wikipedia.org/wiki/Machine_learning_in_bioinformatics",
    "https://en.wikipedia.org/wiki/Reinforcement_learning",
    "https://en.wikipedia.org/wiki/Data_science",
    "https://en.wikipedia.org/wiki/Data_analysis",
    "https://en.wikipedia.org/wiki/Social_data_science",
    "https://en.wikipedia.org/wiki/Data_engineering",
    "https://en.wikipedia.org/wiki/Data_Science_and_Predictive_Analytics",
    "https://en.wikipedia.org/wiki/Predictive_analytics",
    "https://en.wikipedia.org/wiki/Learning_analytics",
    "https://en.wikipedia.org/wiki/Cybersecurity_engineering",
    "https://en.wikipedia.org/wiki/Cyber-security_regulation",
    "https://en.wikipedia.org/wiki/Information_security_standards",
    "https://en.wikipedia.org/wiki/Network_security",
    "https://en.wikipedia.org/wiki/National_Cyber_Security_Awareness_Month"
]

def fetch_content(url, timeout=10):
    """Download a web page and return the text it contains.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot stall the whole indexing cell.

    Returns:
        The text extracted from the page's HTML, or "" when the download
        fails or the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures; otherwise the body of an error
        # page would be returned and indexed as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on with
        # the remaining pages.
        print(f"Failed to fetch content from {url}: {e}")
        return ""

# Bind `ix` to the Whoosh index stored in `index_dir`: reuse the index when the
# directory is already on disk, otherwise create the directory and a fresh
# index laid out according to `schema`.
index_dir = "indexdir"

if os.path.exists(index_dir):
    ix = open_dir(index_dir)
else:
    os.mkdir(index_dir)
    ix = create_in(index_dir, schema)

# Display titles for the corpus, in the same order as `urls`. Keeping only the
# titles here means every URL is written down once (in `urls`) instead of
# being repeated for both the "link" and the fetch call of each article.
article_titles = [
    "Artificial Intelligence",
    "Artificial Intelligence in Video Games",
    "Artificial Intelligence in Healthcare",
    "Artificial Intelligence Arms Race",
    "Generative Artificial Intelligence",
    "Ethics of Artificial Intelligence",
    "Knowledge Representation and Reasoning",
    "Natural Language Processing",
    "Robotics",
    "AI Safety",
    "Cloud Computing",
    "Cloud-Native Computing",
    "Cloud Computing Security",
    "History of Cloud Computing",
    "Cloud Computing Research",
    "Cloud Computing Comparison",
    "Bioinformatics",
    "Bioinformatics Discovery of Non-coding RNAs",
    "Structural Bioinformatics",
    "Data Mining",
    "Educational Data Mining",
    "Data Stream Mining",
    "Relational Data Mining",
    "Text Mining",
    "Big Data",
    "Big Data Ethics",
    "Big Data Maturity Model",
    "Big Data Scoring",
    "Machine Learning",
    "Neural Network (Machine Learning)",
    "Active Learning (Machine Learning)",
    "Quantum Machine Learning",
    "Feature (Machine Learning)",
    "Automated Machine Learning",
    "Boosting (Machine Learning)",
    "Transformer (Deep Learning Architecture)",
    "Machine Learning in Bioinformatics",
    "Reinforcement Learning",
    "Data Science",
    "Data Analysis",
    "Social Data Science",
    "Data Engineering",
    "Data Science and Predictive Analytics",
    "Predictive Analytics",
    "Learning Analytics",
    "Cybersecurity Engineering",
    "Cyber Security Regulation",
    "Information Security Standards",
    "Network Security",
    "National Cyber Security Awareness Month",
]

# zip() would silently drop entries if the two lists drifted apart, so fail
# loudly instead of indexing an incomplete or mislabeled corpus.
if len(article_titles) != len(urls):
    raise ValueError(
        f"{len(article_titles)} titles for {len(urls)} urls: the lists must line up one-to-one"
    )

# One record per page; the page text is downloaded here, in list order.
articles = [
    {"title": title, "link": url, "content": fetch_content(url)}
    for title, url in zip(article_titles, urls)
]

# Add every article to the index exactly once, keyed by its URL.
seen_urls = set()

# Used as a context manager, the writer commits when the block finishes and
# cancels (releasing the index write lock) if indexing fails part-way through.
with ix.writer() as writer:
    for article in articles:
        link = article["link"]
        if link in seen_urls:
            continue
        seen_urls.add(link)

        # The raw title and page text are indexed, as before; the schema's
        # analyzer normalizes the terms at index time. The earlier
        # preprocess_text() calls were dropped: their results were never used
        # (and the "title" one was computed from the link), so they only cost
        # time.
        writer.add_document(title=article["title"], link=link, content=article["content"])

# BM25 retrieval

In [None]:
def expand_abbreviations(query):
    """Lower-case a query and spell out the abbreviations it contains.

    Only whole tokens are expanded: "ai" becomes "artificial intelligence",
    while words that merely contain those letters ("brain", "email", "html")
    are left untouched. All replacements happen in a single pass, so text
    produced by one expansion is never scanned again.

    Args:
        query: The user's query string.

    Returns:
        The lower-cased query with every known abbreviation expanded.
    """
    abbreviations = {"ai": "artificial intelligence",
                     "ml": "machine learning",
                     "nlp": "natural language processing",
                     "iot": "internet of things",
                     "NASA": "national aeronautics and space administration",
                     "etc": "et cetera",
                     "i.e.": "that is",
                     "e.g.": "for example"
                     }
    query = query.lower()
    # The query is lower-cased, so the lookup keys must be too ("NASA").
    expansions = {abbr.lower(): full_form.lower() for abbr, full_form in abbreviations.items()}
    # Longest alternatives first, each escaped because some contain dots.
    alternatives = "|".join(re.escape(abbr) for abbr in sorted(expansions, key=len, reverse=True))
    # The look-arounds require a non-word character (or the string edge) on
    # both sides, which is what keeps "ai" from matching inside "brain".
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")
    return pattern.sub(lambda found: expansions[found.group(0)], query)

def correct_spelling(query):
    """Lower-case a query and spell-correct it word by word.

    Args:
        query: The user's query string.

    Returns:
        The whitespace-separated words of the lower-cased query, each replaced
        by the spell checker's suggestion, joined by single spaces. Words the
        checker has no suggestion for are kept as typed.
    """
    spell = SpellChecker()
    corrected_words = []

    for word in query.lower().split():
        # correction() returns None when it finds no candidate; fall back to
        # the original word so that joining the list cannot fail on None.
        corrected_words.append(spell.correction(word) or word)

    return " ".join(corrected_words)

def remove_stop_words(query):
    """Lower-case a query and drop its English stop words.

    Args:
        query: The user's query string.

    Returns:
        The tokens of the lower-cased query that are not in NLTK's English
        stop-word list, joined by single spaces.
    """
    lowered = query.lower()
    english_stops = frozenset(stopwords.words("english"))
    kept = []
    for token in word_tokenize(lowered):
        if token in english_stops:
            continue
        kept.append(token)
    return " ".join(kept)

def retrieve_articles(query, exact_phrase=None):
    """Search the on-disk index and return the best BM25F-ranked articles.

    The query is spell-corrected, abbreviation-expanded, stripped of stop
    words and lemmatized before it is parsed. When ``exact_phrase`` is given
    the search is a phrase search on the ``content`` field instead, and the
    free-text query is not used.

    Args:
        query: Free-text query.
        exact_phrase: Optional phrase that must occur in the article text.

    Returns:
        A list of at most five dicts with the keys "title", "link" and
        "score", best match first. The list is empty when the index directory
        is missing, nothing matches, or the search raises.
    """
    results = []

    if not os.path.exists("indexdir"):
        print("Index directory does not exist!")
        return results

    # Named `ix` so the `index` module imported from whoosh is not shadowed.
    ix = open_dir("indexdir")

    corr_query = correct_spelling(query)
    corr_exp_query = expand_abbreviations(corr_query)
    no_stop_query = remove_stop_words(corr_exp_query)
    final_query = preprocess_text(no_stop_query)
    exact_phrase = preprocess_text(exact_phrase) if exact_phrase else None

    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        print(f"Number of documents in index: {searcher.doc_count()}")

        parser = MultifieldParser(["title", "content"], schema=ix.schema, group=OrGroup)

        try:
            phrase_terms = []
            if exact_phrase:
                # Phrase() compares its words with the index terms as-is, so
                # the phrase must go through the same analyzer the content
                # field was indexed with; otherwise un-stemmed words can
                # never match the stemmed terms in the index.
                content_analyzer = ix.schema["content"].analyzer
                phrase_terms = [token.text for token in content_analyzer(exact_phrase)]

            if phrase_terms:
                query_obj = Phrase("content", phrase_terms)
            else:
                # Parse the normalized query built above; the raw input used
                # to be parsed here, which threw the normalization away.
                query_obj = parser.parse(final_query)

            # Only five results are returned, so only five are requested.
            hits = searcher.search(query_obj, limit=5)

            for hit in hits:
                results.append({
                    "title": hit["title"],
                    "link": hit["link"],
                    "score": hit.score
                })

        except Exception as e:
            print(f"Error during search: {e}")

    return results

def search_query(query):
    """Gradio callback: normalize the user's query and format the search results.

    A double-quoted part of the input is taken out and passed on as an exact
    phrase. The rest is stripped of stop words, abbreviation-expanded,
    spell-corrected and lemmatized; each intermediate form is printed.

    Args:
        query: The text typed by the user.

    Returns:
        One "title: link" line per result, separated by blank lines, or
        "No results found." when the search returns nothing.
    """
    query = query.lower()
    exact_phrase = None

    match = re.search(r'"(.*?)"', query)
    if match:
        exact_phrase = match.group(1)
        query = query.replace(match.group(0), '').strip()

    clean_q = remove_stop_words(query)
    print("clean: ", clean_q)
    clean_exp = expand_abbreviations(clean_q)
    print("clean expanded: ", clean_exp)
    clean_exp_corrected_q = correct_spelling(clean_exp)
    print("clean expanded corrected: ", clean_exp_corrected_q)
    # preprocess_text is the lemmatizing helper defined earlier in the
    # notebook. apply_lemmatization, which was called here, is only defined in
    # a later cell, so this callback failed with NameError until that cell ran.
    final_query = preprocess_text(clean_exp_corrected_q)
    print("final: ", final_query)

    results = retrieve_articles(final_query, exact_phrase=exact_phrase)

    if results:
        # The format string used to end with a stray ")" after the link.
        return "\n\n".join([f"{res['title']}: {res['link']}" for res in results])
    else:
        return "No results found."


# Web front end for this cell's search: one text box in, one text box out,
# with the `search_query` function defined above as the callback.
iface = gr.Interface(
    title="Information Retrieval System",
    description="Enter a query to retrieve relevant articles.",
    fn=search_query,
    inputs="text",
    outputs="text",
)

iface.launch()

# TF-IDF and cosine similarity

In [None]:
def expand_abbreviations(query):
    """Spell out the abbreviations contained in a query.

    Only whole tokens are expanded: "ai" becomes "artificial intelligence",
    while words that merely contain those letters ("brain", "email", "html")
    are left untouched. Abbreviations are recognized in any letter case; the
    rest of the query keeps the case it came in with. All replacements happen
    in a single pass, so text produced by one expansion is never scanned again.

    Args:
        query: The user's query string.

    Returns:
        The query with every known abbreviation replaced by its lower-case
        full form.
    """
    abbreviations = {"ai": "artificial intelligence",
                     "ml": "machine learning",
                     "nlp": "natural language processing",
                     "iot": "internet of things",
                     "NASA": "national aeronautics and space administration",
                     "etc": "et cetera",
                     "i.e.": "that is",
                     "e.g.": "for example"
                     }
    # Lower-cased keys so a match can be looked up whatever case it was typed in.
    expansions = {abbr.lower(): full_form.lower() for abbr, full_form in abbreviations.items()}
    # Longest alternatives first, each escaped because some contain dots.
    alternatives = "|".join(re.escape(abbr) for abbr in sorted(expansions, key=len, reverse=True))
    # The look-arounds require a non-word character (or the string edge) on
    # both sides, which is what keeps "ai" from matching inside "brain".
    pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)
    return pattern.sub(lambda found: expansions[found.group(0).lower()], query)

def correct_spelling(query):
    """Spell-correct a query word by word.

    Args:
        query: The user's query string.

    Returns:
        The whitespace-separated words of the query, each replaced by the
        spell checker's suggestion, joined by single spaces. Words the checker
        has no suggestion for are kept as typed.
    """
    spell = SpellChecker()
    corrected_words = []

    for word in query.split():
        # correction() returns None when it finds no candidate; fall back to
        # the original word so that joining the list cannot fail on None.
        corrected_words.append(spell.correction(word) or word)

    return " ".join(corrected_words)

def remove_stop_words(query):
    """Drop English stop words from a query.

    The comparison ignores letter case, so "The" is removed just like "the";
    the tokens that are kept retain their original case.

    Args:
        query: The user's query string.

    Returns:
        The tokens of the query that are not in NLTK's English stop-word
        list, joined by single spaces.
    """
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(query)
    # The stop-word list is lower-case, so compare the lower-cased token.
    filtered_query = [w for w in word_tokens if w.lower() not in stop_words]
    return " ".join(filtered_query)

def apply_lemmatization(query):
    """Lower-case a query and reduce each of its tokens to its lemma.

    Args:
        query: The query string to normalize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = query.lower()
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(lowered):
        lemmas.append(wnl.lemmatize(token))
    return " ".join(lemmas)

def retrieve_articles(query, exact_phrase=None):
    """Search the on-disk index and re-rank the hits by TF-IDF cosine similarity.

    Up to ten candidate articles are fetched from the index. A TF-IDF model is
    then fitted on the text of those candidates, and they are ordered by
    cosine similarity to the user's request (exact phrase plus free text).

    Args:
        query: Free-text query; may be empty when only a phrase was given.
        exact_phrase: Optional phrase that must occur in a matching article.

    Returns:
        A list of at most five dicts with the keys "title", "link" and
        "similarity", most similar first. The list is empty when the index
        directory is missing, nothing matches, or the search raises.
    """
    results = []

    if not os.path.exists("indexdir"):
        print("Index directory does not exist!")
        return results

    # Named `ix` so the `index` module imported from whoosh is not shadowed.
    ix = open_dir("indexdir")
    query = query.lower().strip()

    with ix.searcher() as searcher:
        print(f"Number of documents in index: {searcher.doc_count()}")

        parser = MultifieldParser(["title", "content"], schema=ix.schema)

        try:
            # Only combine the phrase with the free text when there is free
            # text; a phrase-only search used to produce the malformed query
            # string '("phrase" AND )'.
            if exact_phrase and query:
                query_obj = parser.parse(f'("{exact_phrase}" AND {query})')
            elif exact_phrase:
                query_obj = parser.parse(f'"{exact_phrase}"')
            else:
                query_obj = parser.parse(query)

            hits = searcher.search(query_obj, limit=10)

            documents = [hit["content"] for hit in hits]
            if not documents:
                # Nothing matched; TfidfVectorizer cannot be fitted on an
                # empty corpus, so stop here instead of raising.
                return results
            titles = [hit["title"] for hit in hits]
            links = [hit["link"] for hit in hits]

            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(documents)
            # Rank against everything the user asked for. With the phrase
            # left out, a phrase-only search scored every hit 0.
            ranking_text = " ".join(part for part in (exact_phrase, query) if part)
            query_vector = vectorizer.transform([ranking_text])

            similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

            # Stable descending sort: hits with equal similarity keep the
            # order the index returned them in, rather than being reversed.
            ranked_indices = np.argsort(-similarities, kind="stable")

            for idx in ranked_indices[:5]:
                results.append({
                    "title": titles[idx],
                    "link": links[idx],
                    "similarity": float(similarities[idx])
                })

        except Exception as e:
            print(f"Error during search: {e}")

    return results

def search_query(query):
    """Gradio callback: normalize the user's query and format the search results.

    A double-quoted part of the input is taken out and passed on as an exact
    phrase. The rest is stripped of stop words, abbreviation-expanded,
    spell-corrected and lemmatized; each intermediate form is printed.

    Args:
        query: The text typed by the user.

    Returns:
        One "title: link" line per result, separated by blank lines, or
        "No results found." when the search returns nothing.
    """
    exact_phrase = None
    query = query.lower()

    quoted = re.search(r'"(.*?)"', query) if '"' in query else None
    if quoted:
        exact_phrase = quoted.group(1)
        query = query.replace(quoted.group(0), '').strip()

    without_stops = remove_stop_words(query)
    print("clean: ", without_stops)
    expanded = expand_abbreviations(without_stops)
    print("clean expanded: ", expanded)
    corrected = correct_spelling(expanded)
    print("clean expanded corrected: ", corrected)
    final_query = apply_lemmatization(corrected)
    print("final: ", final_query)

    results = retrieve_articles(final_query, exact_phrase=exact_phrase)

    if not results:
        return "No results found."
    return "\n\n".join(f"{res['title']}: {res['link']}" for res in results)


# Web front end for this cell's search: one text box in, one text box out,
# with the `search_query` function defined above as the callback.
iface = gr.Interface(
    title="Information Retrieval System",
    description="Enter a query to retrieve relevant articles.",
    fn=search_query,
    inputs="text",
    outputs="text",
)

iface.launch()