# Search Engine

This Jupyter Notebook contains a search engine program. The program allows you to search for articles based on an Arxiv keyword, a subreddit, and general keywords. 

To use the search engine, follow these steps:
1. Fill in the three text fields with your desired values for the Arxiv keyword, the subreddit, and the general keywords.
2. Click the "Search" button.
3. The program will display 5 results, including the article URL, a similarity score with your keywords, and an overview of the article.

Please note that this program requires the necessary modules and configuration to be set up correctly. See README.md for the full list of prerequisites before running the program.

Happy searching! :rocket:

In [1]:
# Imports

import logging
import os
import config   
import pandas as pd
from ipywidgets import widgets
import ipywidgets as widgets
from IPython.display import display
from modules.corpus import Corpus
from utils.program import full_search_engine_proc
from utils.tools import clean_text_util


In [2]:
# Application set up

# Parent directory of the current working directory.
path = os.path.split(os.getcwd())[0]
# Location of the 'data' folder under that parent directory.
data_path = os.path.join(path, 'data')
# Module-level list, created empty.
collection = []

def run_process(arxiv_kw:str, subreddit:str, keywords:str) -> pd.DataFrame:
    """Build a corpus from the given sources and search it with the keywords.

    Args:
        arxiv_kw: Keyword passed to ``full_search_engine_proc`` as ``arxiv_kw``.
        subreddit: Subreddit name passed to ``full_search_engine_proc`` as
            ``subreddit_kw``.
        keywords: Free-text query; it is run through ``clean_text_util``
            before being handed to ``search_engine``.

    Returns:
        A DataFrame built from whatever ``search_engine`` returns.

    Raises:
        TypeError: Logged, then re-raised with its original message and
            traceback.
        ValueError: Logged, then re-raised with its original message and
            traceback.
    """
    from utils.program import search_engine

    tokens_kw = clean_text_util(keywords)
    try:
        corpus = full_search_engine_proc(arxiv_kw=arxiv_kw, subreddit_kw=subreddit)
        # NOTE(review): the statistics returned here were never used by the
        # original code; the call is kept in case get_stats() has side effects
        # on the corpus -- confirm and remove if it does not.
        corpus.get_stats()
        results = search_engine(corpus.get_all_docs(), tokens_kw)
    except (TypeError, ValueError) as err:
        logging.error(err)
        # Bare ``raise`` keeps the original exception (type, message and
        # traceback) instead of replacing it with an empty one.
        raise

    return pd.DataFrame(results)


In [3]:
# Form set up

# Subreddit input: caption plus a text box pre-filled with a default value
subreddit_label = widgets.Label(value='Subreddit:')
subreddit = widgets.Text(value='MachineLearning')

# Arxiv keyword input
arxiv_kw_label = widgets.Label(value='Arxiv Keyword:')
arxiv_kw = widgets.Text(value='machine learning')

# General keywords input
keywords_label = widgets.Label(value='Keywords:')
keywords = widgets.Text(value='Llama machine learning')

# Show every caption followed by its text box, in form order
display(
    subreddit_label, subreddit,
    arxiv_kw_label, arxiv_kw,
    keywords_label, keywords,
)

# Button that launches the search (displayed further down in the cell)
button = widgets.Button(description="Search")

# Function to run when button is clicked
def pass_values_to_button(button):
    """Click handler: run the search with the form's values and print the top hits.

    The current contents of the three text boxes are stored on ``button`` as
    ``subreddit_kw``, ``arxiv_kw`` and ``keywords``, then passed to
    ``run_process``. The first five rows of the returned DataFrame are printed.
    """
    # Snapshot the form contents onto the button object.
    button.subreddit_kw = subreddit.value
    button.arxiv_kw = arxiv_kw.value
    button.keywords = keywords.value

    search_args = {
        'arxiv_kw': button.arxiv_kw,
        'subreddit': button.subreddit_kw,
        'keywords': button.keywords,
    }
    result = run_process(**search_args)

    # Lift pandas' display limits (columns, rows, column width).
    for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
        pd.set_option(option_name, None)

    # Show first 5 results
    top_rows = result.iloc[:5]
    print(top_rows.to_string())

# Register pass_values_to_button as the callback for the button's click event
button.on_click(pass_values_to_button)

# Render the button in the cell output, below the form fields
display(button)

Label(value='Subreddit:')

Text(value='MachineLearning')

Label(value='Arxiv Keyword:')

Text(value='machine learning')

Label(value='Keywords:')

Text(value='Llama machine learning')

Button(description='Search', style=ButtonStyle())

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

ERROR:root:'Corpus' object is not iterable


TypeError: 