Version #1

In [None]:
import requests
import re
from collections import defaultdict
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import PorterStemmer

nltk.download('wordnet')
nltk.download('stopwords')

class Crawler:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        self.inverted_index = defaultdict(list)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fetch(self, url):
        print(f'Now fetching: {url}')
        try:
            res = requests.get(url).content
            return res
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return b''

    def get_current_url(self):
        if not self.to_visit:
            return None
        res = self.to_visit.pop(0)
        while res in self.visited:
            if not self.to_visit:
                return None
            res = self.to_visit.pop(0)
        return res

    def get_links(self, content):
        urls = re.findall(r'<a href="([^"]+)"', str(content))
        for url in urls:
            if re.match(r'^http', url) and url not in self.visited and url not in self.to_visit:
                self.to_visit.append(url)

    def get_paragraphs(self, content):
        return re.findall(r'<p>(.*?)</p>', str(content), re.DOTALL)

    def preprocess_text(self, text):

        text = re.sub(r'[^\w\s]', '', text.lower())

        words = [word for word in text.split() if word not in self.stop_words]

        stemmed_words = [self.stemmer.stem(word) for word in words]
        return stemmed_words

    def update_inverted_index(self, content, url):
        words = self.preprocess_text(content.decode('utf-8'))
        for word in set(words):
            self.inverted_index[word].append(url)

    def search_word_in_index(self, word):
        word = self.stemmer.stem(word.lower())
        found_urls = self.inverted_index.get(word, [])
        print(f'Word "{word}" found in: {found_urls}')
        return found_urls

    def search_words_and_in_index(self, word1, word2):
        urls_word1 = set(self.search_word_in_index(word1))
        urls_word2 = set(self.search_word_in_index(word2))
        common_urls = urls_word1.intersection(urls_word2)
        print(f'Words "{word1}" and "{word2}" found together in: {list(common_urls)}')
        return list(common_urls)

    def search_words_or_in_index(self, word1, word2):
        urls_word1 = set(self.search_word_in_index(word1))
        urls_word2 = set(self.search_word_in_index(word2))
        all_urls = urls_word1.union(urls_word2)
        print(f'Word "{word1}" or "{word2}" found in: {list(all_urls)}')
        return list(all_urls)

    def search_phrase_exclude_word_in_index(self, phrase, exclude_word):
        urls_with_phrase = set(self.search_word_in_index(phrase))
        urls_with_exclude_word = set(self.search_word_in_index(exclude_word))
        final_urls = urls_with_phrase.difference(urls_with_exclude_word)
        print(f'Phrase "{phrase}" found without "{exclude_word}" in: {list(final_urls)}')
        return list(final_urls)

    def crawl(self, url, depth=15):
        self.to_visit.append(url)
        while len(self.visited) < depth and self.to_visit:
            current_url = self.get_current_url()
            if current_url is None:
                break

            content = self.fetch(current_url)
            self.visited.add(current_url)
            self.get_links(content)

            paragraphs = self.get_paragraphs(content)
            for paragraph in paragraphs:
                self.update_inverted_index(paragraph.encode('utf-8'), current_url)

        print('Visited URLs:', len(self.visited))


crawler = Crawler()
root_url = 'https://www.python.org'

print(f'\n--- Starting crawl for site: {root_url} ---')
crawler.crawl(root_url, depth=15)
print(f'--- Finished processing site: {root_url} ---')


print("\n--- Search Examples ---")
crawler.search_word_in_index("Python")
crawler.search_words_and_in_index("Python", "programming")
crawler.search_words_or_in_index("Python", "Java")
crawler.search_phrase_exclude_word_in_index("programming", "Java")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



--- Starting crawl for site: https://www.python.org ---
Now fetching: https://www.python.org
Now fetching: http://browsehappy.com/
Now fetching: https://www.python.org/psf/
Now fetching: https://docs.python.org
Now fetching: https://pypi.org/
Now fetching: https://psfmember.org/civicrm/contribute/transact?reset=1&id=2
Now fetching: https://www.linkedin.com/company/python-software-foundation/
Now fetching: https://fosstodon.org/@ThePSF
Now fetching: https://twitter.com/ThePSF
Now fetching: http://brochure.getpython.info/
Now fetching: https://docs.python.org/3/license.html
Now fetching: https://wiki.python.org/moin/BeginnersGuide
Now fetching: https://devguide.python.org/
Now fetching: https://docs.python.org/faq/
Now fetching: http://wiki.python.org/moin/Languages
Visited URLs: 15
--- Finished processing site: https://www.python.org ---

--- Search Examples ---
Word "python" found in: ['https://www.python.org', 'https://www.python.org', 'https://www.python.org', 'https://www.python.or

['https://www.python.org/psf/',
 'https://www.python.org',
 'https://psfmember.org/civicrm/contribute/transact?reset=1&id=2']

Version #2



تم تطوير عمليات إدخال

 عمل  لعرض النتائج وعمليات البحث البدائية demo function

In [None]:
import requests
import re
from collections import defaultdict
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

class AdvancedCrawler:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        self.inverted_index = defaultdict(list)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def fetch(self, url):
        """Fetch content from a URL."""
        try:
            res = requests.get(url, timeout=10)
            if res.status_code == 200:
                return res.content
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
        return b''

    def parse_links(self, content, base_url):
        """Extract and normalize links from HTML content."""
        links = re.findall(r'<a href="([^"]+)"', str(content))
        for link in links:
            if link.startswith('/'):
                link = base_url + link
            if link.startswith('http') and link not in self.visited:
                self.to_visit.append(link)

    def tokenize_and_preprocess(self, text):
        """Tokenize and preprocess text."""
        tokens = word_tokenize(text.lower())
        tokens = [self.stemmer.stem(word) for word in tokens if word.isalnum() and word not in self.stop_words]
        return tokens

    def update_inverted_index(self, content, url):
        """Update the inverted index with preprocessed tokens."""
        text = content.decode('utf-8', errors='ignore')
        tokens = self.tokenize_and_preprocess(text)
        for token in set(tokens):
            self.inverted_index[token].append(url)

    def search_word_in_index(self, word):
        """Search for a single word in the inverted index."""
        word = self.stemmer.stem(word.lower())
        return self.inverted_index.get(word, [])

    def search_phrase_exclude_word_in_index(self, phrase, exclude_word):
        """Search for a phrase excluding a specific word."""
        phrase_urls = set(self.search_word_in_index(phrase))
        exclude_urls = set(self.search_word_in_index(exclude_word))
        return list(phrase_urls - exclude_urls)

    def crawl(self, root_url, depth):
        """Perform crawling starting from a root URL up to a specific depth."""
        self.to_visit.append(root_url)
        while len(self.visited) < depth and self.to_visit:
            current_url = self.to_visit.pop(0)
            if current_url in self.visited:
                continue

            content = self.fetch(current_url)
            if content:
                self.visited.add(current_url)
                self.parse_links(content, root_url)
                self.update_inverted_index(content, current_url)

            time.sleep(1)  # Politeness delay

    def run_demo(self):
        """Run a search demo."""
        query = input("Enter search query: ")
        exclude = input("Enter word to exclude (optional): ").strip()
        if exclude:
            results = self.search_phrase_exclude_word_in_index(query, exclude)
        else:
            results = self.search_word_in_index(query)

        if results:
            print(f"Found results in the following URLs: {results}")
        else:
            print("No results found.")

if __name__ == "__main__":
    root_url = input("Enter the root URL to start crawling: ")
    depth = int(input("Enter the crawl depth: "))
    crawler = AdvancedCrawler()
    crawler.crawl(root_url, depth)
    print(f"Crawl complete. Indexed {len(crawler.visited)} pages.")
    crawler.run_demo()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the root URL to start crawling: https://www.bemyeyes.com/
Enter the crawl depth: 15
Crawl complete. Indexed 15 pages.
Enter search query: solutions
Enter word to exclude (optional): 
Found results in the following URLs: ['https://www.bemyeyes.com/', 'https://www.bemyeyes.com/blog/be-my-eyes-meta-accessibility-partnership', 'https://www.bemyeyes.com//', 'https://www.bemyeyes.com//solutions', 'https://www.bemyeyes.com//accessible-customer-service', 'https://www.bemyeyes.com//customers', 'https://www.bemyeyes.com//corporate-volunteering', 'https://www.bemyeyes.com//be-my-eyes-for-work', 'https://www.bemyeyes.com//products', 'https://www.bemyeyes.com//mobile-app', 'https://www.bemyeyes.com//be-my-eyes-for-windows', 'https://www.bemyeyes.com//ray-ban-meta', 'https://www.bemyeyes.com//accessible-service-suite', 'https://www.bemyeyes.com//service-directory', 'https://www.bemyeyes.com//service-ai']


Version #3


Final Version

تم تطوير عمليات البحث بشكل كامل

تم إضافة and ,or

تم إضافة methods

1- display reults لعرض النتائج عمودياً


2-save results


In [None]:
#mid project final version
import requests
import re
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

class Crawler:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        self.inverted_index = defaultdict(list)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

# Fetch content from a URL
    def fetch(self, url):

        try:
            res = requests.get(url)
            if res.status_code == 200:
                return res.content
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
        return b''

    def get_links(self, content, base_url):
        links = re.findall(r'<a href="([^"]+)"', str(content))
        for link in links:
            if link.startswith('/'):
                link = base_url + link
            if link.startswith('http') and link not in self.visited:
                self.to_visit.append(link)

    def tokenize_and_preprocess(self, text):
        tokens = word_tokenize(text.lower())
        tokens = [self.stemmer.stem(word) for word in tokens if word.isalnum() and word not in self.stop_words]
        return tokens


    def update_inverted_index(self, content, url):

        text = content.decode('utf-8', errors='ignore')
        tokens = self.tokenize_and_preprocess(text)
        for token in set(tokens):
            self.inverted_index[token].append(url)


    def search_word_in_index(self, word):

        word = self.stemmer.stem(word.lower())
        return self.inverted_index.get(word, [])

    def search_words_and_in_index(self, word1, word2):

        urls_word1 = set(self.search_word_in_index(word1))
        urls_word2 = set(self.search_word_in_index(word2))
        common_urls = urls_word1.intersection(urls_word2)
        return list(common_urls)

    def search_words_or_in_index(self, word1, word2):

        urls_word1 = set(self.search_word_in_index(word1))
        urls_word2 = set(self.search_word_in_index(word2))
        all_urls = urls_word1.union(urls_word2)
        return list(all_urls)

    def search_phrase_exclude_word_in_index(self, phrase, exclude_word):

        phrase_urls = set(self.search_word_in_index(phrase))
        exclude_urls = set(self.search_word_in_index(exclude_word))
        return list(phrase_urls - exclude_urls)

    def display_results(self, results):

        if results:
            print("\n--- Results ---")
            for idx, url in enumerate(results, 1):
                print(f"{idx}. {url}")
        else:
            print("\n No results found.")

    def save_results(self, results):
        filepath = input("Enter the file name to save the results (e.g., results.txt): ")
        with open(filepath, 'w') as file:
            for url in results:
                file.write(url + '\n')
        print(f"Results have been saved in {filepath}")


    def crawl(self, root_url, depth):
        self.to_visit.append(root_url)
        while len(self.visited) < depth and self.to_visit:
            current_url = self.to_visit.pop(0)

            content = self.fetch(current_url)
            if content:
                self.visited.add(current_url)
                self.get_links(content, root_url)
                self.update_inverted_index(content, current_url)


    def run_demo(self):
        print("\n--- Search Options ---")
        print("1. Search for a single word.")
        print("2. Search for two words with AND condition.")
        print("3. Search for two words with OR condition.")
        print("4. Search for a phrase excluding a specific word.")

        choice = int(input("Enter the search option (1/2/3/4): "))
        if choice == 1:
            word = input("Enter the word to search: ")
            results = self.search_word_in_index(word)
        elif choice == 2:
            word1 = input("Enter the first word: ")
            word2 = input("Enter the second word: ")
            results = self.search_words_and_in_index(word1, word2)
        elif choice == 3:
            word1 = input("Enter the first word: ")
            word2 = input("Enter the second word: ")
            results = self.search_words_or_in_index(word1, word2)
        elif choice == 4:
            phrase = input("Enter the phrase: ")
            exclude_word = input("Enter the word to exclude: ")
            results = self.search_phrase_exclude_word_in_index(phrase, exclude_word)
        else:
            print("Invalid choice.")
            return

        if results:
            print(f"Found results in the following URLs: {results}")
        else:
            print("No results found.")

        self.display_results(results)

        if results:
            save = input("Do you want to save reults (y/n): ").lower()
            if save == 'y':
                self.save_results(results)

if __name__ == "__main__":
    root_url = input("Enter the root URL to start crawling: ")
    depth = int(input("Enter the crawl depth: "))
    crawler = Crawler()
    crawler.crawl(root_url, depth)
    print(f"Crawl complete. Indexed {len(crawler.visited)} pages.")
    crawler.run_demo()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Enter the root URL to start crawling: https://www.wikipedia.org/
Enter the crawl depth: 5
Crawl complete. Indexed 5 pages.

--- Search Options ---
1. Search for a single word.
2. Search for two words with AND condition.
3. Search for two words with OR condition.
4. Search for a phrase excluding a specific word.
Enter the search option (1/2/3/4): 1
Enter the word to search: search
Found results in the following URLs: ['https://www.wikipedia.org/', 'https://www.wikipedia.org///zh.wikipedia.org/', 'https://www.wikipedia.org///zh-min-nan.wikipedia.org/', 'https://www.wikipedia.org///sr.wikipedia.org/', 'https://www.wikipedia.org///zh-yue.wikipedia.org/']

--- Results ---
1. https://www.wikipedia.org/
2. https://www.wikipedia.org///zh.wikipedia.org/
3. https://www.wikipedia.org///zh-min-nan.wikipedia.org/
4. https://www.wikipedia.org///sr.wikipedia.org/
5. https://www.wikipedia.org///zh-yue.wikipedia.org/
Do you want to save reults (y/n): n


In [None]:
  !pip install nltk




Final project IRS


In [15]:
import requests
import re
from collections import defaultdict
import math
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download

# Download necessary NLTK data
download('punkt')
nltk.download ('punkt_tab')
download('stopwords')

class SearchEngine:
    def __init__(self):
        self.to_visit = []
        self.visited = set()
        self.inverted_index = defaultdict(list)
        self.tfidf_index = defaultdict(lambda: defaultdict(float))
        self.positional_index = defaultdict(lambda: defaultdict(list))
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.document_lengths = {}

    def fetch(self, url):
        try:
            res = requests.get(url)
            if res.status_code == 200:
                return res.content
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
        return b''

    def get_links(self, content, base_url):
        links = re.findall(r'<a href="([^"]+)"', str(content))
        for link in links:
            if link.startswith('/'):
                link = base_url + link
            if link.startswith('http') and link not in self.visited:
                self.to_visit.append(link)

    def tokenize_and_preprocess(self, text):
        tokens = word_tokenize(text.lower())
        return [
            self.stemmer.stem(word)
            for word in tokens if word.isalnum() and word not in self.stop_words
        ]

    def update_indexes(self, content, url):
        text = content.decode('utf-8', errors='ignore')
        tokens = self.tokenize_and_preprocess(text)
        self.document_lengths[url] = len(tokens)

        # Positional Index
        for position, token in enumerate(tokens):
            self.positional_index[token][url].append(position)

        # Inverted Index
        for token in set(tokens):
            self.inverted_index[token].append(url)

        # TF-IDF Index
        token_counts = defaultdict(int)
        for token in tokens:
            token_counts[token] += 1
        for token, count in token_counts.items():
            tf = count / len(tokens)
            self.tfidf_index[token][url] = tf

    def compute_idf(self):
        total_docs = len(self.visited)
        for token, doc_dict in self.tfidf_index.items():
            idf = math.log(total_docs / len(doc_dict))
            for url in doc_dict:
                self.tfidf_index[token][url] *= idf

    def search_word_tfidf(self, word):
        word = self.stemmer.stem(word.lower())
        return sorted(
            self.tfidf_index.get(word, {}).items(),
            key=lambda x: x[1],
            reverse=True
        )

    def search_phrase(self, phrase):
        tokens = self.tokenize_and_preprocess(phrase)
        if not tokens:
            return []

        result_urls = set(self.positional_index[tokens[0]])
        for token in tokens[1:]:
            result_urls.intersection_update(self.positional_index[token])

        phrase_results = []
        for url in result_urls:
            positions = [
                self.positional_index[token][url] for token in tokens
            ]
            for start_pos in positions[0]:
                if all((start_pos + i) in positions[i] for i in range(1, len(tokens))):
                    phrase_results.append(url)
                    break
        return phrase_results

    def display_positional_index(self):
        print("\n--- Positional Inverted Index ---")
        for term, doc_positions in self.positional_index.items():
            print(f"Term: {term}")
            for doc, positions in doc_positions.items():
                print(f"  Document: {doc}, Positions: {positions}")

    def crawl(self, root_url, depth):
        self.to_visit.append(root_url)
        while len(self.visited) < depth and self.to_visit:
            current_url = self.to_visit.pop(0)
            if current_url in self.visited:
                continue
            content = self.fetch(current_url)
            if content:
                self.visited.add(current_url)
                self.get_links(content, root_url)
                self.update_indexes(content, current_url)
        self.compute_idf()

    def run_demo(self):
        print("\n--- Search Options ---")
        print("1. Search for a single word (TF-IDF based).")
        print("2. Search for a phrase (using Positional Inverted Index).")
        choice = int(input("Enter your choice (1/2): "))
        if choice == 1:
            word = input("Enter the word to search: ")
            results = self.search_word_tfidf(word)
            if results:
                print("\n--- TF-IDF Search Results ---")
                for idx, (url, score) in enumerate(results, 1):
                    print(f"{idx}. {url} (TF-IDF Score: {score:.4f})")
            else:
                print("\nNo results found for the word.")
        elif choice == 2:
            phrase = input("Enter the phrase to search: ")
            results = self.search_phrase(phrase)
            if results:
                print("\n--- Phrase Search Results (Positional Index) ---")
                for idx, url in enumerate(results, 1):
                    print(f"{idx}. {url}")
            else:
                print("\nNo results found for the phrase.")
        else:
            print("Invalid choice.")


if __name__ == "__main__":
    root_url = input("Enter the root URL to start crawling: ")
    depth = int(input("Enter the crawl depth: "))
    engine = SearchEngine()
    engine.crawl(root_url, depth)
    print(f"\nCrawl complete. Indexed {len(engine.visited)} pages.")

    # Display Positional Index
    engine.display_positional_index()

    # Run the demo
    engine.run_demo()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Term: catalog
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5392, 10736]
Term: maintain
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5393, 6104, 6302, 9023]
Term: oscar
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5395, 5397, 10721, 10723]
Term: nierstrasz
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5396, 5398, 10722, 10724]
Term: geneva
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5400, 5402]
Term: wrote
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5403]
Term: perl
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5405, 5406, 5449, 5450, 8288]
Term: period
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5408, 9206]
Term: mirror
  Document: https://en.wikipedia.org/wiki/Web_search_engine, Positions: [5409, 152