<a href="https://colab.research.google.com/github/liorZucker11/cloud-computing/blob/main/create_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from nltk.stem import PorterStemmer
import re
import json

In [2]:
def fetch_page_and_sublinks(url, max_sublinks=50, required_substring="azure", timeout=10):
    """Crawl ``url`` depth-first and return the pages that were attempted.

    Args:
        url: Address of the first page to download.
        max_sublinks: Maximum number of pages that may be downloaded
            successfully (the start page counts towards this limit).
        required_substring: Only links whose absolute URL contains this text
            are followed. Pass ``""`` to follow every http(s) link.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict mapping each attempted URL (fragment removed, trailing ``/``
        added) to its parsed ``BeautifulSoup`` document, or to ``None`` when
        the request failed or did not answer with status 200.

    Note:
        The crawl is recursive; the nesting depth is bounded by
        ``max_sublinks``, so very large limits can hit Python's recursion
        limit.
    """
    results = {}
    visited_urls = set()

    def normalize(raw_url):
        # "#section" links point at the document itself, so the fragment is
        # dropped before the URL is used as a key.
        raw_url = raw_url.partition("#")[0]
        if raw_url and not raw_url.endswith("/"):
            raw_url += "/"
        return raw_url

    def fetch(page_url):
        if len(visited_urls) >= max_sublinks:
            return
        page_url = normalize(page_url)
        # `results` holds every URL already attempted (successfully or not),
        # so neither good nor broken pages are requested twice.
        if not page_url or page_url in results:
            return
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException:
            # Network-level failure: record it like a non-200 answer.
            results[page_url] = None
            return
        if response.status_code != 200:
            results[page_url] = None
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        results[page_url] = soup
        visited_urls.add(page_url)

        # Follow every anchor found in the page.
        for link in soup.find_all('a', href=True):
            # Normalize BEFORE the duplicate check; comparing the raw link
            # would miss pages stored under their slash-terminated form.
            full_url = normalize(urljoin(page_url, link.get('href')))
            if not full_url.startswith(("http://", "https://")):
                continue  # mailto:, javascript:, tel: ... are not pages
            if required_substring not in full_url:
                continue
            if full_url in results:
                continue
            fetch(full_url)  # Recursively fetch sublinks
            if len(visited_urls) >= max_sublinks:
                return

    fetch(url)
    return results

In [None]:
# Example usage: crawl the Azure landing page with the default page limit.
url = "https://azure.microsoft.com/en-us"  # Replace with the URL you want to fetch
data = fetch_page_and_sublinks(url)
# Keys are the URLs that were attempted; a URL maps to None when its response was not 200.
print(data.keys())

dict_keys(['https://azure.microsoft.com/en-us/', 'https://portal.azure.com/', 'https://azure.microsoft.com/en-us/free/', 'https://azure.microsoft.com/en-us/contact/', 'https://azure.microsoft.com/en-us/explore/', 'https://azure.microsoft.com/en-us/explore/global-infrastructure/', 'https://azure.microsoft.com/en-us/solutions/cloud-economics/', 'https://azure.microsoft.com/en-us/solutions/cloud-enablement/', 'https://azure.microsoft.com/en-us/case-studies/', 'https://azure.microsoft.com/en-us/products/', 'https://azure.microsoft.com/en-us/products/virtual-machines/', 'https://azure.microsoft.com/en-us/products/virtual-desktop/', 'https://azure.microsoft.com/en-us/products/azure-sql/', 'https://azure.microsoft.com/en-us/products/cosmos-db/', 'https://azure.microsoft.com/en-us/products/kubernetes-service/', 'https://azure.microsoft.com/en-us/products/ai-services/', 'https://azure.microsoft.com/en-us/products/app-service/', 'https://azure.microsoft.com/en-us/services/playfab/', 'https://azu

In [3]:
# Words that are never indexed. Entries are lowercase because tokens are
# lowercased before being compared against this set.
stop_words = {'',' ','a', 'an', 'the','i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
 'in', 'to', 'for', 'with', 'on', 'at', 'by', 'from', 'up', 'off', 'about', 'into', 'over', 'after',
 'and', 'but', 'or', 'as', 'if', 'when', 'than', 'because', 'while', 'where','be', 'have', 'do', 'is', 'am', 'are', 'was', 'were', 'being', 'been',
 'some', 'such', 'only', 'own', 'so', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'}

In [4]:
def contains_forbidden_chars(word):
    """Tell whether ``word`` contains one of ``$ # [ ] . /``.

    The scan stops at the first offending character, which is reported on
    stdout together with the word.

    Returns:
        True when a forbidden character is present, otherwise False.
    """
    forbidden_chars = {'$', '#', '[', ']', '.', '/'}
    # First forbidden character in reading order, or None if the word is clean.
    char = next((c for c in word if c in forbidden_chars), None)
    if char is None:
        return False
    print(f"found char {char} in word{word}")
    return True

def index_words(index, html_link, soup,stemmer):
  """Add the words of one page to the inverted index.

  Args:
    index: dict mapping a stemmed word to
      ``{"count": int, "links": [url, ...], "links_counter": [int, ...]}``;
      ``links_counter[i]`` is the number of occurrences on ``links[i]``.
      Updated in place.
    html_link: URL of the page being indexed.
    soup: Parsed page exposing ``get_text()``, or ``None`` for a page that
      could not be downloaded (nothing is indexed in that case).
    stemmer: Object exposing ``stem(word)``.

  Returns:
    The same ``index`` object, for convenient chaining.
  """
  if soup is None:
    # Failed downloads are stored as None by the crawler; nothing to index.
    return index

  for token in re.findall(r'\w+', soup.get_text()):
    token = token.lower()
    # Test the un-stemmed token first: stemming can rewrite a stop word into
    # a form that is no longer in the stop list.
    if token in stop_words:
      continue
    word = stemmer.stem(token)
    if word in stop_words or contains_forbidden_chars(word):
      continue

    entry = index.get(word)
    if entry is None:
      index[word] = {"count": 1, "links": [html_link],"links_counter":[1]}
      continue

    entry["count"] += 1
    try:
      # Single scan: locate the page's slot in the parallel lists.
      position = entry["links"].index(html_link)
    except ValueError:
      entry["links"].append(html_link)
      entry["links_counter"].append(1)
    else:
      entry["links_counter"][position] += 1
  return index

def create_index(url):
  """Crawl ``url`` and build a word index over every page that was downloaded.

  Args:
    url: Start address handed to ``fetch_page_and_sublinks``.

  Returns:
    dict produced by repeatedly applying ``index_words`` to each downloaded
    page, in the order the crawler returned them.
  """
  stemmer = PorterStemmer()
  pages = fetch_page_and_sublinks(url)
  index = {}
  for html_link, html_data in pages.items():
    if html_data is None:
      # The crawler stores None for pages it could not download; there is no
      # text to index, and calling get_text() on None would raise.
      continue
    index = index_words(index, html_link, html_data, stemmer)
  return index

In [None]:
# Build the word index for the Azure site, starting from its landing page.
url = 'https://azure.microsoft.com/en-us'
index = create_index(url)

# Nest the index under a single top-level "index" key before serialising.
index_with_name = dict(index=index)

# Destination of the JSON dump (relative to the working directory).
file_path = 'data.json'

# Serialise first, then write the resulting text in one go.
with open(file_path, mode='w', encoding='utf-8') as f:
    f.write(json.dumps(index_with_name, ensure_ascii=False, indent=4))
