<a href="https://colab.research.google.com/github/kkech/ITC6008_MidTerm/blob/main/MidTermSearchEngines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from sentence_transformers import SentenceTransformer, util
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

def crawl_and_save(base_url, folder_name, max_depth=1, relevance_threshold=0.7):
    visited = set()
    document_count = 0

    domain = base_url.split("//")[1].split("/")[0]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }


    print("Loading bert model...")
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    covid_context = "COVID-19 pandemic information, coronavirus symptoms, prevention measures, and outbreak statistics."
    covid_embedding = model.encode(covid_context, convert_to_tensor=True)

    def crawl(url, depth):
        nonlocal document_count
        if depth > max_depth or url in visited:
            return
        visited.add(url)

        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.get_text(separator=' ', strip=True)

                sentences = nltk.sent_tokenize(page_text)

                relevant_sentences = []
                for sentence in sentences:
                    sentence_embedding = model.encode(sentence, convert_to_tensor=True)
                    similarity = util.pytorch_cos_sim(sentence_embedding, covid_embedding).item()
                    if similarity >= relevance_threshold:
                        relevant_sentences.append(sentence)

                if relevant_sentences:
                    print(f"Relevant content found on: {url}")  # Print only if relevant
                    document_count += 1
                    file_name = f"document_{document_count}.txt"
                    file_path = os.path.join(folder_name, file_name)

                    with open(file_path, "w", encoding="utf-8") as file:
                        file.write(f"url: {url}\n\n")
                        file.write("\n".join(relevant_sentences))

                for link in soup.find_all('a', href=True):
                    full_url = urljoin(url, link['href'])
                    if domain in full_url and re.match(r'^https?:\/\/', full_url):
                        crawl(full_url, depth + 1)
        except Exception as e:
            print(f"Error crawling {url}: {e}")

    os.makedirs(folder_name, exist_ok=True)
    crawl(base_url, depth=0)
    return document_count

# URLs to crawl
who_url = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019"
un_url = "https://www.un.org/en/coronavirus"

print("Crawling who website...")
who_folder = "WHO_Crawled_Data"
who_document_count = crawl_and_save(who_url, who_folder, max_depth=2, relevance_threshold=0.75)
print(f"who crawling completed. Total documents: {who_document_count}")

print("Crawling un website...")
un_folder = "UN_Crawled_Data"
un_document_count = crawl_and_save(un_url, un_folder, max_depth=2, relevance_threshold=0.75)
print(f"UN crawling completed. Total documents: {un_document_count}")

total_documents = who_document_count + un_document_count
print(f"Total documents: {total_documents}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Crawling who website...
Loading bert model...
Relevant content found on: https://www.who.int/emergencies/diseases/novel-coronavirus-2019
Relevant content found on: https://www.who.int/emergencies/diseases/novel-coronavirus-2019#content
Relevant content found on: https://www.who.int/mega-menu/health-topics/popular/mental-disorders
Relevant content found on: https://www.who.int/mega-menu/emergencies/emergencies/coronavirus-disease-(covio-19)
Relevant content found on: https://www.who.int/mega-menu/emergencies/who-in-emergencies/operations
Relevant content found on: https://www.who.int/mega-menu/emergencies/who-in-emergencies/partners
Relevant content found on: https://www.who.int/mega-menu/data/dashboards/covid-19-dashboard
Relevant content found on: https://www.who.int/mega-menu/data/data-collection/civil-registration-and-vital-statistics
Relevant content found on: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/covid-19-policy-briefs
Relevant content found on: https://w

In [4]:
import requests
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

def extract_text_from_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract main text content
        text = soup.get_text(separator=' ', strip=True)
        return text
    else:
        print(f"Failed to fetch the url: {url}")
        return None

def preprocess_text(text):

    text = ' '.join(text.split())

    stop_words = set(stopwords.words('english'))

    nltk.download('wordnet')
    lemmatizer = nltk.stem.WordNetLemmatizer()

    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalnum() and word not in stop_words and len(word) > 2
    ]

    domain_specific_noise = {'read', 'download', 'page', 'section'}
    filtered_tokens = [word for word in filtered_tokens if word not in domain_specific_noise]

    return ' '.join(filtered_tokens), filtered_tokens

def get_top_tfidf_words(text, num_words=20):

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    word_scores = list(zip(feature_names, tfidf_scores))
    sorted_word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

    return [word for word, score in sorted_word_scores[:num_words]]

# Function to create a vector representation
def vectorize_text(tokens, vocabulary):
    word_counts = Counter(tokens)
    vector = [word_counts.get(word, 0) for word in vocabulary]

    return vector

# URL to process
url = "https://www.who.int/health-topics/coronavirus#tab=tab_1"

print("Extracting text from the url...")
text = extract_text_from_url(url)
if text:
    print("\nTEXT EXTRACTED:")
    print(text[:1000], "...")

    print("\nPreprocessing text...")
    cleaned_text, tokens = preprocess_text(text)

    # Step 3: Compute TF-IDF and select top words
    print("\nSelecting top words using TF-IDF...")
    top_words = get_top_tfidf_words(cleaned_text, num_words=20)
    print(f"Top 20 Words (TF-IDF): {top_words}")

    # Step 4: Create vector representation
    print("\nCreating vector representation")
    vector = vectorize_text(tokens, top_words)
    print(f"Vector representation: {vector}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Extracting text from the url...

TEXT EXTRACTED:
Coronavirus Skip to main content Global Regions WHO Regional websites Africa Americas South-East Asia Europe Eastern Mediterranean Western Pacific When autocomplete results are available use up and down arrows to review and enter to select. Select language Select language English العربية 中文 Français Русский Español Donate Donate Home Health Topics All topics A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Resources Fact sheets Facts in pictures Multimedia Podcasts Publications Questions and answers Tools and toolkits Popular Dengue Endometriosis Excessive heat Herpes Mental disorders Mpox Countries All countries A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Regions Africa Americas Europe Eastern Mediterranean South-East Asia Western Pacific WHO in countries Data by country Country presence Country strengthening Country cooperation strategies Newsroom All news News releases Statements Campaigns Events Feature stories Press confe

[nltk_data] Downloading package wordnet to /root/nltk_data...



Selecting top words using TF-IDF...
Top 20 Words (TF-IDF): ['2024', 'health', 'disease', 'october', 'group', 'coronavirus', 'symptom', 'country', 'emergency', 'epidemiological', 'pandemic', 'report', 'vaccine', 'advisory', 'news', 'september', 'virus', 'global', 'home', 'people']

Creating vector representation
Vector representation: [28, 21, 13, 12, 11, 9, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5]
