In [3]:
import json
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, urljoin
import re
import io
import requests
from docx import Document
import pytesseract
import pdf2image
from langchain.text_splitter import TokenTextSplitter
from sentence_transformers import SentenceTransformer



In [4]:
# Root of the site to crawl. crawl() compares every candidate URL's host
# against this one and ignores anything hosted elsewhere.
base_url = "https://insea.ac.ma"

# URLs that crawl() has already handled. Shared, module-level state: it keeps
# its contents between cell executions until this cell is run again.
visited_urls = set()

In [5]:
def cleanText(text):
    """Replace line breaks, tabs and form feeds with spaces and trim the ends.

    Each newline, tab, carriage return and form feed becomes exactly one
    space (runs of whitespace are not collapsed); leading and trailing
    whitespace is then stripped.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    # extract_content_from_document() returns None for content types it does
    # not support; pass that through instead of raising AttributeError.
    if text is None:
        return None
    return text.replace("\n", " ").replace("\t", " ").replace("\r", " ").replace("\f", " ").strip()

In [6]:
def extract_content_from_document(url, timeout=30):
    """Download the document at ``url`` and return the text it contains.

    PDF responses are converted to images with pdf2image and each image is
    OCR'd with pytesseract; Word (.docx) responses are read with python-docx.
    The format is chosen from the response's Content-Type header, not from
    the URL.

    Args:
        url: URL of the document to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text (OCR results / paragraphs joined by newlines), or
        ``None`` when the content type is neither PDF nor Word, or is missing.

    Raises:
        Whatever ``requests.get`` or the parsing libraries raise; nothing is
        caught here.
    """
    # Without a timeout a single unresponsive server stalls the caller forever.
    response = requests.get(url, timeout=timeout)
    # The header may be absent, and its value may be written in any letter
    # case, so normalise before comparing.
    content_type = (response.headers.get('content-type') or '').lower()

    if 'application/pdf' in content_type:
        # Render the PDF to images, then OCR each image in order.
        images = pdf2image.convert_from_bytes(response.content)
        return '\n'.join(pytesseract.image_to_string(image) for image in images)

    if 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in content_type:
        # python-docx reads from a file-like object, hence the BytesIO wrapper.
        doc = Document(io.BytesIO(response.content))
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    # Unsupported or unknown content type.
    return None

In [7]:
def extract_information(url, timeout=30):
    """Scrape one page: its paragraph text plus the PDF/Word files it links to.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A ``(web_page_info, documents_info)`` tuple.

        ``web_page_info`` is a dict with the keys ``url``, ``title``, ``text``
        (all ``<p>`` contents joined by spaces) and ``source_type`` (always
        ``"webpage"``).

        ``documents_info`` is a list with one dict per distinct linked
        ``.pdf`` / ``.docx`` file, using the same keys; ``title`` is the
        linking page's title, ``source_type`` is the upper-cased extension,
        and ``text`` is ``None`` when the document could not be read.

        If the page itself cannot be fetched or parsed, the error is printed
        and ``(None, None)`` is returned.
    """
    web_page_info = {
        "url": url,
        "title": None,
        "text": None,
        "source_type": "webpage",
    }
    documents_info = []

    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        title = soup.title.string if soup.title else None

        # Collapse whitespace inside each paragraph, then join the paragraphs.
        paragraphs = soup.find_all('p')
        web_text = ' '.join(re.sub(r'\s+', ' ', p.get_text().strip()) for p in paragraphs)

        web_page_info["title"] = title
        web_page_info["text"] = cleanText(web_text)

        seen_documents = set()
        for link in soup.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue

            # A relative href is relative to the page it appears on, not to
            # the site root.
            absolute_link_url = urljoin(url, link_url)

            # Test the URL *path* so that query strings, fragments and
            # upper-case extensions (".PDF") do not hide a document.
            path = urlparse(absolute_link_url).path.lower()
            if not path.endswith(('.pdf', '.docx')):  # You can add more extensions
                continue

            # The same file is often linked several times on one page;
            # downloading and OCR-ing it once is enough.
            if absolute_link_url in seen_documents:
                continue
            seen_documents.add(absolute_link_url)

            # A document that fails to download or parse must not discard the
            # page record or the other documents, so handle it locally.
            try:
                raw_text = extract_content_from_document(absolute_link_url)
                document_text = cleanText(raw_text) if raw_text is not None else None
            except Exception as e:
                print(f"An error occurred while processing {absolute_link_url}: {e}")
                document_text = None

            documents_info.append({
                "url": absolute_link_url,
                "title": title,
                "text": document_text,
                "source_type": path.rsplit('.', 1)[-1].upper(),
            })

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        web_page_info = None
        documents_info = None

    return web_page_info, documents_info

In [8]:
def crawl(url, timeout=30):
    """Crawl the site depth-first from ``url``, collecting page and document records.

    Every reachable page on the same host as ``base_url`` is fetched once.
    For each page, ``extract_information`` is called and every returned
    record with non-empty ``text`` is appended to the module-level ``data``
    list. Visited URLs are recorded in the module-level ``visited_urls`` set,
    so URLs already in that set are skipped. Errors on a page are printed and
    the crawl continues with the remaining pages.

    Args:
        url: URL to start crawling from.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        None. Results are accumulated in ``data`` and ``visited_urls``.
    """
    # An explicit stack replaces recursion: following links recursively adds
    # one Python frame per hop and hits the interpreter's recursion limit on
    # any site with long link chains.
    pending = [url]

    while pending:
        current = pending.pop()
        try:
            # "#fragment" only addresses a position inside a page; drop it so
            # the same page is not fetched once per anchor.
            current = urlparse(current)._replace(fragment='').geturl()

            # Skip this URL if it has already been visited
            if current in visited_urls:
                continue

            # Skip this URL if it is hosted on a different domain than base_url
            if urlparse(current).netloc != urlparse(base_url).netloc:
                continue

            # Mark this URL as visited
            visited_urls.add(current)

            r = requests.get(current, timeout=timeout)
            r.raise_for_status()

            # Linked PDFs, images, etc. are not pages: parsing them as HTML
            # is wasted work. A missing header is treated as HTML.
            content_type = (r.headers.get('content-type') or '').lower()
            if content_type and 'html' not in content_type:
                continue

            soup = BeautifulSoup(r.text, 'html.parser')

            # NOTE: extract_information() issues its own request for the same
            # URL; the response fetched above is only used to discover links.
            web_page_info, documents_info = extract_information(current)
            if web_page_info and web_page_info["text"]:
                data.append(web_page_info)
            for document_info in documents_info or []:
                if document_info["text"]:
                    data.append(document_info)

            # Resolve each href against the page it was found on.
            links = []
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href:
                    continue
                try:
                    links.append(urljoin(current, href))
                except ValueError:
                    # Malformed href; ignore it and keep the page's other links.
                    continue

            # Push in reverse so links are popped in document order, which
            # keeps the depth-first visiting order of a recursive crawl.
            pending.extend(reversed(links))

        except Exception as e:
            print(f"An error occurred while processing {current}: {e}")

In [None]:
# Start every run from a clean slate. visited_urls is defined in an earlier
# cell and keeps its contents between executions of this one; without
# clearing it, a re-run would find base_url already visited, crawl() would
# return immediately, and `data` would stay empty.
visited_urls.clear()

# Records collected by crawl(): one dict per page or document with text.
data = []
crawl(base_url)