## Solution-1

In [2]:
from bs4 import BeautifulSoup
import requests

url = "https://aimsammi.org/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extraire les descriptions dans la section "About"
about_section = soup.select_one('div#about-section')
if about_section:
    description = about_section.get_text(strip=True)
    print("About Description:", description)

# Extraire les titres des nouvelles
news_items = soup.select('div.news-item h2')
for news in news_items:
    print("News Title:", news.get_text(strip=True))

## Solution-2

In [13]:
import requests
from bs4 import BeautifulSoup

# Liste des URLs à analyser
urls = [
    'https://aimsammi.org/',
    'https://aimsammi.org/about-ammi-2/',
    'https://aimsammi.org/admission-2/',
    'https://aimsammi.org/events/',
    'https://aimsammi.org/blog-2/',
]

# Fonction pour analyser une page
def analyse_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Vérifie si la requête a réussi
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')

        print(f"Analyse de la page: {url}")

        # Titre de la page
        title = soup.title.string if soup.title else "Pas de titre"
        print("Titre de la page :", title)

        # Headers
        headers = soup.find_all(['h1', 'h2', 'h3'])
        for header in headers:
            print(header.name, header.text.strip())

        # Paragraphes
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            print(p.text.strip())

        # Liens et images
        links = soup.find_all('a')
        for link in links:
            print("Lien :", link.get('href'), "Texte :", link.text.strip())

        images = soup.find_all('img')
        for img in images:
            print("Image :", img.get('src'), "Alt text :", img.get('alt'))

        print("\n")  # Ligne vide pour séparer les analyses des pages
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de l'accès à {url}: {e}")

# Parcours de toutes les pages et analyse
for url in urls:
    analyse_page(url)


Analyse de la page: https://aimsammi.org/
Titre de la page : Home Page - African Masters Of Machine Intelligence
h1 Home Page
h1 Become a Machine Intelligence Expert
h2 Improve the World
h2 In Partnership with
h2 Welcome to the African Master’s in Machine Intelligence (AMMI)
h3 Latest News
h3 Upcoming Events
AMMI is a novel fully funded one-year intensive graduate program that provides brilliant young Africans with state-of-the-art training in machine learning and its applications.
The AMMI program prepares well rounded machine intelligence researchers who respond to both present and future needs of Africa and the world.
We invite all interested students to apply.
The African Institute for Mathematical Sciences (AIMS) is a Pan-African network of centers of excellence for post-graduate training, research and public engagement in mathematical sciences. We enable Africa’s brightest students to become innovators that propel scientific, educational and economic self-sufficiency.

The Africa

## Solution-3

In [15]:
import requests
from bs4 import BeautifulSoup
import json

# List of URLs to analyze
urls = [
    'https://aimsammi.org/',
    'https://aimsammi.org/about-ammi-2/',
    'https://aimsammi.org/admission-2/',
    'https://aimsammi.org/events/',
    'https://aimsammi.org/blog-2/',
]

# Function to analyze a page and convert it to JSON
def analyze_page_to_json(url):
    page_data = {
        'url': url,
        'title': '',
        'headers': [],
        'paragraphs': [],
        'links': [],
        'images': []
    }
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Title of the page
        page_data['title'] = soup.title.string if soup.title else "No title"

        # Headers
        headers = soup.find_all(['h1', 'h2', 'h3'])
        for header in headers:
            page_data['headers'].append({
                'tag': header.name,
                'text': header.text.strip()
            })

        # Paragraphs
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            page_data['paragraphs'].append(p.text.strip())

        # Links
        links = soup.find_all('a')
        for link in links:
            page_data['links'].append({
                'href': link.get('href'),
                'text': link.text.strip()
            })

        # Images
        images = soup.find_all('img')
        for img in images:
            page_data['images'].append({
                'src': img.get('src'),
                'alt': img.get('alt')
            })

    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")

    return page_data

# Analyze all pages and save results in JSON format
all_pages_data = []

for url in urls:
    page_data = analyze_page_to_json(url)
    all_pages_data.append(page_data)

# Save the results to a JSON file
with open('website_structure.json', 'w') as json_file:
    json.dump(all_pages_data, json_file, indent=4)

print("Website structure has been saved to 'website_structure.json'")

Website structure has been saved to 'website_structure.json'


## Solution-4

In [21]:
from bs4 import BeautifulSoup
import requests
from langchain_community.chat_models import ChatOllama

# Function to fetch and parse HTML content from a URL
def fetch_html_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

# Function to initialize the Ollama model
def initialize_model(model_name):
    try:
        return ChatOllama(model=model_name)
    except Exception as e:
        print(f"Error initializing the model: {e}")
        return None

# Function to extract and preprocess HTML content
def preprocess_html_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Extract text from specific tags or entire content
    return soup.get_text(separator='\n', strip=True)

# Function to create a prompt for the Ollama model
def create_prompt(html_text):
    prompt = """
    By analyzing this file, propose a prompt that I can use for extracting all relevant information 
    for a RAG system using this website as an external source.
    """
    # Limit the length of text if necessary to fit within model constraints
    return f"{prompt}\n\nHTML Content:\n{html_text[:10000]}"

# Function to query the model and handle response
def query_model(model, prompt):
    try:
        response = model(prompt)
        return response
    except Exception as e:
        print(f"An error occurred during model query: {e}")
        return None

# Main function to orchestrate the process
def main(url, model_name):
    html_content = fetch_html_content(url)
    if html_content:
        text_content = preprocess_html_content(html_content)
        full_prompt = create_prompt(text_content)
        model = initialize_model(model_name)
        if model:
            response = query_model(model, full_prompt)
            if response:
                print(response)
            else:
                print("Failed to get a response from the model.")
        else:
            print("Model initialization failed.")
    else:
        print("Failed to fetch or parse HTML content.")

# URL of the website and model name
url = "https://aimsammi.org/"
model_name = "mistral"

# Run the main function
main(url, model_name)



An error occurred during model query: Received unsupported message type for Ollama.
Failed to get a response from the model.
