# Extracting and Converting Articles about LoRa and LoRaWAN from The Things Network

This notebook demonstrates the process of extracting and converting articles from The Things Network website into Markdown format and saving them locally.

In [1]:
import requests
import os
import time
import re
from typing import Optional
from typing import Tuple
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag, NavigableString
from markdownify import markdownify as md

### Configuring Directories and Constants
Setting up the URL of the host, base directory for saving articles, and delay between requests.

In [2]:
# Host URL of The Things Network, base directory where the articles are saved,
# and the delay (in seconds) to wait after every request.
host = "https://www.thethingsnetwork.org/"
base_dir = "data/articles/"
delay_seconds = 4

# exist_ok=True creates the directory tree only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(base_dir, exist_ok=True)

### Get Article Page
This function retrieves the article page from a given URL and returns the HTTP status code and the response text as a tuple.

In [3]:
# Function to get the article page from a URL and return its status code and response text
def get_article_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[int, str]:
    """Fetch ``url`` with a GET request and return ``(status_code, text)``.

    Args:
        url: Address of the page to download.
        delay_seconds: Seconds to sleep after the request has completed, used
            to throttle consecutive calls.
        headers: Request headers. When ``None``, a browser-like default set is
            sent.
        encoding: Encoding used to decode the body; a falsy value keeps the
            encoding detected by ``requests``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A tuple of the integer HTTP status code and the decoded response body.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection errors, timeouts, ...).
    """
    if headers is None:
        # Browser-like defaults. Deliberately NOT included:
        #   * If-Modified-Since / If-None-Match: fixed cache validators can make
        #     the server answer "304 Not Modified" with an empty body.
        #   * Cookie: a session-specific token does not belong in source code.
        #   * Accept-Encoding: left to requests, which only advertises the
        #     compression schemes it is able to decode.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "es-419,es;q=0.7",
            "Cache-Control": "max-age=0",
            "Priority": "u=0, i",
            "Sec-Ch-Ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Brave";v="126"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"macOS"',
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Sec-Gpc": "1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Pause after every request so consecutive calls do not hammer the server.
    time.sleep(delay_seconds)
    if encoding:
        # Must be set before reading .text, which decodes with this encoding.
        response.encoding = encoding
    return response.status_code, response.text


# Get the response text from a URL, raising an exception if the status code is not 200
def get_response_from_url(url: str, delay_seconds: int = 30) -> str:
    """Return the body of ``url`` or raise when the request did not succeed.

    Args:
        url: Address of the page to download.
        delay_seconds: Delay forwarded to ``get_article_page``.

    Returns:
        The response text of the page.

    Raises:
        Exception: If the status code is anything other than 200. The message
            includes the status code so the failure can be diagnosed.
    """
    status_code, response = get_article_page(url, delay_seconds)
    if status_code != 200:
        raise Exception(
            f"Failed to get response from {url} (status code {status_code})"
        )
    return response

### Function to Get URLs of Articles
This function retrieves all article URLs from the host page and returns them as a list.

In [4]:
# Get the URLs of the articles listed in the menu of the host page
def get_articles_url(base_url):
    """Collect the absolute URLs linked from the page's ``menu-list`` menu.

    Args:
        base_url: URL of the page whose ``<ul class="menu-list">`` is read;
            also the base against which relative links are resolved.

    Returns:
        A list of absolute URLs in document order, without duplicates.

    Raises:
        ValueError: If the page has no ``<ul class="menu-list">`` element.
        Exception: Propagated from ``get_response_from_url`` when the page
            cannot be downloaded.
    """
    response = get_response_from_url(base_url, delay_seconds)
    soup = BeautifulSoup(response, "html.parser")
    menu_container = soup.find("ul", class_="menu-list")
    if menu_container is None:
        raise ValueError(f"No <ul class='menu-list'> element found at {base_url}")
    urls = [
        urljoin(base_url, a["href"])
        for link_container in menu_container.find_all("li")
        # href=True skips anchors without a target instead of raising KeyError.
        for a in link_container.find_all("a", href=True)
    ]
    # A link inside a nested <li> is reached once per enclosing <li>;
    # dict.fromkeys removes the repeats while keeping the original order.
    return list(dict.fromkeys(urls))

In [5]:
# Build the address of the LoRaWAN documentation entry page, collect every
# article link from its menu, and display the collected list
dir_url = host + "docs/lorawan/what-is-lorawan/"
urls = get_articles_url(dir_url)
print(urls)

['https://www.thethingsnetwork.org/docs/lorawan/what-is-lorawan/', 'https://www.thethingsnetwork.org/docs/lorawan/architecture/', 'https://www.thethingsnetwork.org/docs/lorawan/regional-parameters/', 'https://www.thethingsnetwork.org/docs/lorawan/lorawan-relay/', 'https://www.thethingsnetwork.org/docs/lorawan/message-types/', 'https://www.thethingsnetwork.org/docs/lorawan/security/', 'https://www.thethingsnetwork.org/docs/lorawan/classes/', 'https://www.thethingsnetwork.org/docs/lorawan/end-device-activation/', 'https://www.thethingsnetwork.org/docs/lorawan/spreading-factors/', 'https://www.thethingsnetwork.org/docs/lorawan/adaptive-data-rate/', 'https://www.thethingsnetwork.org/docs/lorawan/limitations/', 'https://www.thethingsnetwork.org/docs/lorawan/frequencies-by-country/', 'https://www.thethingsnetwork.org/docs/lorawan/frequency-plans/', 'https://www.thethingsnetwork.org/docs/lorawan/duty-cycle/', 'https://www.thethingsnetwork.org/docs/lorawan/glossary/', 'https://www.thethingsnet

### Clean HTML content
We need to clean the extracted HTML content so that we keep only the information necessary for our project.

In [6]:
# Function to clean the HTML
def clean_html(soup):
    """Strip non-article markup from a parsed docs page, in place.

    Inside ``<div class="content">`` this removes images, ``header-hash``
    anchors and ``<em>`` elements, replaces iframes with a ``Video: <src>``
    line, removes the "questions" heading together with its ordered list, and
    flattens every paragraph to plain text. Outside of that container it
    normalises the ``<h1 class="title">`` text and removes every
    ``<div class="is-clearfix">``.

    Args:
        soup: BeautifulSoup node (document or tag) to clean; it is modified.

    Returns:
        The same ``soup`` object that was passed in.
    """
    content_container = soup.find("div", class_="content")
    if content_container:
        for img_content in content_container.find_all("img"):
            img_content.decompose()
        for header_hash in content_container.find_all("a", class_="header-hash"):
            header_hash.decompose()
        # NOTE(review): this drops every <em> in the article, not only figure
        # captions - confirm that no emphasised body text has to be kept.
        for figure_description in content_container.find_all("em"):
            figure_description.decompose()
        for iframe in content_container.find_all("iframe"):
            src = iframe.get("src")
            iframe.replace_with(f"Video: {src}\n")

        # Remove the questions section. The list is looked up relative to the
        # "questions" heading so that an unrelated ordered list elsewhere in
        # the article is never emptied by mistake.
        question_title = content_container.find(id="questions")
        if question_title:
            question_list = question_title.find_next("ol")
            # find_next scans the rest of the document, so make sure the list
            # it found still belongs to the content container.
            if question_list is not None and any(
                parent is content_container for parent in question_list.parents
            ):
                question_list.decompose()
            question_title.decompose()

        # Replace each paragraph with its plain text (space between fragments)
        for paragraph in content_container.find_all("p"):
            new_text = paragraph.get_text(" ", strip=True)
            paragraph.replace_with(new_text)

    # Keep the title as a real <h1> with normalised text so the Markdown
    # converter emits the heading itself; inserting a literal "# ..." string
    # would be treated as plain text and may be escaped by the converter.
    title = soup.find("h1", class_="title")
    if title:
        title.string = title.get_text(" ", strip=True)

    # Remove the clearfix containers
    for clearfix_container in soup.find_all("div", class_="is-clearfix"):
        clearfix_container.decompose()

    # Return the cleaned tree
    return soup

### Convert HTML content to Markdown and extract the information
Now we need to convert the HTML content to Markdown format using the Markdownify library. First we will create the function for the conversion and then we will extract the information from the web page and save it in "md" files.

In [7]:
# Function to convert HTML content to Markdown
def convert_to_markdown(html_content):
    """Convert an HTML string to Markdown and tidy up its whitespace.

    Args:
        html_content: HTML markup as a string.

    Returns:
        Markdown text with ATX-style headings, no trailing spaces or tabs at
        line ends, at most one blank line in a row, and no leading or trailing
        whitespace.
    """
    markdown_text = md(html_content, heading_style="ATX")
    # Strip trailing blanks first: whitespace-only lines then become empty
    # lines, so the collapse below also catches the runs they create.
    markdown_text = re.sub(r"[ \t]+\n", "\n", markdown_text)
    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
    return markdown_text.strip()

In [8]:
# Function to extract information from the URL
def extract_information(url):
    """Download one article page and return its content as Markdown.

    The page is fetched, its ``<div class="docs-content">`` is cleaned with
    ``clean_html`` and converted with ``convert_to_markdown``.

    Returns:
        The Markdown text on success, ``"Error: No content found"`` when the
        page has no ``docs-content`` container, or ``"Error: <exception>"``
        when a ``requests`` exception is raised. Any other exception
        propagates to the caller.
    """
    try:
        page_html = get_response_from_url(url, delay_seconds)
        parsed_page = BeautifulSoup(page_html, "html.parser")
        docs_section = parsed_page.find("div", class_="docs-content")
        if not docs_section:
            return "Error: No content found"
        cleaned_section = clean_html(docs_section)
        return convert_to_markdown(str(cleaned_section))
    except requests.exceptions.RequestException as e:
        return f"Error: {e}"

In [9]:
# Function to save the content to a markdown file
def create_markdown_file(content, filename):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        content: Text to write.
        filename: Path of the file to create.
    """
    # Pin the encoding: without it the platform default is used, which can
    # fail or garble non-ASCII characters on systems that do not default to
    # UTF-8.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)

In [10]:
# Extract information from the URLs and save each article to its own markdown file
for idx, url in enumerate(urls):
    # Last path segment of the URL, whether or not it ends with a slash
    url_title = url.rstrip("/").rsplit("/", 1)[-1]
    path = f"{base_dir}{idx+1}-{url_title}.md"
    # Check before downloading so already-saved articles cost neither a
    # request nor the delay that follows it.
    if os.path.exists(path):
        continue
    content = extract_information(url)
    # extract_information reports failures as "Error: ..." strings; saving
    # them would create a bogus article that is skipped on every later run.
    if content.startswith("Error:"):
        print(f"Skipping {url}: {content}")
        continue
    print(path)
    create_markdown_file(content, path)

In [16]:
# Path of the single markdown file that gathers the content of every article
combined_file_path = f"{base_dir}lorawan-information.md"

# Open in write mode so that re-running the cell replaces the file instead of
# appending a second copy; UTF-8 keeps non-ASCII characters intact everywhere.
with open(combined_file_path, "w", encoding="utf-8") as file:
    for url in urls:
        content = extract_information(url)
        # extract_information reports failures as "Error: ..." strings; keep
        # them out of the combined document.
        if content.startswith("Error:"):
            print(f"Skipping {url}: {content}")
            continue
        # Remove a leading backslash (presumably an escaped "#" in front of
        # the title heading - TODO confirm)
        content = content.lstrip("\\")
        # The converted text has no trailing newline, so add a blank line to
        # keep one article from running into the next one's heading.
        file.write(f"{content}\n\n")