# LoRa and LoRaWAN FAQ Data Extraction and Conversion

This notebook extracts frequently asked questions and answers (FAQs) about LoRa and LoRaWAN from the Semtech website and saves them in a JSON and CSV file.

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import csv
import os
import time
import re
from typing import Optional
from typing import Tuple
from urllib.parse import urljoin
import pandas as pd

### Directory and Constant Configuration
We configure the host URL, the base directory to save the questions and answers, and the delay time between requests.

In [7]:
# LoRa FAQ Host Page URL, base directory for saving Questions and Answers, and delay between requests
host = "https://www.semtech.com/"
base_dir = "data/faq/"
delay_seconds = 4

if not os.path.exists(base_dir):
    os.makedirs(base_dir)

### Feature to Get FAQ Page
This function gets the article page from a URL and returns the status code and response text as a string tuple.

In [3]:
# Function to get the article page from URL and return status code and response text as tuple of strings
def get_faq_page(
    url: str,
    delay_seconds: int = 30,
    headers: Optional[dict[str, str]] = None,
    encoding: str = "utf-8",
    timeout: int = 30,
) -> Tuple[str, str]:
    if headers is None:
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "es-419,es;q=0.8",
            "Cache-Control": "max-age=0",
            "Cookie": "exp_csrf_token=2f0dbf3dbe7554a8237498d79968436d3f2c8a0e; exp_tracker=%7B%220%22%3A%22assets%2Fbootstrap%2Fjs%2Fpopper.min.js.map%22%2C%221%22%3A%22design-support%2Ffaq%22%2C%222%22%3A%22design-support%2Ffaq%2Fcloud-services%22%2C%223%22%3A%22design-support%2Ffaq%2Ffaq-lora%22%2C%224%22%3A%22design-support%2Ffaq%2Ffaq-lorawan%22%2C%22token%22%3A%229c98ad48092b95092b13c2f834c421efae9be878cb8f1aab7774139f3af2e8c99d6f43b4768af302d22b212d210becc1%22%7D",
            "If-Modified-Since": "Mon, 08 Jul 2024 02:31:31 GMT",
            "Priority": "u=0, i",
            "Referer": "https://www.semtech.com/",
            "Sec-Ch-Ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Brave";v="126"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"macOS"',
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Sec-Gpc": "1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        }
    # Get Response from URL and return status code and response text
    response = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(delay_seconds)
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text


# Get Response from URL with raise exception if status code is not 200
def get_response_from_url(url: str, delay_seconds: int = 30) -> str:
    status_code, response = get_faq_page(url, delay_seconds)
    if status_code != 200:
        raise Exception(f"Failed to get response from {url}")
    return response

### Get All Pagination Links
This function gets all pagination links from the main FAQ page.

In [4]:
# Get all pages links of pagination from the host page
def get_all_pages(url: str) -> list[str]:
    response = get_response_from_url(url)
    soup = BeautifulSoup(response, "html.parser")
    # Get the first div component with class pagination
    pagination = soup.find("div", class_="pagination")
    # Get all links from the pagination div
    links_container = pagination.find("div")
    links = links_container.find_all("a")
    # Get the href attribute from each link and return a list of links
    return [link["href"] for link in links] 

In [5]:
# Get URLs from the host page and save them to an array
dir_url = f"{host}design-support/faq"
urls = get_all_pages(dir_url)
print(urls)

['https://www.semtech.com/design-support/faq', 'https://www.semtech.com/design-support/faq/P20', 'https://www.semtech.com/design-support/faq/P40', 'https://www.semtech.com/design-support/faq/P60', 'https://www.semtech.com/design-support/faq/P80', 'https://www.semtech.com/design-support/faq/P100', 'https://www.semtech.com/design-support/faq/P120']


### Extract Data from Questions and Answers
This function extracts the questions and answers from a FAQ page and we also store all the FAQs in a variable so we can later use this when saving it in json format

In [6]:
def extract_data(soup, url):
    faqs_list = []
    faq_item = soup.select(".faq-item")
    for item in faq_item:
        question_tag = item.select_one(".question a")
        answer_tag = item.select_one(".answer")
        if question_tag and answer_tag:
            question = question_tag.get_text(strip=True).replace('\xa0', ' ')
            answer = " ".join(
                p.get_text(strip=True).replace('\xa0', ' ') for p in answer_tag.find_all("p")
            )
            faqs_list.append({"url": url, "question": question, "answer": answer})
    return faqs_list

In [7]:
all_faqs = []

for url in urls:
    response = get_response_from_url(url)
    soup = BeautifulSoup(response, "html.parser")
    faq_list = extract_data(soup, url)
    all_faqs.extend(faq_list)


In [8]:
print(f"Total FAQs: {len(all_faqs)}")

Total FAQs: 133


### Save and Load FAQ to JSON
This functions saves and load the FAQ from a JSON file.

In [9]:
def save_faq_to_json(faq_list, filename):
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(faq_list, json_file, ensure_ascii=False, indent=4)

path = base_dir + "faq.json"
save_faq_to_json(all_faqs, path)
print(f"FAQs saved to {path}")

FAQs saved to data/faq/faq.json


In [10]:
def load_faq_json(filename):
    with open(filename, "r", encoding="utf-8") as json_file:
        faqs = json.load(json_file)
    return faqs

### Convert JSON to CSV
This function converts FAQ from JSON to CSV. This is so that there can be better management when using the pandas library.

In [4]:
def convert_to_csv(faqs, filename, fieldnames):
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for faq in faqs:
            writer.writerow(faq)

In [11]:
fieldnames = ["url", "question", "answer"]
json_path = base_dir + "faq.json"
path = base_dir + "faq.csv"
faqs = load_faq_json(json_path)
convert_to_csv(faqs, path, fieldnames)
print(f"FAQs saved to {path}")

FAQs saved to data/faq/faq.csv


In [13]:
# Show the first 5 rows of the CSV file using pandas
df = pd.read_csv(path)
df.head(10)

Unnamed: 0,url,question,answer
0,https://www.semtech.com/design-support/faq,What is the best compromise between spreading ...,Consider the following scenario: the payload d...
1,https://www.semtech.com/design-support/faq,Is there a LoRa IQ Waveform library narrower t...,Lower BW are made by simply playing the wavefo...
2,https://www.semtech.com/design-support/faq,Can I used a long preamble to wake-up devices ...,This method is extremely widespread in the ind...
3,https://www.semtech.com/design-support/faq,Does a LoRaWAN concentrator have built-in GPS ...,Most commercial gateways have either a built-i...
4,https://www.semtech.com/design-support/faq,What are the typical ranges of good and poor v...,Typical Noise Floor is usually close to -120 d...
5,https://www.semtech.com/design-support/faq,What would be the minimal channel spacing for ...,The typical channel spacing for LoRa and LoRaW...
6,https://www.semtech.com/design-support/faq,Where does the CAD Detected interrupt occur in...,The CAD interrupt happens at a determined time...
7,https://www.semtech.com/design-support/faq,It is possible to initialize a packet transmis...,It is not possible to use one of the DIOs of S...
8,https://www.semtech.com/design-support/faq,What is the maximum size of an application pay...,The payload size limitations are identical in ...
9,https://www.semtech.com/design-support/faq,Why is there a latency when a SX1272 is proces...,Between the moment a packet is sent by a senso...


In [14]:
df.tail(20)

Unnamed: 0,url,question,answer
113,https://www.semtech.com/design-support/faq/P100,What are the steps to troubleshoot when two SX...,"First of all, check the frequency offset cause..."
114,https://www.semtech.com/design-support/faq/P100,"How do you choose the LoRa BW, Spreading facto...","LoRaWAN uses primarily the 125kHz BW setting, ..."
115,https://www.semtech.com/design-support/faq/P100,"For a LoRa wide band signal, how can you measu...","If it is just for measurement, you can use the..."
116,https://www.semtech.com/design-support/faq/P100,How to choose a proper crystal for a LoRa device?,"Normally, a +/-10ppm XTAL is good enough for m..."
117,https://www.semtech.com/design-support/faq/P100,How can I implement a mass production test for...,Please visit the blog post:Expert Series: Test...
118,https://www.semtech.com/design-support/faq/P100,How to troubleshoot the output power if it is ...,
119,https://www.semtech.com/design-support/faq/P100,Is it OK to change the mode between FSK and Lo...,"Yes, it is no problem. The LoRa device can be ..."
120,https://www.semtech.com/design-support/faq/P120,Why is the output power of my SX1276 module no...,The +20dBm specification is for the output pow...
121,https://www.semtech.com/design-support/faq/P120,What is the process of the LoRa Channel Activi...,Instead of using a Received Signal Strength In...
122,https://www.semtech.com/design-support/faq/P120,What is the actual Tx power that can be achiev...,Our SX127x product family supports up to +20dB...


In [15]:
# Clean the data by removing the empty rows in the column "answer"
df.dropna(subset=["answer"], axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
df.tail(20)


Unnamed: 0,url,question,answer
107,https://www.semtech.com/design-support/faq/P100,How can it be possible to receive a wrong pack...,"In LoRa mode, even if the CRC is wrong the pay..."
108,https://www.semtech.com/design-support/faq/P100,What are the steps to troubleshoot when two SX...,"First of all, check the frequency offset cause..."
109,https://www.semtech.com/design-support/faq/P100,"How do you choose the LoRa BW, Spreading facto...","LoRaWAN uses primarily the 125kHz BW setting, ..."
110,https://www.semtech.com/design-support/faq/P100,"For a LoRa wide band signal, how can you measu...","If it is just for measurement, you can use the..."
111,https://www.semtech.com/design-support/faq/P100,How to choose a proper crystal for a LoRa device?,"Normally, a +/-10ppm XTAL is good enough for m..."
112,https://www.semtech.com/design-support/faq/P100,How can I implement a mass production test for...,Please visit the blog post:Expert Series: Test...
113,https://www.semtech.com/design-support/faq/P100,Is it OK to change the mode between FSK and Lo...,"Yes, it is no problem. The LoRa device can be ..."
114,https://www.semtech.com/design-support/faq/P120,Why is the output power of my SX1276 module no...,The +20dBm specification is for the output pow...
115,https://www.semtech.com/design-support/faq/P120,What is the process of the LoRa Channel Activi...,Instead of using a Received Signal Strength In...
116,https://www.semtech.com/design-support/faq/P120,What is the actual Tx power that can be achiev...,Our SX127x product family supports up to +20dB...


In [16]:
# Save the cleaned data to a new CSV file
path = base_dir + "faq_cleaned.csv"
df.to_csv(path, index=False)
print(f"Cleaned FAQs saved to {path}")

Cleaned FAQs saved to data/faq/faq_cleaned.csv


This Notebook describes the complete process of extracting and converting FAQ data from a web page, saving it to JSON and CSV, and finally displaying the content of the CSV.