In [26]:
import os               # Provides access to operating system-dependent functionality
import openai           # OpenAI's GPT-3 language model library
import requests         # Library for making HTTP requests
import httpx            # Library for making asynchronous HTTP requests
import spacy            # Library for natural language processing (NLP)
from bs4 import BeautifulSoup  # Library for web scraping and parsing HTML/XML documents
import asyncio

from dotenv import load_dotenv
load_dotenv()


True

In [None]:
from spacy.cli import download
download("en_core_web_sm")

In [31]:
async def generate_summary_chunk(chunk):
    # Define the conversation messages for the GPT-3 model
    messages = [
        {"role": "system", "content": "You are an AI language model tasked with summarizing articles in bullet points."},
        {"role": "user", "content": f"Here's an article chunk to summarize:\n\n{chunk}\n\n"},
        {"role": "user", "content": "Provide the most interesting and important elements in an easy to understand way."}
    ]
    
    summary = "-----------------"
    try:
        # Use an asynchronous HTTP client to make a POST request to the OpenAI API
        async with httpx.AsyncClient() as client:
            response = await client.post(
                    "https://api.openai.com/v1/chat/completions",  # API endpoint
                    json={
                        "model": "gpt-3.5-turbo-0301",  # Model name
                        "messages": messages,  # Conversation messages
                        "max_tokens": 100,  # Maximum number of tokens in the response
                        "temperature": 0.9,  # Sampling temperature
                        "n": 1,  # Number of completions to generate
                        "stream": False,  # Streaming mode
                        "stop": None,  # Stop sequence
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {openai.api_key}",  # API key for authorization
                },
            )
        response_data = response.json()
        summary = response_data['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(e)

    return summary  # Return the summary text



In [4]:
def fetch_html(url: str) -> str:
    response = requests.get(url)  # Make an HTTP GET request to the URL
    return response.text          # Return the text content of the response


In [8]:
def extract_text(url_content):
    soup = BeautifulSoup(url_content, "html.parser")  # Parse the HTML content
    text_parts = []  # Initialize an empty list to store text parts
    # Iterate over all <p> and <div> elements in the HTML and extract their text content
    for p in soup.find_all(["p", "div"]):
        text_parts.append(p.text)
    # Join the text parts with newline characters and return the result
    return "\n".join(text_parts)

In [10]:
nlp = spacy.load("en_core_web_sm")


In [11]:
# Define a function to extract keywords from a text string
def extract_keywords(text, num_keywords=5):
    doc = nlp(text)  # Process the text using the spaCy language model
    keywords = []  # Initialize an empty list to store keywords
    # Iterate over named entities in the text and extract keywords based on entity labels
    for ent in doc.ents:
        if ent.label_ in ["ORG", "PERSON", "GPE", "NORP"]:
            keywords.append(ent.text)
    # Iterate over tokens in the text and extract keywords based on part-of-speech tags
    for token in doc:
        if token.is_stop or token.is_punct:
            continue  # Skip stop words and punctuation
        if token.pos_ in ["NOUN", "ADJ", "VERB"] and len(keywords) < num_keywords:
            keywords.append(token.text)
    return keywords



In [3]:
print(os.environ.get("INIT_ENV"))

123


In [32]:
url = "https://docs.espressif.com/projects/esp-idf/en/latest/esp32s3/security/secure-boot-v2.html"
url_content = fetch_html(url)  # Fetch the HTML content of the URL
article = extract_text(url_content)  # Extract the text content from the HTML
keywords = extract_keywords(article)  # Extract keywords from the article text
print(set(keywords))
print("\n")
# generate_summary_chunk([""])

chunk_size = 2800  # Define the maximum size of each article chunk
# Split the article into chunks based on the defined chunk size
article_chunks = [article[i:i + chunk_size] for i in range(0, len(article), chunk_size)]

# Use concurrency to process chunks simultaneously and generate summaries for each chunk
for chunk in article_chunks:
    x = await generate_summary_chunk(chunk)
    print(x)

# summaries = [await generate_summary_chunk(chunk) for chunk in article_chunks]
# final_summary = "\n".join(summaries)
# print(final_summary)
# print("\n")


{'1196', 'JTAG Debugging', 'Fatal Errors', 'Espressif Systems', 'Apps', 'eFuse', 'Flash Encryption', 'Reproducible Builds', 'Thread Local Storage', 'RSA Signature', 'the Project Configuration Menu', 'KEY_PURPOSE_X. Stores', 'PSS', 'Device Firmware Update', 'Windows', 'BINARY_FILE', 'Signatures\nSigning', 'Description', 'Get Started', 'KEY_DIGESTs', 'HSM', 'KEY_PURPOSE_X', 'Linker Script Generation', 'Generate', 'Contributions Guide', 'PDF\n\n\n\n\n\n\n\n\n\n ', 'RSA\nEnable', 'C++ Support', 'Offset\nSize', 'CONFIG_SECURE_BOOT_ALLOW_UNUSED_DIGEST_SLOTS', 'ESP32-S3', 'esptool.py', 'Remote Signing', 'Keyfile', 'Building', 'RSA-PSS Signature', 'Secure Boot verification', 'Bootloader Size', 'Signing Schemeâ\x80\x9d', 'OTA', 'my_secure_boot_signing_key.pem 3072', 'OpenThread', 'SECURE_BOOT_DIGEST2', 'Hardware Security Module', 'Firmware Upgrade', 'MMU', 'OSX/Linux', 'KB', 'RSA', 'RSA Public Modulus', 'CONFIG_SECURE_BOOT_INSECURE', 'Secure Boot V2', 'Copyrights', 'Server', 'RSA-PSS', 'byte', 