In [None]:
import time
import re
import json
import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# ------------------------------------------
# 1) SCRAPE LATEST TOPICS USING SELENIUM
# ------------------------------------------
# Launch a browser, open the forum's "latest" listing, keep scrolling until the
# page height stops growing, then capture the rendered HTML in `page_source`.
driver = webdriver.Chrome()  # or webdriver.Firefox(), etc.
try:
    driver.get("https://community.seqera.io/latest")  # The page we'll scrape

    # Other forum listings that can be used instead of the URL above:
    # Option 1 : https://opennms.discourse.group/latest
    # Option 2 : https://help.galaxyproject.org/latest
    # Option 3 : https://cwl.discourse.group/latest

    # Scroll to load all topics: stop once a scroll no longer increases the
    # document height, i.e. nothing new was appended to the page.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait a bit for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Get the complete page source after scrolling
    page_source = driver.page_source
finally:
    # Always release the browser, even if loading or scrolling raised;
    # otherwise the browser process is left running.
    driver.quit()

# Parse the fully scrolled page and locate the table body holding the topics.
soup = BeautifulSoup(page_source, "html.parser")
tbody = soup.find("tbody", {"class": "topic-list-body"})

# Safety check: if tbody not found, stop here instead of failing further down.
# `raise SystemExit` is used rather than `exit()`: inside a Jupyter kernel
# `exit()` only requests a kernel shutdown and does not stop the running cell,
# so the code below would still run with `tbody` set to None.
if tbody is None:
    print("No topics found. Exiting.")
    raise SystemExit(0)

# One <tr> per topic in the listing.
rows = tbody.find_all("tr")

def _parse_activity_dates(row):
    """Return ``(created, latest)`` date strings read from a row's cell tooltips.

    Scans every ``<td>`` in ``row`` that has a ``title`` attribute and picks up
    tooltip lines of the form ``Created: <date>`` and ``Latest: <date>``.
    Either value is ``None`` when no such line is present, so a row without
    this tooltip simply yields ``(None, None)``.

    NOTE(review): assumes the forum puts "Created: ..." / "Latest: ..." lines
    (English locale) in a cell tooltip -- confirm against the target site.
    """
    created = None
    latest = None
    for cell in row.find_all("td", title=True):
        for line in str(cell.get("title", "")).splitlines():
            # Split on the first colon only: the date part may contain
            # colons of its own (e.g. a time of day).
            label, sep, value = line.partition(":")
            if not sep:
                continue
            label = label.strip().lower()
            value = value.strip() or None
            if label == "created" and created is None:
                created = value
            elif label == "latest" and latest is None:
                latest = value
    return created, latest


# Parallel per-row lists: index i in every list describes the same table row.
titles = []
hrefs = []
replies_list = []
created_list = []
latest_list = []

for row in rows:
    # 1) Title + href
    # CSS selectors require all listed classes but ignore their order and any
    # extra classes; `class_="a b c"` would only match that exact string.
    link_tag = row.select_one("a.title.raw-link.raw-topic-link")
    if link_tag:
        titles.append(link_tag.get_text(strip=True))
        hrefs.append(link_tag.get("href"))
    else:
        titles.append(None)
        hrefs.append(None)

    # 2) Number of replies (kept as the displayed text, not converted to int)
    button_tag = row.select_one("button.btn-link.posts-map.badge-posts")
    span_tag = button_tag.find("span", class_="number") if button_tag else None
    replies_list.append(span_tag.get_text(strip=True) if span_tag else None)

    # 3) Created / latest-activity dates (None when the row does not expose them)
    created, latest = _parse_activity_dates(row)
    created_list.append(created)
    latest_list.append(latest)


# ------------------------------------------
# 2) BUILD A LIST OF TOPIC DICTIONARIES
# ------------------------------------------
topics_data = []
for title, link, replies, created, latest in zip(
    titles, hrefs, replies_list, created_list, latest_list
):
    if link is None:
        continue  # Skip if no link found

    # The link is something like: /t/slug-title/1578
    # We can extract the numeric ID from the tail of the URL
    # e.g. 1578 from '/t/some-topic/1578'
    match = re.search(r'/(\d+)$', link)
    post_id = match.group(1) if match else None

    # Store partial data now; the 'body' is added in a later step
    topic_info = {
        "title": title,
        "relative_link": link,
        "post_id": post_id,
        "replies": replies,
        "created_date": created,
        "latest_date": latest
    }
    topics_data.append(topic_info)


In [None]:
# ------------------------------------------
# 3) SAVE RESULTS TO JSON
# ------------------------------------------
# Write the collected topic dictionaries to disk as indented UTF-8 JSON,
# keeping non-ASCII characters as they are instead of escaping them.
output_filename = "seqera.json"

with open(output_filename, "w", encoding="utf-8") as out_file:
    serialized_topics = json.dumps(topics_data, ensure_ascii=False, indent=4)
    out_file.write(serialized_topics)

topic_count = len(topics_data)
print(f"\nDone! Scraped {topic_count} topics. See '{output_filename}'.")

In [None]:
# Script to extract the body of each topic's post

In [None]:
# import json
# import time
# import re

# from selenium import webdriver
# from bs4 import BeautifulSoup

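# # 1) Load the JSON data you already have
# #    NOTE(review): the save cell above writes "seqera.json", not
# #    "forum_half_body.json" -- confirm which file this step should read.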
# with open("forum_half_body.json", "r", encoding="utf-8") as f:
#     topics_data = json.load(f)

# # 2) Filter out the topics that you still need to scrape
# #    For example, skip those that already have a body or whose post_id <= 1581
# topics_to_scrape = []
# for t in topics_data:
#     # If 'body' is None or empty, and post_id is greater than 1581
#     # (Adjust the condition as you prefer.)
#     if (not t.get("body")) and t.get("post_id"):
#         try:
#             if int(t["post_id"]) > 1581:
#                 topics_to_scrape.append(t)
#         except ValueError:
#             pass  # handle if post_id is not an integer string

# print(f"Total topics loaded: {len(topics_data)}")
# print(f"Topics to scrape now: {len(topics_to_scrape)}")

# # 3) Use Selenium to scrape only the remaining topics
# driver = webdriver.Chrome()
# driver.implicitly_wait(30)

# base_url = "https://community.seqera.io"

# for topic in topics_to_scrape:
#     full_url = base_url + topic["relative_link"]
#     driver.get(full_url)
#     time.sleep(2)  # Adjust or use explicit waits

#     page_source_post = driver.page_source
#     soup_post = BeautifulSoup(page_source_post, "html.parser")
#     cooked_div = soup_post.find('div', class_='cooked')

#     if cooked_div:
#         body_text = cooked_div.get_text(separator="\n", strip=True)
#         topic["body"] = body_text
#         print(f"Scraped body for post_id={topic['post_id']}")
#     else:
#         topic["body"] = None
#         print(f"No 'cooked' div found for post_id={topic['post_id']}")

# # Done scraping, close driver
# driver.quit()

# # 4) Save updated data back into the JSON
# with open("seqera_topics.json", "w", encoding="utf-8") as f:
#     json.dump(topics_data, f, ensure_ascii=False, indent=4)

# print("Partial scraping complete. JSON updated.")
