In [21]:
###### final code for json format for one board #######
import requests
from bs4 import BeautifulSoup
import re
import json
from langdetect import detect

def extract_details_from_post(post_div):
    text = post_div.get_text().strip()
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    tables = [str(table) for table in post_div.find_all('table')]
    links = [a['href'] for a in post_div.find_all('a', href=True)]
    return {
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None"),
        "table": clean_text(tables[0] if tables else "None"),
        "link": links[0] if links else "None"
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        lang = detect(text)
        return lang == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if not filter_valid_details(details):
                continue
            
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {"information zum forum": []}}

    for page_number in range(8):
        url = base_url + str(page_number * 20) + '.html'
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        threads_table = soup.find('table', {'class': 'table_grid'})
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = thread.find('td', class_='subject').find('a')['href']
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }

            forum_data["board"]["information zum forum"].append(thread_data)

    return forum_data

# The base URL for the forum board
base_url = "https://www.forum.diabetesinfo.de/forum/index.php/board,2."
forum_data = scrape_forum(base_url)

with open('forum_data.json', 'w', encoding='utf-8') as file:
    json.dump(forum_data, file, indent=4, ensure_ascii=False)


In [22]:
###### final code for json lines format for one board #######
import requests
from bs4 import BeautifulSoup
import re
import json
from langdetect import detect

def extract_details_from_post(post_div):
    text = post_div.get_text().strip()
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    tables = [str(table) for table in post_div.find_all('table')]
    links = [a['href'] for a in post_div.find_all('a', href=True)]
    return {
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None"),
        "table": clean_text(tables[0] if tables else "None"),
        "link": links[0] if links else "None"
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        lang = detect(text)
        return lang == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if not filter_valid_details(details):
                continue
            
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {"information zum forum": []}}

    for page_number in range(8):
        url = base_url + str(page_number * 20) + '.html'
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        threads_table = soup.find('table', {'class': 'table_grid'})
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = thread.find('td', class_='subject').find('a')['href']
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }

            forum_data["board"]["information zum forum"].append(thread_data)

    return forum_data

# The base URL for the forum board
base_url = "https://www.forum.diabetesinfo.de/forum/index.php/board,2."
forum_data = scrape_forum(base_url)

with open('forum_data.jsonl', 'w', encoding='utf-8') as file:
    for thread_data in forum_data["board"]["information zum forum"]:
        json_line = json.dumps(thread_data, ensure_ascii=False)
        file.write(json_line + '\n')

In [10]:
##########  FOR ALL BOARDS ##############
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

# Code from forum_boards.ipynb to extract forum boards and sub-boards
def get_forum_links(page_url):
    main_boards = []
    sub_boards = []

    # Fetching the HTML content from the URL
    response = requests.get(page_url)
    html_content = response.text

    # Parsing the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Finding all main boards (anchor tags with class "subject")
    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = board['href']
        main_boards.append((board_name, board_url))

    # Finding all sub-boards (anchor tags with the specific title pattern)
    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

# Code from crawling_forumposts.ipynb to scrape specific board page
def extract_details_from_post(post_div):
    text = post_div.get_text().strip()
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    tables = [str(table) for table in post_div.find_all('table')]
    links = [a['href'] for a in post_div.find_all('a', href=True)]
    return {
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None"),
        "table": clean_text(tables[0] if tables else "None"),
        "link": links[0] if links else "None"
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        lang = detect(text)
        return lang == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if not filter_valid_details(details):
                continue
            
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {"Externe Umfragen/Studien": []}}

    for page_number in range(6):
        url = base_url + str(page_number * 20) + '.html'
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        threads_table = soup.find('table', {'class': 'table_grid'})
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = thread.find('td', class_='subject').find('a')['href']
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }

            forum_data["board"]["Externe Umfragen/Studien"].append(thread_data)

    return forum_data


# Main code to iterate through boards and scrape data
page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Board Name_{i+1}: {board_name}, Link: {board_url}")

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    forum_data = scrape_forum(board_url)
    board_key = list(forum_data["board"].keys())[0]  # Get the first key in the "board" dictionary
    filename = os.path.join('output', f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for thread_data in forum_data["board"][board_key]:  # Use the correct key to access the data
            json_line = json.dumps(thread_data, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")


Board Name_1: Informationen zum Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,2.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_2: Externe Umfragen/Studien, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,86.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_3: Liste der Begriffe und Abkürzungen aus dem Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,15.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_4: Neues auf Diabetesinfo.de, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,23.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_5: Rezepte, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,17.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_6: Newsflash, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,46.0.html?PHPSESSID=01f1c8e10c676ad2b31e8217f2cd9953
Board Name_7: Allgemeiner Bereich, Link: https://www.forum.diabetesinfo.de/foru

In [2]:
######### Board Names and links ##########
from bs4 import BeautifulSoup
import requests

def get_forum_links(page_url):
    # Lists to store the main board and sub-board information
    main_boards = []
    sub_boards = []

    # Fetching the HTML content from the URL
    response = requests.get(page_url)
    html_content = response.text

    # Parsing the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Finding all main boards (anchor tags with class "subject")
    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = board['href']
        main_boards.append((board_name, board_url))

    # Finding all sub-boards (anchor tags with the specific title pattern)
    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Board Name_{i+1}: {board_name}, Link: {board_url}")


Board Name_1: Informationen zum Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,2.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_2: Externe Umfragen/Studien, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,86.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_3: Liste der Begriffe und Abkürzungen aus dem Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,15.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_4: Neues auf Diabetesinfo.de, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,23.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_5: Rezepte, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,17.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_6: Newsflash, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,46.0.html?PHPSESSID=e1f812f79679fe78be51b61be3f1b4d0
Board Name_7: Allgemeiner Bereich, Link: https://www.forum.diabetesinfo.de/foru

In [2]:
import json

def read_first_line_from_jsonl(file_path):
    with open(file_path, 'r') as f:
        first_line = f.readline().strip()
        return json.loads(first_line)

original = read_first_line_from_jsonl('/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/forum_data_7.jsonl')
cleaned = read_first_line_from_jsonl('/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/forum_data_7_cleaned.jsonl')

print("Original:")
print(json.dumps(original, indent=4))

print("Cleaned:")
print(json.dumps(cleaned, indent=4))


Original:
{
    "Betreff / Begonnen von": "Worum es in diesem Board geht",
    "text": "author post: Joerg Moeller Administrator Special Member Beitr\u00e4ge: 16865 Country: Ohana hei\u00dft \"Familie\"... Diabetestyp: DM 1 Therapie: Insulin-Pumpe Worum es in diesem Board geht \u00ab am: April 04, 2007, 13:07 \u00bb Hier im Forum tauchen immer wieder mal Begriffe und Zusammenh\u00e4nge auf, die nicht jeder kennt. Die k\u00f6nnte man dann erkl\u00e4ren, aber vielleicht tauchen in der Erkl\u00e4rung dann wieder neue unbekannte Begriffe auf.Hier soll das anders sein, denn hier sollen Dinge so erkl\u00e4rt werden, da\u00df wirklich jeder sie versteht. Dazu kann man ganz einfach Beispiele aus dem ganz normalen Alltag nehmen (so wie sie z.B. dem Gustl aus http://www.einsteiger.diabetesinfo.de/ pr\u00e4sentiert werden.Nat\u00fcrlich kann hier jeder seine Fragen stellen. Aber die sollten dann nicht pers\u00f6nlich bezogen sein (\"Warum ist das bei mir so und so\"), sondern eher allgemein; grun

In [3]:
import os
import shutil

def safe_move(src, dest):
    counter = 0
    new_dest = dest
    while os.path.exists(new_dest):
        counter += 1
        new_dest = "{}_{}".format(dest, counter)
    shutil.move(src, new_dest)

# Define the current directory (01_raw) containing both original and cleaned files
current_directory = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/'

# Define the subdirectories where you want to move the original and cleaned files
original_subdirectory = os.path.join(current_directory, '01_raw_original/')
cleaned_subdirectory = os.path.join(current_directory, '01_raw_cleaned/')

# Create the subdirectories if they don't exist
os.makedirs(original_subdirectory, exist_ok=True)
os.makedirs(cleaned_subdirectory, exist_ok=True)

# Loop through each file in the current directory (01_raw)
for filename in os.listdir(current_directory):
    if filename.endswith('.jsonl'):
        full_path = os.path.join(current_directory, filename)
        if '_cleaned' in filename:
            safe_move(full_path, os.path.join(cleaned_subdirectory, filename))
        else:
            safe_move(full_path, os.path.join(original_subdirectory, filename))


In [4]:
import os
import shutil

# Define the current directory (01_raw) containing both original and cleaned files
current_directory = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/'

# Define the subdirectories where you want to move the original and cleaned files
original_subdirectory = os.path.join(current_directory, '01_raw_original/')
cleaned_subdirectory = os.path.join(current_directory, '01_raw_cleaned/')

# Create the subdirectories if they don't exist
os.makedirs(original_subdirectory, exist_ok=True)
os.makedirs(cleaned_subdirectory, exist_ok=True)

# Initialize lists to hold the names of original and cleaned files
original_files = []
cleaned_files = []

# Categorize files
for filename in os.listdir(current_directory):
    if filename.endswith('.jsonl'):
        if '_cleaned' in filename:
            cleaned_files.append(filename)
        else:
            original_files.append(filename)

# Move the files to respective subdirectories
for filename in original_files:
    full_path = os.path.join(current_directory, filename)
    shutil.move(full_path, os.path.join(original_subdirectory, filename))
for filename in cleaned_files:
    full_path = os.path.join(current_directory, filename)
    shutil.move(full_path, os.path.join(cleaned_subdirectory, filename))


In [None]:
import os
import shutil

# Base directory containing all the JSONLine files
base_dir = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/'

# Subdirectories for original and cleaned files
original_dir = os.path.join(base_dir, '01_raw_original')
cleaned_dir = os.path.join(base_dir, '01_raw_cleaned')

# Create the subdirectories if they don't exist
if not os.path.exists(original_dir):
    os.makedirs(original_dir)
    
if not os.path.exists(cleaned_dir):
    os.makedirs(cleaned_dir)

# List all files in the base directory
all_files = [f for f in os.listdir(base_dir) if os.path.isfile(os.path.join(base_dir, f))]

# Move files to the appropriate subdirectories
for filename in all_files:
    if filename.endswith('.jsonl'):
        src_path = os.path.join(base_dir, filename)
        
        # Check if the file name contains 'cleaned'
        if 'cleaned' in filename:
            dest_path = os.path.join(cleaned_dir, filename)
        else:
            dest_path = os.path.join(original_dir, filename)
        
        # Move the file
        shutil.move(src_path, dest_path)


In [1]:
import os
import shutil

# Define the directories
original_dir = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_original'
cleaned_dir = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned'

# Create the cleaned_dir if it doesn't exist
if not os.path.exists(cleaned_dir):
    os.makedirs(cleaned_dir)

# List all files in original_dir
all_files = [f for f in os.listdir(original_dir) if os.path.isfile(os.path.join(original_dir, f))]

# Move cleaned files to cleaned_dir
for filename in all_files:
    if 'cleaned' in filename and filename.endswith('.jsonl'):
        src_path = os.path.join(original_dir, filename)
        dest_path = os.path.join(cleaned_dir, filename)
        
        shutil.move(src_path, dest_path)
        print(f"Moved {filename} to {cleaned_dir}")


Moved forum_data_0_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_16_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_37_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_21_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_4_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_34_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_25_cleaned.jsonl to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned
Moved forum_data_13_cleaned.jsonl to /data/share/project/smart_hospital/medical_datas

In [2]:
import shutil
import os

# Define the directory to delete
dir_to_delete = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_original'

# Check if the directory exists
if os.path.exists(dir_to_delete):
    # Delete the directory and all its contents
    shutil.rmtree(dir_to_delete)
    print(f"Directory {dir_to_delete} has been deleted.")
else:
    print(f"Directory {dir_to_delete} does not exist.")


Directory /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_original has been deleted.


In [3]:
import os

# Define the current directory name and the new directory name
current_dir_name = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned'
new_dir_name = '/data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/cleaned_data'

# Check if the current directory exists
if os.path.exists(current_dir_name):
    # Rename the directory
    os.rename(current_dir_name, new_dir_name)
    print(f"Directory {current_dir_name} has been renamed to {new_dir_name}.")
else:
    print(f"Directory {current_dir_name} does not exist.")

Directory /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/01_raw_cleaned has been renamed to /data/share/project/smart_hospital/medical_dataset/forumposts_jsonfiles/01_raw/cleaned_data.


In [2]:
# save author post in "text" for a single file

import json

input_file_path = "/home/IAIS/jdatta/output/forum_data_0.jsonl"
output_file_path = "/home/IAIS/jdatta/output_cleaned/output_0.jsonl"

def process_line(line):
    data = json.loads(line)
    
    # Extract "antwort" keys and their corresponding "text" values
    antwort_texts = []
    for key, value in data["antwort"].items():
        if "text" in value:
            antwort_texts.append(value["text"])
    
    reformatted_data = {
        "Betreff / Begonnen von": data["Betreff / Begonnen von"],
        "text": data["author_post"]["text"],
        "antwort_texts": antwort_texts,  # List of "antwort" text values
        "Antworten / Aufrufe": data["Antworten / aufrufe"],
        "Letzter_Beitrag": data["Letzter_Beitrag"]
    }
    return reformatted_data

with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
    for line in input_file:
        reformatted_data = process_line(line)
        json.dump(reformatted_data, output_file, ensure_ascii=False)
        output_file.write('\n')

In [3]:
# save author post in "text" for all files
import os
import json

input_dir = '/home/IAIS/jdatta/output'
output_dir = '/home/IAIS/jdatta/output_cleaned'


def process_line(line):
    data = json.loads(line)
    
    # Skip processing if "author_post" is null
    if data.get("author_post") is None:
        return None
    
    antwort_texts = []
    if "antwort" in data and isinstance(data["antwort"], dict):
        for key, value in data["antwort"].items():
            if isinstance(value, dict) and "text" in value:
                antwort_texts.append(value["text"])
    
    author_post = data.get("author_post", {})
    author_post_text = author_post.get("text", "")

    reformatted_data = {
        "Betreff / Begonnen von": data.get("Betreff / Begonnen von", ""),
        "text": author_post_text,
        "antwort_texts": antwort_texts,
        "Antworten / Aufrufe": data.get("Antworten / Aufrufe", ""),
        "Letzter_Beitrag": data.get("Letzter_Beitrag", "")
    }
    return reformatted_data

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process each JSONL file in the input directory
for input_filename in os.listdir(input_dir):
    if input_filename.endswith(".jsonl"):
        input_file_path = os.path.join(input_dir, input_filename)
        output_file_path = os.path.join(output_dir, input_filename)

        with open(output_file_path, 'w') as output_file:
            with open(input_file_path, 'r') as input_file:
                for line in input_file:
                    reformatted_data = process_line(line)
                    if reformatted_data is not None:
                        json.dump(reformatted_data, output_file, ensure_ascii=False)
                        output_file.write('\n')

            print(f"Processing complete for {input_filename}")

print("All JSONL files processed.")

Processing complete for forum_data_9.jsonl
Processing complete for forum_data_24.jsonl
Processing complete for forum_data_1.jsonl
Processing complete for forum_data_33.jsonl
Processing complete for forum_data_18.jsonl
Processing complete for forum_data_34.jsonl
Processing complete for forum_data_19.jsonl
Processing complete for forum_data_6.jsonl
Processing complete for forum_data_22.jsonl
Processing complete for forum_data_3.jsonl
Processing complete for forum_data_15.jsonl
Processing complete for forum_data_28.jsonl
Processing complete for forum_data_35.jsonl
Processing complete for forum_data_11.jsonl
Processing complete for forum_data_5.jsonl
Processing complete for forum_data_36.jsonl
Processing complete for forum_data_21.jsonl
Processing complete for forum_data_13.jsonl
Processing complete for forum_data_16.jsonl
Processing complete for forum_data_10.jsonl
Processing complete for forum_data_4.jsonl
Processing complete for forum_data_14.jsonl
Processing complete for forum_data_0.j

In [4]:
# Merge text of all files
import os
import json
from langdetect import detect

# Input and output directories
input_dir = "/home/IAIS/jdatta/output_cleaned"
output_dir = "/home/IAIS/jdatta/output_cleaned/answers_merged"



# Function to detect the language of a text
def detect_language(text):
    try:
        return detect(text)
    except:
        return None

# Function to process a single JSONL line and modify it
def process_line(data):
    # Check if the "text" key exists in the data
    if "text" in data:
        text = data["text"]
        antwort_texts = data.get("antwort_texts", [])
        
        # Convert the list of "antwort_texts" into a single string
        antwort_text_string = " ".join(["Antwort: " + item for item in antwort_texts])

        # Concatenate "author post:" and the modified "antwort_texts" string to the "text"
        data["text"] = "Forumpost: " + text + " " + antwort_text_string

        # Remove the "antwort_texts" field
        del data["antwort_texts"]

        # Check the language of the merged text
        text_language = detect_language(data["text"])

        # If the language is not German, return None to exclude this line
        if text_language != "de":
            return None

    return data


# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process each JSONL file in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".jsonl"):
        input_file_path = os.path.join(input_dir, filename)
        output_file_path = os.path.join(output_dir, filename)

        with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, "w", encoding="utf-8") as output_file:
            for line in input_file:
                data = json.loads(line)
                processed_data = process_line(data)

                # Only write the line if it's not None (i.e., it passed the language check)
                if processed_data is not None:
                    output_file.write(json.dumps(processed_data, ensure_ascii=False) + "\n")

print("Conversion completed. Output files are in", output_dir)

Conversion completed. Output files are in /home/IAIS/jdatta/output_cleaned/answers_merged


In [1]:
import json

# Path to your JSONL file
jsonl_file_path = '/home/IAIS/jdatta/forumposts/text_normalization/dataset.jsonl'

try:
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, start=1):  # Start counting from line 1
            try:
                # Parse JSON from each line
                parsed_json = json.loads(line)
                
                # Perform some basic operations (replace with your own logic)
                if 'text' in parsed_json:
                    text_length = len(parsed_json['text'])
                
                print(f"Successfully parsed line {i}, Text Length: {text_length}")
            
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {i}")
            
            except Exception as e:
                print(f"An unexpected error occurred on line {i}: {e}")

except FileNotFoundError:
    print(f"The file {jsonl_file_path} was not found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully parsed line 1, Text Length: 3072
Successfully parsed line 2, Text Length: 1581
Successfully parsed line 3, Text Length: 1546
Successfully parsed line 4, Text Length: 1198
Successfully parsed line 5, Text Length: 80854
Successfully parsed line 6, Text Length: 3103
Successfully parsed line 7, Text Length: 1436
Successfully parsed line 8, Text Length: 1857
Successfully parsed line 9, Text Length: 6445
Successfully parsed line 10, Text Length: 2371
Successfully parsed line 11, Text Length: 7923
Successfully parsed line 12, Text Length: 6665
Successfully parsed line 13, Text Length: 27000
Successfully parsed line 14, Text Length: 1642
Successfully parsed line 15, Text Length: 11676
Successfully parsed line 16, Text Length: 5987
Successfully parsed line 17, Text Length: 1976
Successfully parsed line 18, Text Length: 2444
Successfully parsed line 19, Text Length: 1644
Successfully parsed line 20, Text Length: 6749
Successfully parsed line 21, Text Length: 3072
Successfully parsed

In [2]:
# Initialize counters and other variables outside the loop
word_count = 0
missing_fields = 0

# Inside the loop
try:
    # Parse JSON from each line
    parsed_json = json.loads(line)

    # Count the number of words in the 'text' field
    if 'text' in parsed_json:
        text_length = len(parsed_json['text'])
        word_count += len(parsed_json['text'].split())
    else:
        missing_fields += 1

    print(f"Successfully parsed line {i}, Text Length: {text_length}, Word Count: {word_count}")

except json.JSONDecodeError:
    print(f"Error decoding JSON on line {i}")

# After the loop
print(f"Total Word Count: {word_count}")
print(f"Total Missing Fields: {missing_fields}")


Successfully parsed line 3925, Text Length: 292, Word Count: 45
Total Word Count: 45
Total Missing Fields: 0


In [5]:
def copy_specific_line_from_jsonl(file_path, line_number, output_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
            # Enumerate through the lines in the file, starting from 1
            for i, line in enumerate(infile, start=1):
                # Check if the current line is the one to be copied
                if i == line_number:
                    outfile.write(line)
                    print(f"Copied line {line_number} to {output_path}")
                    return
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Usage example
copy_specific_line_from_jsonl('/home/IAIS/jdatta/forumposts/text_normalization/output_normalized_file.jsonl', 1731, '/home/IAIS/jdatta/forumposts/text_normalization/specific_line.jsonl')


Copied line 1731 to /home/IAIS/jdatta/forumposts/text_normalization/specific_line.jsonl


In [1]:
import json
import unicodedata

def normalize_text(text):
    try:
        return ''.join(
            c for c in unicodedata.normalize('NFD', text)
            if unicodedata.category(c) != 'Mn'
        )
    except Exception as e:
        print(f"An error occurred while normalizing text: {e}")
        return text  # Return the original text if normalization fails

# Path to your input and output JSONL files
input_jsonl_path = '/home/IAIS/jdatta/forumposts/text_normalization/output_normalized_file.jsonl'
output_jsonl_path = '/home/IAIS/jdatta/dataset/normalized.jsonl'

try:
    line_count = 0
    processed_count = 0
    
    with open(input_jsonl_path, 'r', encoding='utf-8') as infile, open(output_jsonl_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line_count += 1
            
            try:
                # Parse the JSON object from each line
                json_obj = json.loads(line)
                
                # Normalize the text in the 'text' field (change this to the field you wish to normalize)
                if 'text' in json_obj:
                    json_obj['text'] = normalize_text(json_obj['text'])
                
                # Write the modified JSON object back to the output file
                outfile.write(json.dumps(json_obj) + '\n')
                processed_count += 1
                
            except json.JSONDecodeError:
                print(f"JSON decode error on line {line_count}")
                
            except Exception as e:
                print(f"An unexpected error occurred on line {line_count}: {e}")
                
    print(f"Processed {processed_count} lines out of {line_count} lines.")
    
except FileNotFoundError:
    print(f"The file {input_jsonl_path} was not found.")
    
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Processed 3922 lines out of 3922 lines.


In [1]:
import json

def validate_jsonl(file_path):
    with open(file_path, 'r') as f:
        line_number = 0
        for line in f:
            line_number += 1
            try:
                json.loads(line.strip())
            except json.JSONDecodeError as e:
                return False, f"Invalid JSON on line {line_number}: {e}"
    return True, "File is valid"

# Usage
is_valid, message = validate_jsonl("/home/IAIS/jdatta/forumposts/text_normalization/output_normalized_file.jsonl")
if is_valid:
    print("The .jsonl file is valid.")
else:
    print(f"Validation failed: {message}")


The .jsonl file is valid.


In [1]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

def get_forum_links(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    main_boards = []
    sub_boards = []

    # Finding main boards
    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text.strip()
        board_url = board['href']
        main_boards.append((board_name, board_url))
    
    # Finding sub boards (if any)
    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text.strip()}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

def extract_details_from_post(post_div):
    # Extracting text only from the 'inner' div as suggested
    inner_div = post_div.find("div", class_="inner")
    if inner_div:
        text = " ".join(s.strip() for s in inner_div.strings)
        text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
        return {
            "text": clean_text(text)
        }
    return {"text": ""}

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        return detect(text) == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, responses = None, []
    current_page_number = 1

    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')
        posts_divs = soup.find_all('div', class_='post_wrapper')

        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if filter_valid_details(details):
                if index == 0 and current_page_number == 1:
                    author_post = details
                else:
                    responses.append(details)

        # Handling pagination
        page_links_div = soup.find('div', class_='pagelinks floatleft')
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', class_='navPages') if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, responses

def scrape_forum(base_url):
    forum_data = {"board": {"Externe Umfragen/Studien": []}}
    for page_number in range(6):
        url = f"{base_url}{page_number * 20}.html"
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')
        threads_table = soup.find('table', class_='table_grid')
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            subject_cell = thread.find('td', class_='subject')
            title = subject_cell.find('a').text.split('Begonnen')[0].strip()
            stats = clean_text(thread.find('td', class_='stats').text)
            last_post = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = subject_cell.find('a')['href']
            author_post, responses = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "responses": {f"response_{i+1}": resp for i, resp in enumerate(responses)},
                "Antworten / Aufrufe": stats,
                "Letzter Beitrag": last_post
            }
            forum_data["board"]["Externe Umfragen/Studien"].append(thread_data)

    return forum_data

# Example use case
page_url = 'https://www.forum.diabetesinfo.de/forum/index.php/board,2.'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name} at {board_url}...")
    forum_data = scrape_forum(board_url)
    filename = os.path.join('output', f'forum_data_{i}.json')
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(forum_data, file, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

print("Scraping completed!")


Scraping Forum-Treff at https://www.forum.diabetesinfo.de/forum/index.php/board,42.0.html?PHPSESSID=06855fa8593aced48677332349cdecc1...
Data saved to output/forum_data_0.json
Scraping completed!


In [4]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

# Define a function to get links to both main and sub-forums
def get_forum_links(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    main_boards = []
    sub_boards = []

    # Finding main boards
    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text.strip()
        board_url = board['href']
        main_boards.append((board_name, board_url))
    
    # Finding sub boards (if any)
    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text.strip()}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

# Define a function to extract post details from a div element
def extract_details_from_post(post_div):
    inner_div = post_div.find("div", class_="inner")
    if inner_div:
        text = " ".join(s.strip() for s in inner_div.strings)
        text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
        return {
            "text": clean_text(text)
        }
    return {"text": ""}

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        return detect(text) == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    thread_id = url.split('=')[-1] 
    author_post, responses = None, []
    current_page_number = 1

    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')
        posts_divs = soup.find_all('div', class_='post_wrapper')

        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if filter_valid_details(details):
                details['thread_id'] = thread_id
                if index == 0 and current_page_number == 1:
                    author_post = details
                else:
                    responses.append(details)

        # Handling pagination
        page_links_div = soup.find('div', class_='pagelinks floatleft')
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', class_='navPages') if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, responses

# Define a function to scrape a forum based on a base URL, collecting data from threads
def scrape_forum(base_url):
    forum_data = {"board": {"Externe Umfragen/Studien": []}}
    for page_number in range(6):
        url = f"{base_url}{page_number * 20}.html"
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')
        threads_table = soup.find('table', class_='table_grid')
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            subject_cell = thread.find('td', class_='subject')
            title = subject_cell.find('a').text.split('Begonnen')[0].strip()
            stats = clean_text(thread.find('td', class_='stats').text)
            last_post = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = subject_cell.find('a')['href']
            author_post, responses = scrape_thread(thread_url)

            # Include author post and responses as separate entries
            if author_post:
                author_post.update({
                    "post_type": "author_post",
                    "Betreff / Begonnen von": title,
                    "Antworten / Aufrufe": stats,
                    "Letzter Beitrag": last_post
                })
                forum_data["board"]["Externe Umfragen/Studien"].append(author_post)

            for response in responses:
                response.update({
                    "post_type": "response",
                    "Betreff / Begonnen von": title,
                    "Antworten / Aufrufe": stats,
                    "Letzter Beitrag": last_post
                })
                forum_data["board"]["Externe Umfragen/Studien"].append(response)

    return forum_data

# Example use case
page_url = 'https://www.forum.diabetesinfo.de/forum/index.php/board,15.'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name} at {board_url}...")
    forum_data = scrape_forum(board_url)
    # Ensure the output directory exists
    os.makedirs('output', exist_ok=True)
    filename = os.path.join('output', f'forum_data_{i}.json')
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(forum_data, file, ensure_ascii=False, indent=4)
    print(f"Data saved to {filename}")

print("Scraping completed!")


Scraping completed!


In [6]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

def get_forum_links(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    main_boards = [(board.text, board['href']) for board in soup.find_all('a', class_='subject')]
    sub_boards = [("Untergeordnete_Boards_{}: {}".format(i+1, board.text), board['href']) 
                  for i, board in enumerate(soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x))]
    return main_boards + sub_boards

def extract_details_from_post(post_div, thread_id):
    text_elements = post_div.find("div", class_="inner")
    text = " ".join([s.strip() for s in text_elements.stripped_strings if s.strip()]) if text_elements else ""
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    return {
        "thread_id": thread_id,
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None")
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        return detect(text) == 'de'
    except:
        return False

def filter_valid_details(details):
    return len(details['text']) >= 20 and is_german(details['text'])

def scrape_thread(url):
    thread_id = url.split('/')[-1]  # Assuming the URL contains the thread ID at the end
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div, thread_id)
            if not filter_valid_details(details):
                continue
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
        url = next_page_links[0]['href'] if next_page_links else None
        current_page_number += 1
    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {}}
    for page_number in range(6):
        url = f"{base_url}{page_number * 20}.html"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        threads = soup.find_all('tr')[1:]
        for thread in threads:
            thread_url = thread.find('td', class_='subject').find('a')['href']
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }
            if title not in forum_data["board"]:
                forum_data["board"][title] = []
            forum_data["board"][title].append(thread_data)
    return forum_data

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php/board,2.'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Board Name_{i+1}: {board_name}, Link: {board_url}")

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    forum_data = scrape_forum(board_url)
    board_key = list(forum_data["board"].keys())[0] 
    filename = os.path.join('output', f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for thread_data in forum_data["board"][board_key]: 
            json_line = json.dumps(thread_data, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")


Board Name_1: Forum-Treff, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,42.0.html?PHPSESSID=15ccfed1fe228611d638916510997c7d
Scraping Forum-Treff...


AttributeError: 'NoneType' object has no attribute 'find'

In [1]:
########################################### Main ######################################

from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

def get_forum_links(page_url):
    main_boards = []
    sub_boards = []

    response = requests.get(page_url)
    html_content = response.text
    
    soup = BeautifulSoup(html_content, 'html.parser')

    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = board['href']
        main_boards.append((board_name, board_url))

    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

def extract_details_from_post(post_div):
    text = post_div.get_text().strip()
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)
    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    tables = [str(table) for table in post_div.find_all('table')]
    links = [a['href'] for a in post_div.find_all('a', href=True)]
    return {
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None"),
        "table": clean_text(tables[0] if tables else "None"),
        "link": links[0] if links else "None"
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        lang = detect(text)
        return lang == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if not filter_valid_details(details):
                continue
            
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {"Externe Umfragen/Studien": []}}

    for page_number in range(6):
        url = base_url + str(page_number * 20) + '.html'
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        threads_table = soup.find('table', {'class': 'table_grid'})
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = thread.find('td', class_='subject').find('a')['href']
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }

            forum_data["board"]["Externe Umfragen/Studien"].append(thread_data)

    return forum_data

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Board Name_{i+1}: {board_name}, Link: {board_url}")

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    forum_data = scrape_forum(board_url)
    board_key = list(forum_data["board"].keys())[0] 
    filename = os.path.join('output', f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for thread_data in forum_data["board"][board_key]: 
            json_line = json.dumps(thread_data, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")

Board Name_1: Informationen zum Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,2.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_2: Externe Umfragen/Studien, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,86.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_3: Liste der Begriffe und Abkürzungen aus dem Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,15.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_4: Neues auf Diabetesinfo.de, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,23.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_5: Rezepte, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,17.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_6: Newsflash, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,46.0.html?PHPSESSID=e11c6a2fda331bdcf5f7f4bf8727074f
Board Name_7: Allgemeiner Bereich, Link: https://www.forum.diabetesinfo.de/foru

KeyboardInterrupt: 

In [8]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os

def get_forum_links(page_url):
    main_boards = []
    sub_boards = []

    response = requests.get(page_url)
    html_content = response.text
    
    soup = BeautifulSoup(html_content, 'html.parser')

    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = board['href']
        main_boards.append((board_name, board_url))

    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = board['href']
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

def extract_details_from_post(post_div):
    text_elements = post_div.find_all("div", class_="inner")
    text = " ".join([s.strip() for s in text_elements[0].strings]) if text_elements else ""
    text = re.sub(r'« Antwort #\d+ am:.*?»', '', text)

    quotes = [quote.get_text().strip() for quote in post_div.find_all('blockquote')]
    links = [a['href'] for a in post_div.find_all('a', href=True)]
    
    return {
        "text": clean_text(text),
        "quote": clean_text(quotes[0] if quotes else "None"),
        "link": links[0] if links else "None"
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        lang = detect(text)
        return lang == 'de'
    except:
        return False

def filter_valid_details(details):
    text = details['text']
    if len(text) < 20 or not is_german(text):
        return False
    return True

def scrape_thread(url):
    author_post, antworts = None, []
    current_page_number = 1
    while url:
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        posts_divs = soup.find_all('div', class_='post_wrapper')
        for index, post_div in enumerate(posts_divs):
            details = extract_details_from_post(post_div)
            if not filter_valid_details(details):
                continue
            
            if index == 0 and current_page_number == 1:
                author_post = details
            else:
                antworts.append(details)

        page_links_div = soup.find('div', {'class': 'pagelinks floatleft'})
        if page_links_div:
            next_page_links = [a for a in page_links_div.find_all('a', {'class': 'navPages'}) if a.text.strip() == str(current_page_number + 1)]
            url = next_page_links[0]['href'] if next_page_links else None
            current_page_number += 1
        else:
            url = None

    return author_post, antworts

def scrape_forum(base_url):
    forum_data = {"board": {"Externe Umfragen/Studien": []}}

    for page_number in range(6):
        url = base_url + str(page_number * 20) + '.html'
        response = requests.get(url)
        response.encoding = 'ISO-8859-1'
        soup = BeautifulSoup(response.text, 'html.parser')

        threads_table = soup.find('table', {'class': 'table_grid'})
        threads = threads_table.find_all('tr')[1:]

        for thread in threads:
            title = thread.find('td', class_='subject').find('a').text.split('Begonnen')[0].strip()
            antworten_aufrufe = clean_text(thread.find('td', class_='stats').text)
            letzter_beitrag = clean_text(thread.find('td', class_='lastpost').text)
            thread_url = thread.find('td', class_='subject').find('a')['href']
            author_post, antworts = scrape_thread(thread_url)

            thread_data = {
                "Betreff / Begonnen von": title,
                "author_post": author_post,
                "antwort": {f"antwort_{i+1}": antwort for i, antwort in enumerate(antworts)},
                "Antworten / aufrufe": antworten_aufrufe,
                "Letzter_Beitrag": letzter_beitrag
            }

            forum_data["board"]["Externe Umfragen/Studien"].append(thread_data)

    return forum_data

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
for i, (board_name, board_url) in enumerate(board_links):
    print(f"Board Name_{i+1}: {board_name}, Link: {board_url}")

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    forum_data = scrape_forum(board_url)
    board_key = list(forum_data["board"].keys())[0] 
    filename = os.path.join('output', f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for thread_data in forum_data["board"][board_key]: 
            json_line = json.dumps(thread_data, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")


Board Name_1: Informationen zum Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,2.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_2: Externe Umfragen/Studien, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,86.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_3: Liste der Begriffe und Abkürzungen aus dem Forum, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,15.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_4: Neues auf Diabetesinfo.de, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,23.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_5: Rezepte, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,17.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_6: Newsflash, Link: https://www.forum.diabetesinfo.de/forum/index.php/board,46.0.html?PHPSESSID=0345c2f7d3a33b0ce7f3fd368769848a
Board Name_7: Allgemeiner Bereich, Link: https://www.forum.diabetesinfo.de/foru

KeyboardInterrupt: 

In [10]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os
from urllib.parse import urljoin

def get_forum_links(page_url):
    main_boards = []
    sub_boards = []

    response = requests.get(page_url)
    html_content = response.text
    
    soup = BeautifulSoup(html_content, 'html.parser')

    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = urljoin(page_url, board['href'])
        main_boards.append((board_name, board_url))

    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = urljoin(page_url, board['href'])
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

def extract_details_from_post(post_div, thread_url):
    text_elements = post_div.find_all("div", class_="inner")
    text = " ".join(s.strip() for s in text_elements[0].strings) if text_elements else ""
    return {
        "text": clean_text(text),
        "thread_url": thread_url
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        return detect(text) == 'de'
    except:
        return False

def filter_valid_details(details):
    return len(details['text']) >= 20 and is_german(details['text'])

def scrape_thread(url):
    posts = []
    while url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser', from_encoding='ISO-8859-1')
        posts_divs = soup.find_all('div', class_='post_wrapper')
        posts.extend(extract_details_from_post(div, url) for div in posts_divs if filter_valid_details(extract_details_from_post(div, url)))
        next_page_link = soup.find('a', string=str(len(posts) // 20 + 1))
        url = urljoin(url, next_page_link['href']) if next_page_link else None
    return posts

def scrape_forum(base_url):
    posts = []
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser', from_encoding='ISO-8859-1')
    threads = [urljoin(base_url, tr.find('a')['href']) for tr in soup.find_all('tr')[1:] if tr.find('a')]
    for thread_url in threads:
        posts.extend(scrape_thread(thread_url))
    return posts

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    posts = scrape_forum(board_url)
    filename = os.path.join(output_folder, f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for post in posts:
            json_line = json.dumps(post, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")

Scraping Informationen zum Forum...
Data saved to output/forum_data_0.jsonl
Scraping Externe Umfragen/Studien...
Data saved to output/forum_data_1.jsonl
Scraping Liste der Begriffe und Abkürzungen aus dem Forum...
Data saved to output/forum_data_2.jsonl
Scraping Neues auf Diabetesinfo.de...


KeyboardInterrupt: 

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import json
from langdetect import detect
import os
from urllib.parse import urljoin

def get_forum_links(page_url):
    main_boards = []
    sub_boards = []

    response = requests.get(page_url)
    html_content = response.text
    
    soup = BeautifulSoup(html_content, 'html.parser')

    boards = soup.find_all('a', class_='subject')
    for board in boards:
        board_name = board.text
        board_url = urljoin(page_url, board['href'])
        main_boards.append((board_name, board_url))

    sub_boards_elements = soup.find_all('a', title=lambda x: x and "Keine neuen Beiträge" in x)
    for i, board in enumerate(sub_boards_elements):
        board_name = f"Untergeordnete_Boards_{i+1}: {board.text}"
        board_url = urljoin(page_url, board['href'])
        sub_boards.append((board_name, board_url))

    return main_boards + sub_boards

def extract_details_from_post(post_div, thread_url):
    text_elements = post_div.find_all("div", class_="inner")
    text = " ".join(s.strip() for s in text_elements[0].strings) if text_elements else ""
    return {
        "text": clean_text(text),
        "thread_url": thread_url
    }

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def is_german(text):
    try:
        return detect(text) == 'de'
    except:
        return False

def filter_valid_details(details):
    return len(details['text']) >= 20 and is_german(details['text'])

def scrape_thread(url):
    posts = []
    page_number = 1
    while url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser', from_encoding='ISO-8859-1')
        posts_divs = soup.find_all('div', class_='post_wrapper')
        posts.extend(extract_details_from_post(div, url) for div in posts_divs if filter_valid_details(extract_details_from_post(div, url)))
        next_page_link = soup.find('a', text=str(page_number + 1))
        url = urljoin(url, next_page_link['href']) if next_page_link else None
        page_number += 1
    return posts

def scrape_forum(base_url):
    posts = []
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser', from_encoding='ISO-8859-1')
    threads = [urljoin(base_url, tr.find('a')['href']) for tr in soup.find_all('tr')[1:] if tr.find('a')]
    for thread_url in threads:
        posts.extend(scrape_thread(thread_url))
    return posts

page_url = 'https://www.forum.diabetesinfo.de/forum/index.php'
board_links = get_forum_links(page_url)
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

for i, (board_name, board_url) in enumerate(board_links):
    print(f"Scraping {board_name}...")
    posts = scrape_forum(board_url)
    filename = os.path.join(output_folder, f'forum_data_{i}.jsonl')
    with open(filename, 'w', encoding='utf-8') as file:
        for post in posts:
            json_line = json.dumps(post, ensure_ascii=False)
            file.write(json_line + '\n')
    print(f"Data saved to {filename}")

print("Scraping completed!")

Scraping Informationen zum Forum...


  next_page_link = soup.find('a', text=str(page_number + 1))


Data saved to output/forum_data_0.jsonl
Scraping Externe Umfragen/Studien...


KeyboardInterrupt: 