In [4]:
import json
import os

# Define the directories and file names.
# Input and output currently point at the same directory.
doc_check_dir = '/data/share/project/smart_hospital/medical_dataset/doc_check/01_raw'
raw_dir = '/data/share/project/smart_hospital/medical_dataset/doc_check/01_raw'

json_file_path = os.path.join(doc_check_dir, 'dataset_doccheck.json')
jsonl_file_path = os.path.join(raw_dir, 'doc_check.jsonl')

# Read the JSON file. Decode explicitly as UTF-8 instead of relying on the
# platform's locale encoding, so the cell behaves the same on every machine.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Write the JSONL file: one JSON document per line.
# NOTE(review): iterating a dict yields only its keys, so if the top-level JSON
# value is an object rather than an array, the values are never written --
# confirm the shape of this file.
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    for item in json_data:
        jsonl_file.write(json.dumps(item) + '\n')

print(f"Converted {json_file_path} to {jsonl_file_path}.")


Converted /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/dataset_doccheck.json to /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/doc_check.jsonl.


In [3]:
######### doc_check file converted from json to jsonl ###########
import json
import os

# Define the directories and file names (input and output share one directory)
doc_check_dir = '/data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original'
raw_dir = '/data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original'

# Create the directories if they don't exist
os.makedirs(doc_check_dir, exist_ok=True)
os.makedirs(raw_dir, exist_ok=True)

json_file_path = os.path.join(doc_check_dir, 'dataset_doccheck.json')
jsonl_file_path = os.path.join(raw_dir, 'doc_check.jsonl')

# Initialize counter for written items
written_items = 0

try:
    # Read the JSON file (explicit UTF-8 rather than the locale default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    print(f"Read items from {json_file_path}.")

    # Validate that json_data is a dictionary
    if not isinstance(json_data, dict):
        print("The JSON data is not a dictionary. Exiting.")
        # `exit` is the interactive-shell helper and is not meant for program
        # flow; raising SystemExit directly has the same effect in a plain
        # interpreter and does not depend on that helper being installed.
        # SystemExit is not an Exception subclass, so the handlers below do
        # not swallow it.
        raise SystemExit(1)

    # Write the JSONL file: one {"text", "id"} record per dictionary entry
    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for key, value in json_data.items():
            # Remove all white-space from the value.
            # NOTE(review): this also removes the spaces between words --
            # confirm that is really wanted for downstream consumers.
            value_no_whitespace = ''.join(value.split())

            # Transform the data into the desired format
            json_object = {"text": value_no_whitespace, "id": key}

            # Write the transformed JSON object to the JSONL file
            jsonl_file.write(json.dumps(json_object) + '\n')
            written_items += 1

    print(f"Converted {json_file_path} to {jsonl_file_path}.")
    print(f"Wrote {written_items} items to {jsonl_file_path}.")

except FileNotFoundError as e:
    print(f"File not found: {e}")
except PermissionError as e:
    print(f"Permission error: {e}")
except Exception as e:
    # Also reached when a value is not a string (no .split method)
    print(f"An unexpected error occurred: {e}")


Read items from /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/dataset_doccheck.json.
Converted /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/dataset_doccheck.json to /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/doc_check.jsonl.
Wrote 13136 items to /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/doc_check.jsonl.


In [1]:
import os
import json

def convert_json_to_jsonl(json_dir, output_jsonl_path):
    """
    Convert multiple JSON files in a directory to a single JSONL file.

    Every ``*.json`` file directly inside ``json_dir`` is expected to hold a
    JSON object; each of its ``key: value`` pairs becomes one output line of
    the form ``{"text": value, "id": key}``. Files are processed in sorted
    filename order so the output is reproducible between runs.

    Parameters:
        json_dir (str): The directory containing the JSON files to convert.
        output_jsonl_path (str): The path where the output JSONL file will be saved.

    Returns:
        None. If ``json_dir`` is not an existing directory a message is
        printed and no output file is written.
    """
    # Ensure the provided path is an existing directory (a plain file would
    # otherwise make os.listdir fail further down)
    if not os.path.isdir(json_dir):
        print(f"The directory {json_dir} does not exist.")
        return

    # Collect all lines first so the output file is only created once every
    # input file has been parsed successfully
    jsonl_lines = []

    # os.listdir returns entries in arbitrary order; sort for determinism
    for filename in sorted(os.listdir(json_dir)):
        # Only process files with a .json extension
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(json_dir, filename)

        # Read the JSON file (explicit UTF-8 rather than the locale default)
        with open(filepath, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Convert each key-value pair to a JSONL-compatible line
        for key, value in json_data.items():
            jsonl_lines.append(json.dumps({"text": value, "id": key}))

    # Write the JSONL lines to the output file, one record per line
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in jsonl_lines)

# Directory holding the source JSON files, and the single JSONL file that
# collects their contents (written one level above the source directory)
json_dir = '/data/share/project/smart_hospital/medical_dataset/springer_jsons/01_raw/original/'
output_jsonl_path = '/data/share/project/smart_hospital/medical_dataset/springer_jsons/01_raw/Springer_dataset.jsonl'

# Call the conversion function; it prints a message and writes nothing if
# json_dir does not exist
convert_json_to_jsonl(json_dir, output_jsonl_path)


In [6]:
import json

# Function to convert a text with multiple paragraphs to a single JSONL file
def convert_text_to_jsonl(text, output_jsonl_path):
    """
    Write one JSON record per line of ``text`` to a JSONL file.

    The text is cut at every line break. For each resulting piece all
    whitespace is stripped out (including the blanks between words) and a
    record ``{"text": <piece>, "id": <1-based position as string>}`` is
    emitted. Records are joined with newlines; no newline follows the last one.

    Parameters:
        text (str): The text containing multiple paragraphs.
        output_jsonl_path (str): The path where the output JSONL file will be saved.
    """
    # Build every serialized record in one pass; ids start at 1
    records = [
        json.dumps({"text": ''.join(chunk.split()), "id": str(position)})
        for position, chunk in enumerate(text.split('\n'), start=1)
    ]

    # Persist the records, newline-separated
    with open(output_jsonl_path, 'w') as out:
        out.write('\n'.join(records))

# Sample usage with placeholder text. The placeholder contains no line breaks,
# so it is handled as a single paragraph.
text = """Your sample text here separated by line breaks for each paragraph."""

# NOTE(review): this is the same path the khresmoi conversion elsewhere in this
# notebook writes to, so running this cell overwrites that file.
output_jsonl_path = '/data/share/project/smart_hospital/medical_dataset/kres/01_raw/kres_dataset.jsonl'

convert_text_to_jsonl(text, output_jsonl_path)

Successfully converted the text to JSONL format. The output file is saved at /data/share/project/smart_hospital/medical_dataset/kres/01_raw/kres_dataset.jsonl.


In [8]:
import json

# Function to read text from a file
def read_text_from_file(input_file_path, encoding="utf-8"):
    """
    Return the complete contents of a text file as one string.

    Parameters:
        input_file_path (str): Path of the file to read.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale.

    Returns:
        str: The decoded file contents.
    """
    with open(input_file_path, 'r', encoding=encoding) as f:
        return f.read()

# Function to convert a text with multiple paragraphs to a single JSONL file
def convert_text_to_jsonl(text, output_jsonl_path):
    """
    Convert a text with one paragraph per line to a JSONL file.

    Each line of ``text`` becomes a record ``{"text": <line>, "id": <n>}``
    where ``n`` is the 1-based line number as a string. Blank lines inside
    the text are kept as records with empty text so ids stay aligned with
    line numbers.

    Parameters:
        text (str): The text containing one paragraph per line.
        output_jsonl_path (str): The path where the output JSONL file will be saved.
    """
    paragraphs = text.split('\n')

    # A terminating newline (or an empty input) leaves an empty final segment
    # that is not a real paragraph; drop it so no bogus trailing record is
    # written.
    if paragraphs and paragraphs[-1] == '':
        paragraphs.pop()

    # Every record, including the last, is newline-terminated, and the file is
    # written as UTF-8 regardless of locale.
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for idx, paragraph in enumerate(paragraphs, start=1):
            f.write(json.dumps({"text": paragraph, "id": str(idx)}) + '\n')

# Plain-text source file; it is split on line breaks into one record per line
input_file_path = '/data/share/project/smart_hospital/medical_dataset/kres/01_raw/khresmoi-summary-dev.de'

# Destination JSONL file, written next to the source file
output_jsonl_path = '/data/share/project/smart_hospital/medical_dataset/kres/01_raw/kres_dataset.jsonl'

# Load the whole source file into memory as one string
text = read_text_from_file(input_file_path)

# Convert the text to JSONL and save it
convert_text_to_jsonl(text, output_jsonl_path)

In [9]:
# Preview the first n records of the generated JSONL file
n = 10  # Number of lines to read
with open('/data/share/project/smart_hospital/medical_dataset/kres/01_raw/kres_dataset.jsonl', 'r') as f:
    for i, line in enumerate(f):
        if i >= n:
            break
        # Lines read from a file keep their trailing newline; strip it so
        # print() does not emit an extra blank line after every record
        print(line.rstrip('\n'))



{"text": "BeiderTyp-2-Infektion(einerGangr\u00e4ndurchh\u00e4molysierendeStreptokokken)tretendieGruppe-A-Streptokokken(GAS)isoliertoderinKombinationmitanderenSpezies,\u00fcblicherweiseinVerbindungmitdemStaphylococcusaureus,auf.", "id": "1"}

{"text": "DieverwendeteStrahlungsmengef\u00fcreinenR\u00f6ntgen-Thoraxistsehrgering.", "id": "2"}

{"text": "DieMeningokokken-ErkrankungisteineernsthafteBakterieninfektion,diezuSchwellungenimGehirnundR\u00fcckenmarksowiezurEntz\u00fcndungdesBlutsundandererOrganef\u00fchrenkann.", "id": "3"}

{"text": "BeschwerdenimZusammenhangmitnekrotisierenderZellulitisundnekrotisierenderFasziitiswerdenhierbesprochen.", "id": "4"}

{"text": "Empf\u00e4ngervonOrgantransplantatenzeigeneinerh\u00f6htesRisikof\u00fcrdieInfektiondurchNTMaufgrundeinergeschw\u00e4chten,zellvermitteltenImmunabwehr,dennochtretenNTM-InfektionenindieserPopulationrelativseltenauf.", "id": "5"}

{"text": "WokannichInformationenzurDiagnosederGaucher-KrankheitoderzumUmgangdamitfinden?", "id": "

In [5]:
import os
import json

# Input and output directories. They are identical, i.e. the files are
# rewritten in place.
input_dir = "/home/IAIS/jdatta/output"
output_dir = "/home/IAIS/jdatta/output"

# Initialize a counter for unique document IDs; the first document gets ID "0"
document_id_counter = 0

# Process each JSONL file in the input directory in sorted order (to ensure we start with file 0).
# NOTE(review): the sort is lexicographic, so e.g. "x_10.jsonl" is processed
# before "x_2.jsonl" -- confirm this is the intended ID order.
for filename in sorted(os.listdir(input_dir)):
    if filename.endswith(".jsonl"):
        input_file_path = os.path.join(input_dir, filename)
        output_file_path = os.path.join(output_dir, filename)

        # Opening the destination with mode "w" truncates it immediately. When
        # source and destination are the same file that wipes the data before
        # a single line has been read, so write to a sibling temporary file
        # and swap it into place only after the whole file has been processed.
        temp_file_path = output_file_path + ".tmp"

        try:
            with open(input_file_path, "r", encoding="utf-8") as input_file, \
                    open(temp_file_path, "w", encoding="utf-8") as output_file:
                for line in input_file:
                    # Tolerate blank lines (e.g. a trailing empty line)
                    if not line.strip():
                        continue
                    data = json.loads(line)

                    # Assign the current document ID counter value and then increment it
                    data["id"] = str(document_id_counter)  # Using str() to ensure the ID is a string. Remove if you prefer integer IDs.
                    document_id_counter += 1

                    output_file.write(json.dumps(data, ensure_ascii=False) + "\n")
        except BaseException:
            # Leave the original file untouched and do not litter the
            # directory with a half-written temporary file
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
            raise

        # Replace the destination in one step now that the new content is complete
        os.replace(temp_file_path, output_file_path)

print("Sequential ID assignment completed. Updated files are in", output_dir)


Sequential ID assignment completed. Updated files are in /home/IAIS/jdatta/output


In [4]:
import json
import os

# Define the directories and file names
doc_check_dir = '/home/IAIS/jdatta/output'
# raw_dir used to be left undefined here (its assignment was commented out),
# so this cell only ran when another cell had already set it and failed with
# a NameError on a fresh kernel.
# NOTE(review): assumed the JSONL should be written next to the input file --
# confirm the intended output directory.
raw_dir = doc_check_dir

# Create the directories if they don't exist
os.makedirs(doc_check_dir, exist_ok=True)
os.makedirs(raw_dir, exist_ok=True)

json_file_path = os.path.join(doc_check_dir, 'forum_data_0.json')
jsonl_file_path = os.path.join(raw_dir, 'forum_data_0.jsonl')

# Initialize counter for written items
written_items = 0

try:
    # Read the JSON file (explicit UTF-8 rather than the locale default)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    print(f"Read items from {json_file_path}.")

    # Validate that json_data is a dictionary
    if not isinstance(json_data, dict):
        print("The JSON data is not a dictionary. Exiting.")
        # `exit` is the interactive-shell helper and is not meant for program
        # flow; raise SystemExit directly. It is not an Exception subclass, so
        # the handlers below do not swallow it.
        raise SystemExit(1)

    # Write the JSONL file: one {"text", "id"} record per top-level entry
    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for key, value in json_data.items():

            # Transform the data into the desired format
            json_object = {"text": value, "id": key}

            # Write the transformed JSON object to the JSONL file
            jsonl_file.write(json.dumps(json_object) + '\n')
            written_items += 1

    print(f"Converted {json_file_path} to {jsonl_file_path}.")
    print(f"Wrote {written_items} items to {jsonl_file_path}.")

except FileNotFoundError as e:
    print(f"File not found: {e}")
except PermissionError as e:
    print(f"Permission error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Read items from /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/dataset_doccheck.json.
Converted /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/dataset_doccheck.json to /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/doc_check.jsonl.
Wrote 13136 items to /data/share/project/smart_hospital/medical_dataset/doc_check/01_raw/original/doc_check.jsonl.


In [2]:
######### Ufal ###########

import json
import os


# Define the directories and file names: the source JSON sits in the dataset
# root, the JSONL is written to its 01_raw sub-directory
doc_check_dir = '/data/share/project/smart_hospital/medical_dataset/ufal'
raw_dir = '/data/share/project/smart_hospital/medical_dataset/ufal/01_raw'

# Create the directories if they don't exist
os.makedirs(doc_check_dir, exist_ok=True)
os.makedirs(raw_dir, exist_ok=True)

json_file_path = os.path.join(doc_check_dir, 'dataset_ufal.json')
jsonl_file_path = os.path.join(raw_dir, 'ufal.jsonl')

# Initialize counter for written items
written_items = 0

try:
    # Read the JSON file (explicit UTF-8 rather than the locale default).
    # json.load parses the whole document into memory at once.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    print(f"Read items from {json_file_path}.")

    # Validate that json_data is a dictionary
    if not isinstance(json_data, dict):
        print("The JSON data is not a dictionary. Exiting.")
        # `exit` is the interactive-shell helper and is not meant for program
        # flow; raise SystemExit directly. It is not an Exception subclass, so
        # the handlers below do not swallow it.
        raise SystemExit(1)

    # Write the JSONL file: one {"text", "id"} record per dictionary entry
    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for key, value in json_data.items():
            # Transform the data into the desired format
            json_object = {"text": value, "id": key}

            # Write the transformed JSON object to the JSONL file
            jsonl_file.write(json.dumps(json_object) + '\n')
            written_items += 1

    print(f"Converted {json_file_path} to {jsonl_file_path}.")
    print(f"Wrote {written_items} items to {jsonl_file_path}.")

except FileNotFoundError as e:
    print(f"File not found: {e}")
except PermissionError as e:
    print(f"Permission error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Read items from /data/share/project/smart_hospital/medical_dataset/ufal/dataset_ufal.json.
Converted /data/share/project/smart_hospital/medical_dataset/ufal/dataset_ufal.json to /data/share/project/smart_hospital/medical_dataset/ufal/01_raw/ufal.jsonl.
Wrote 37814533 items to /data/share/project/smart_hospital/medical_dataset/ufal/01_raw/ufal.jsonl.


In [9]:
import json

def convert_json_to_jsonl(json_file_path, jsonl_file_path,
                          key_path=('board', 'Externe Umfragen/Studien')):
    """
    Extract a nested collection of posts from a JSON file and write it as JSONL.

    Parameters:
        json_file_path (str): Path of the JSON file to read.
        jsonl_file_path (str): Path of the JSONL file to write.
        key_path (sequence of str): Keys followed from the top-level JSON
            object down to the collection of posts. Defaults to the
            'board' -> 'Externe Umfragen/Studien' section.

    Raises:
        KeyError: If a key in ``key_path`` is missing from the document.
    """
    # Explicit UTF-8 rather than the locale default
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Load JSON data from file

    # Walk down to the requested section
    posts = data
    for key in key_path:
        posts = posts[key]

    # One post per output line.
    # NOTE(review): assumes `posts` is a list; iterating a dict would write
    # only its keys -- confirm against the input file.
    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for post in posts:
            jsonl_file.write(json.dumps(post) + '\n')

# Example usage: read forum_data_0.json and write the extracted posts to
# output.jsonl in the same directory
convert_json_to_jsonl('/home/IAIS/jdatta/output/forum_data_0.json', '/home/IAIS/jdatta/output/output.jsonl')


In [14]:
import json

# JSONL file read line by line below, and the JSONL file the reformatted
# records are written to
input_file_path = "/home/IAIS/jdatta/output/output.jsonl"
output_file_path = "/home/IAIS/jdatta/output/test-output.jsonl"

def process_line(line):
    """
    Flatten one forum-thread record into a subject plus a single text field.

    Parameters:
        line (str): One JSON document holding the keys
            "Betreff / Begonnen von", "author_post" (with a "text" entry) and
            "responses" (a mapping whose values each have a "text" entry).

    Returns:
        dict: ``{"Betreff / Begonnen von": ..., "text": ...}`` where "text" is
        the author's post followed by every response, separated by single
        spaces.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    data = json.loads(line)

    # Extract texts from responses, in the mapping's own order
    responses_texts = [resp["text"] for resp in data["responses"].values()]

    # Merge author's post text with responses texts. Joining all parts in one
    # go avoids a dangling trailing space when a thread has no responses.
    full_text = " ".join([data["author_post"]["text"], *responses_texts])

    # Reformatted data keeping the necessary structure
    return {
        "Betreff / Begonnen von": data["Betreff / Begonnen von"],
        "text": full_text,
    }

# Reformat every record of the input file and write it as one JSON line.
# The output keeps non-ASCII characters verbatim (ensure_ascii=False), so both
# files are opened explicitly as UTF-8; with the locale-default encoding the
# write could fail or produce differently encoded files on another machine.
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
        open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        reformatted_data = process_line(line)
        json.dump(reformatted_data, output_file, ensure_ascii=False)
        output_file.write('\n')