In [1]:
import pandas as pd
import os
import json
import os
from utils import split_data, convert_to_parquet, combine_window
import ast
import numpy as np

In [7]:
def collapse_json(data):
    """Flatten a stringified ``{key: [values, ...]}`` mapping into one line of text.

    Each entry is rendered as ``"key: v1; v2. "``, underscores are replaced by
    spaces in the final text, and surrounding whitespace is stripped.

    Args:
        data: A string holding a Python-literal dict whose values are iterables,
            or a missing value (``None`` or a float NaN).

    Returns:
        The collapsed text, or ``"Not available"`` when ``data`` is missing.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``data`` is not a valid Python literal.
    """
    # Detect missing values by value, not identity: a NaN float is not guaranteed
    # to be the very same object as ``np.nan``, so ``data is not np.nan`` would let
    # such values through to ``ast.literal_eval`` and crash.
    if data is None or (isinstance(data, float) and np.isnan(data)):
        return "Not available"

    parsed = ast.literal_eval(data)
    pieces = [
        f"{key}: " + "; ".join(map(str, values)) + ". "
        for key, values in parsed.items()
    ]
    return "".join(pieces).replace("_", " ").strip()

# Generate the input and output columns for each raw file

In [8]:
dir = '../Dataset/medical-raw'
output_dir = '../Dataset/medical'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before writing the first file.
os.makedirs(output_dir, exist_ok=True)

for file in os.listdir(dir):
    df = pd.read_csv(os.path.join(dir, file))
    # Turn the stringified summary mapping into plain text.
    df['summary'] = df['summary'].apply(collapse_json)

    # JSON record with the summary text and heart rate of the current row; the
    # "fut_" column holds the next row's record (NaN on the last row).
    df['summary_heart_rate'] = df.apply(lambda x: json.dumps({"summary": x['summary'], "Heart_Rate": x['Heart_Rate']}), axis=1)
    df['fut_summary_heart_rate'] = df['summary_heart_rate'].shift(-1)

    # Text-only variant. 'summary' is overwritten with its JSON form, so this must
    # run after the mixed record above has been built from the plain text.
    df['summary'] = df.apply(lambda x: json.dumps({"summary": x['summary']}), axis=1)
    df['fut_summary'] = df['summary'].shift(-1)

    df = df[['fut_summary_heart_rate', 'summary_heart_rate', 'fut_summary', 'summary']]

    df.to_csv(os.path.join(output_dir, file), index=False)


# Split each file into train, test, and validation sets

In [9]:
dir = '../Dataset/medical'
split_prefixes = ('train', 'test', 'val')

# Only files that do not already carry a split prefix are handed to split_data.
for file in os.listdir(dir):
    if not file.startswith(split_prefixes):
        split_data(os.path.join(dir, file))

# text + number => text + number


In [10]:
# Directory listed by the driver loop and read by create_mixed_mixed (via the global `dir`).
dir = '../Dataset/medical'
# Window sizes to build; the loop below pushes one dataset per entry.
window_sizes = [1, 2, 3]

def create_mixed_mixed(filename, window_size, unit):
    """Build the "text + number => text + number" prompt frame for one split file.

    Only files whose name starts with ``train``, ``test`` or ``val`` are
    processed; for any other name nothing is read and ``None`` is returned.

    Args:
        filename: Name of a CSV file inside the module-level ``dir`` directory.
        window_size: Number of history units mentioned in the instruction and
            forwarded to ``combine_window``.
        unit: Time-unit label (e.g. ``"day"``) used in the instruction text, in
            the example-output key, and forwarded to ``combine_window``.

    Returns:
        The result of ``combine_window`` for the prepared frame, or ``None``
        when ``filename`` is not a train/test/val file.
    """
    if not filename.startswith(('train', 'test', 'val')):
        return None

    summaries = pd.read_csv(os.path.join(dir, filename))

    # Pluralise the unit for windows longer than one.
    window = f"{window_size} {unit}" if window_size == 1 else f"{window_size} {unit}s"

    # NOTE(review): the example uses the key "heart rate", while the records stored
    # in the CSV are built with the key "Heart_Rate" — confirm which one is intended.
    example_output = json.dumps(
        {f"{unit}_{1 + window_size}": {"summary": "...", "heart rate": "..."}}
    )
    # `unit` is used instead of a hard-coded "day" so the prompt matches the
    # caller's unit; the text is unchanged for unit == "day".
    instruction = f"Given the health summary and the heart rate of the patient for {window}, please predict the next 1 {unit}'s heart rate and health summary within a JSON. The example output is {example_output}"

    df = summaries[['summary_heart_rate', 'fut_summary_heart_rate']].rename(
        columns={'summary_heart_rate': 'input', 'fut_summary_heart_rate': 'output'}
    )
    df['instruction'] = instruction
    df['pred_output'] = "Not available"
    # Windowing is delegated to combine_window; presumably it drops the rows that
    # lack a full history or a future value — verify against utils.
    return combine_window(df, window_size, unit)

for window_size in window_sizes:
    dataframe_train, dataframe_test, dataframe_val = [], [], []
    # Filename prefix -> list collecting the frames of that split.
    split_buckets = (
        ('test', dataframe_test),
        ('train', dataframe_train),
        ('val', dataframe_val),
    )

    for filename in os.listdir(dir):
        df = create_mixed_mixed(filename, window_size, "day")
        for prefix, bucket in split_buckets:
            if filename.startswith(prefix):
                bucket.append(df)
                break

    token = os.getenv("HF_TOKEN")
    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    # Each window size is published as its own dataset repo on the Hugging Face Hub.
    dataset_dict.push_to_hub(f"Howard881010/medical-{window_size}_day-mixed-mixed", token=token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/605 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

# text => text


In [11]:
# Directory listed by the driver loop and read by create_text_text (via the global `dir`).
dir = '../Dataset/medical'
# Window sizes to build; the loop below pushes one dataset per entry.
window_sizes = [1, 2, 3]

def create_text_text(filename, window_size, unit):
    """Build the "text => text" prompt frame for one split file.

    Only files whose name starts with ``train``, ``test`` or ``val`` are
    processed; for any other name nothing is read and ``None`` is returned.

    Args:
        filename: Name of a CSV file inside the module-level ``dir`` directory.
        window_size: Number of history units mentioned in the instruction and
            forwarded to ``combine_window``.
        unit: Time-unit label (e.g. ``"day"``) used in the instruction text, in
            the example-output key, and forwarded to ``combine_window``.

    Returns:
        The result of ``combine_window`` for the prepared frame, or ``None``
        when ``filename`` is not a train/test/val file.
    """
    if not filename.startswith(('train', 'test', 'val')):
        return None

    summaries = pd.read_csv(os.path.join(dir, filename))

    # Pluralise the unit for windows longer than one.
    window = f"{window_size} {unit}" if window_size == 1 else f"{window_size} {unit}s"

    example_output = json.dumps({f"{unit}_{1 + window_size}": {"summary": "..."}})
    # `unit` is used instead of a hard-coded "day" so the prompt matches the
    # caller's unit; the text is unchanged for unit == "day".
    instruction = f"Given the health summary of the patient for {window}, please predict the next 1 {unit}'s health summary within a JSON. The example output is {example_output}"

    df = summaries[['summary', 'fut_summary']].rename(
        columns={'summary': 'input', 'fut_summary': 'output'}
    )
    df['instruction'] = instruction
    df['pred_output'] = "Not available"
    # Windowing is delegated to combine_window; presumably it drops the rows that
    # lack a full history or a future value — verify against utils.
    return combine_window(df, window_size, unit)

for window_size in window_sizes:
    dataframe_train, dataframe_test, dataframe_val = [], [], []
    # Filename prefix -> list collecting the frames of that split.
    split_buckets = (
        ('test', dataframe_test),
        ('train', dataframe_train),
        ('val', dataframe_val),
    )

    for filename in os.listdir(dir):
        df = create_text_text(filename, window_size, "day")
        for prefix, bucket in split_buckets:
            if filename.startswith(prefix):
                bucket.append(df)
                break

    token = os.getenv("HF_TOKEN")
    dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
    # Each window size is published as its own dataset repo on the Hugging Face Hub.
    dataset_dict.push_to_hub(f"Howard881010/medical-{window_size}_day-text-text", token=token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/605 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [12]:
dir = '../Dataset/medical'
window_size = 3
dataframe_train, dataframe_test, dataframe_val = [], [], []

# For every file and every window size from 1 to window_size, collect both the
# text-only and the mixed frame into the split matching the filename prefix.
for filename in os.listdir(dir):
    for size in range(1, window_size + 1):
        df1 = create_text_text(filename, size, "day")
        df2 = create_mixed_mixed(filename, size, "day")
        if filename.startswith('test'):
            dataframe_test.extend([df1, df2])
        elif filename.startswith('train'):
            dataframe_train.extend([df1, df2])
        elif filename.startswith('val'):
            dataframe_val.extend([df1, df2])

token = os.getenv("HF_TOKEN")
dataset_dict = convert_to_parquet(dataframe_test, dataframe_train, dataframe_val)
# Push the combined dataset to the Hugging Face Hub; kept as the last expression
# so the cell displays the returned commit info.
dataset_dict.push_to_hub(f"Howard881010/medical", token=token)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/655 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Howard881010/medical/commit/ae39cc8efa2ae59326446499180606d5e59ee684', commit_message='Upload dataset', commit_description='', oid='ae39cc8efa2ae59326446499180606d5e59ee684', pr_url=None, pr_revision=None, pr_num=None)