In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import json

### Load MedQA

In [2]:
# Base data directory; the DATA_DIR environment variable overrides the default.
data_dir = os.environ.get('DATA_DIR', 'data46')

# Bare last expression so the notebook displays the directory contents.
os.listdir(data_dir)

['models', 'raw']

In [3]:

def processes_and_safe(input_path, output_path):
    """Convert a MedQA-style JSONL file into instruction-tuning JSONL.

    Each non-blank input line is parsed as a JSON object from which the
    "question", "options" (mapping of option key -> option text),
    "answer_idx" and "answer" fields are read. Every complete record becomes
    one output line with the keys "instruction", "input" (always an empty
    string) and "output".

    A record is skipped when any of the four fields is missing or empty, when
    the line is not a JSON object, when "options" is not a mapping, or when
    "answer_idx" is not one of the option keys.

    Args:
        input_path: Path of the source JSONL file (read as UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).

    Returns:
        None. The converted records are written to ``output_path``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    converted = []

    # Read everything first so the output file is only created once the whole
    # input has been parsed successfully.
    with open(input_path, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.strip():
                continue  # blank / trailing lines would make json.loads raise

            item = json.loads(line)
            if not isinstance(item, dict):
                continue  # only JSON objects carry the expected fields

            question = _clean_text(item.get("question"))
            options = item.get("options")
            answer_idx = _clean_text(item.get("answer_idx"))
            answer_text = _clean_text(item.get("answer"))

            if not question or not answer_idx or not answer_text:
                continue  # skip incomplete entries
            if not isinstance(options, dict) or answer_idx not in options:
                # Covers missing/empty options as well as an answer key that
                # does not match any option (would give an empty answer label).
                continue

            # Convert options dict into A: xxx\nB: xxx...
            formatted_options = "\n".join(f"{k}: {v}" for k, v in options.items())

            # Format instruction
            prompt = (
                f"你是一位專業的醫療諮詢助理。請根據下列問題及選項，用口語化的方式簡單回覆正確答案並說明理由。\n"
                f"Q: {question}\n{formatted_options}\n請選出正確答案並說明理由。"
            )

            # Text of the option selected by answer_idx, followed by the answer text.
            # NOTE(review): if "answer" already holds the same text as
            # options[answer_idx], the output repeats it - confirm this is intended.
            answer_label = _clean_text(options[answer_idx])
            # <END> is appended to keep the model from repeating the answer.
            full_output = f"A: {answer_label}。{answer_text} <END>"

            converted.append({
                "instruction": prompt,
                "input": "",
                "output": full_output,
            })

    with open(output_path, "w", encoding="utf-8") as outfile:
        outfile.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in converted
        )


def _clean_text(value):
    """Return ``value`` as a stripped string; ``None`` becomes an empty string."""
    if value is None:
        return ""
    return str(value).strip()


In [4]:
# Convert every MedQA split with the same routine; the formatted files are
# written to the current working directory.
for split in ('train', 'test', 'dev'):
    input_path = os.path.join(data_dir, f'raw/medQA/{split}.jsonl')
    output_path = f'{split}_formatted.jsonl'
    processes_and_safe(input_path, output_path)

### Load Hokkien corpus from Hugging Face

In [5]:
from datasets import load_dataset, concatenate_datasets, get_dataset_config_names

# Hugging Face Hub repository ids of the two Hokkien datasets.
ICORPUS_REPO = "BohanLu/ICorpus-100"
TAIDE_REPO = "BohanLu/TAIDE-14-tasks-Hokkien"

# Load both datasets; each result is indexed by split name further below.
icorpus = load_dataset(ICORPUS_REPO)
taide = load_dataset(TAIDE_REPO)

README.md:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

(…)-00000-of-00001-080cdbb3423d2e7d.parquet:   0%|          | 0.00/75.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/37.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/140 [00:00<?, ? examples/s]

In [6]:
from datasets import get_dataset_config_names

# List the configurations published for the TAIDE dataset, one per line.
all_configs = get_dataset_config_names("BohanLu/TAIDE-14-tasks-Hokkien")
print("Available TAIDE-14 configs:", *[f"- {name}" for name in all_configs], sep="\n")

# (label, loaded dataset, split to sample from) for both corpora.
sources = (
    ("ICorpus-100", icorpus, "test"),
    ("TAIDE-14-tasks-Hokkien", taide, "train"),
)

# Show which splits each dataset provides.
for label, dataset, _ in sources:
    print(f"{label} splits:", dataset.keys())

# Show the first record of each dataset to inspect its fields.
for label, dataset, split_name in sources:
    print(f"\n--- {label} sample ---")
    print(dataset[split_name][0])

Available TAIDE-14 configs:
- default
ICorpus-100 splits: dict_keys(['test'])
TAIDE-14-tasks-Hokkien splits: dict_keys(['train'])

--- ICorpus-100 sample ---
{'ID': 0, 'ZH': '還是在打麻將？', 'HAN': '猶是佇拍麻雀？', 'TL': 'iáu-sī-tī phah-muâ-tshiok？', 'EN': 'Are you still playing mahjong?', 'POJ': 'iáu-sī-tī phah-môa-chhiok？'}

--- TAIDE-14-tasks-Hokkien sample ---
{'Topic': '生物學和生物技術', 'Task': '分類', 'Keywords': '有什麼風險？', 'Prompt': '共下面的生物科技的應用分做三類：低風險、中風險佮高風險，閣簡單解說一下為啥物欲按呢分類？\n基因編輯、生物染料、基因療法、基因工程作物、細胞再生、複製技術、人類胚胎研究、生物能源。'}


In [7]:
def format_icorpus(example):
    """Build the chat-template "text" field for one ICorpus record.

    The record's 'HAN' value is placed in both the user turn and the
    assistant turn, followed by the end-of-text marker.

    NOTE(review): the assistant turn echoes the user turn verbatim - confirm
    this is intended.

    Args:
        example: Mapping that contains a 'HAN' entry.

    Returns:
        A dict with the single key "text".
    """
    template = "<|user|>\n{0}\n<|assistant|>\n{0}<|endoftext|>"
    return {"text": template.format(example['HAN'])}

def format_taide(example):
    """Build the chat-template "text" field for one TAIDE record.

    The record's 'Prompt' value becomes the user turn; the assistant turn is
    left empty and is immediately followed by the end-of-text marker.

    NOTE(review): no assistant response is included - confirm this is
    intended.

    Args:
        example: Mapping that contains a 'Prompt' entry.

    Returns:
        A dict with the single key "text".
    """
    template = "<|user|>\n{}\n<|assistant|>\n<|endoftext|>"
    return {"text": template.format(example['Prompt'])}


# Run each formatter over the one split its dataset provides
# (TAIDE: "train", ICorpus: "test").
taide_train = taide["train"]
formatted_taide = taide_train.map(format_taide)

icorpus_test = icorpus["test"]
formatted_icorpus = icorpus_test.map(format_icorpus)

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
# Persist the formatted datasets for stage 2, using the same export options
# (record-oriented JSON Lines, non-ASCII characters kept as-is) for both files.
jsonl_options = dict(orient="records", lines=True, force_ascii=False)

out_path = "hokkien_pretrain_train.jsonl"
formatted_taide.to_json(out_path, **jsonl_options)

out_path = "hokkien_pretrain_test.jsonl"
# Left as the cell's last expression so its return value is displayed.
formatted_icorpus.to_json(out_path, **jsonl_options)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

130847

In [12]:
# Files produced by the cells above that should be copied to object storage.
files_to_upload = [
    "train_formatted.jsonl",
    "test_formatted.jsonl",
    "dev_formatted.jsonl",
    "hokkien_pretrain_train.jsonl",
    "hokkien_pretrain_test.jsonl",
]

# Container name: RCLONE_CONTAINER overrides the project default, so the
# error branch below really corresponds to the environment variable.
rclone_container = os.getenv("RCLONE_CONTAINER", "object-persist-project46")

if not rclone_container:
    print("ERROR: RCLONE_CONTAINER is set but empty; cannot determine the upload target.")
else:
    # Same destination for every file, and the directory that is listed afterwards.
    remote_destination_path = f"chi_tacc:{rclone_container}/processed"
    failed_uploads = []

    for filename in files_to_upload:
        # The files are written with relative names, so resolve them against
        # the current working directory instead of a hard-coded home folder.
        local_file_path = os.path.abspath(filename)

        if not os.path.exists(local_file_path):
            print(f"File not found: {local_file_path}")
            failed_uploads.append(filename)
            continue

        print(f"Uploading {local_file_path} to {remote_destination_path}...")
        get_ipython().system(f'rclone copy "{local_file_path}" "{remote_destination_path}" --progress')

        # IPython stores the exit status of the last system() call in the user
        # namespace under "_exit_code"; a non-zero value means rclone failed.
        exit_code = get_ipython().user_ns.get("_exit_code")
        if exit_code:
            print(f"Upload of {filename} FAILED (rclone exit code {exit_code}).")
            failed_uploads.append(filename)
        else:
            print(f"Upload command for {filename} executed.")

    if files_to_upload:
        # List the directory that was just written to, so the listing
        # actually verifies the upload.
        check_remote_path = remote_destination_path
        print(f"\nChecking remote directory: {check_remote_path}")
        get_ipython().system(f'rclone ls "{check_remote_path}"')

    if failed_uploads:
        print(f"\nWARNING: not uploaded: {failed_uploads}. Do not delete the local copies yet.")

Uploading /home/jovyan/work/train_formatted.jsonl to chi_tacc:object-persist-project46/processed...
[2K[1GTransferred:   	          0 B / 6.950 MiB, 0%, 0 B/s, ETA -
Transferred:            0 / 1, 0%
Elapsed time:         0.3s
Transferring:
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1GTransferred:   	          0 B / 6.950 MiB, 0%, 0 B/s, ETA -
Transferred:            0 / 1, 0%
Elapsed time:         0.8s
Transferring:
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1GTransferred:   	    6.950 MiB / 6.950 MiB, 100%, 0 B/s, ETA -
Transferred:            1 / 1, 100%
Elapsed time:         1.1s
Upload command for train_formatted.jsonl executed.
Uploading /home/jovyan/work/test_formatted.jsonl to chi_tacc:object-persist-project46/processed...
[2K[1GTransferred:   	          0 B / 901.610 KiB, 0%, 0 B/s, ETA -
Transferred:            0 / 1, 0%
Elapsed time:         0.3s
Transferring:
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1GTransferred:   	          0 B / 901.610 KiB, 0%, 0 B/s, ETA -
Transferred:        

In [13]:
# Remove the local copies of the files listed in files_to_upload.
# NOTE(review): this deletes the files whether or not the upload succeeded -
# check the upload output before running this cell.
for file_path in files_to_upload:
    try:
        # Attempt the removal directly instead of checking existence first,
        # so there is no gap between the check and the delete.
        os.remove(file_path)
    except FileNotFoundError:
        print(f"'{file_path}' does not exist.")
    except OSError as e:
        # Include the path so the failing file can be identified.
        print(f"Could not remove '{file_path}': {e.strerror}")
    else:
        print(f"'{file_path}' removed.")

'train_formatted.jsonl removed.
'test_formatted.jsonl removed.
'dev_formatted.jsonl removed.
'hokkien_pretrain_train.jsonl removed.
'hokkien_pretrain_test.jsonl removed.
