In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read the example patient table; later cells filter it on the target column
# and turn each remaining row into a text prompt.
data_final_cl = pd.read_csv(
    "new_mtx/data_add_2ydas_jhaq_scdai_example.csv",
    sep=",",
)

In [4]:
# Medication columns -> wording used for that medication in the generated text.
# Insertion order matters: it is the order in which drugs are listed in a prompt.
drug_dict = {
    'nsaid': 'NSAIDS',
    'gi': 'stomach medicine',
    'osteo': 'osteoporosis medicine',
    'fa': 'folic acid',
    'mtxdose': 'MTX dose',  # dose column; its value is printed in mg next to 'mtx'
    'mtx': 'methotrexate',
    'buc': 'bucillamine',
    'ssz': 'sulfasalazine',
    'tac': 'tacrolimus',
}

# Medical-history columns -> wording used for that condition in the generated text.
medical_history_dict = {
    'Angina pectoris, myocardial infarction':
        'Angina pectoris or myocardial infarction',
    'Cerebral hemorrhage, cerebral infarction, subarachnoid hemorrhage':
        'Cerebral hemorrhage or cerebral infarction or subarachnoid hemorrhage',
    'high blood pressure': 'high blood pressure',
    'heart failure': 'heart failure',
    'interstitial pneumonia': 'interstitial pneumonia',
}

# Target columns to generate prompts for; one prompt per patient row per target.
obj = ["cdai_2y"]


def _build_patient_text(row, obj2y):
    """Render one patient row as the natural-language prompt for target `obj2y`.

    Parameters
    ----------
    row : pandas.Series
        One row of the patient table. Reads the numeric columns formatted
        below, every key of `drug_dict` and every key of `medical_history_dict`.
    obj2y : str
        Name of the target column; a trailing '_2y' is stripped to name the
        score in the closing question.

    Returns
    -------
    str
        The prompt text: demographics, disease activity, labs, oral
        medications (if any), medical history, then the question.
    """
    # gender == 0 is rendered as male; any other value is rendered as female.
    sex = 'male' if row['gender'] == 0 else 'female'
    parts = [
        f"{row['age']:.2f}-year-old, {sex}, rheumatoid arthritis patient with disease duration of {row['duration']:.2f} years.",
        f" Current disease activity is {row['das28']:.2f} for das28esr, {row['das28crp']:.2f} for das28crp, {row['cdai']:.2f} for cdai.",
        f" Current quality of life indicators is jhaq of {row['jhaq']:.2f}.",
        f" pvas (patient visual analog scale) is {row['pvas']:.2f}.",
        # Terminate the sentence so the next one does not run on from "IU/mL".
        f" laboratory tests include crp of {row['crp']:.2f} mg/dl, esr {row['esr']:.2f} mm, rf (Rheumatoid factor) {row['rf']:.2f} IU/mL.",
    ]

    # Oral medications: a column counts as "taken" when its value is exactly 1.
    drugs = []
    for col, drug_name in drug_dict.items():
        # 'mtxdose' holds a dose, not a yes/no flag; it is only reported
        # together with 'mtx', so a dose of 1 must not be listed as a drug.
        if col == 'mtxdose':
            continue
        if pd.isnull(row[col]) or row[col] != 1:
            continue
        if col == 'mtx':
            if pd.notnull(row['mtxdose']):
                drugs.append(f"{drug_name} {row['mtxdose']:.2f} mg")
            else:
                drugs.append(f"{drug_name} (dose unknown)")
        else:
            drugs.append(drug_name)
    if drugs:
        # Gender-neutral wording: the prompt may describe a female patient.
        parts.append(" The patient's oral medications are " + ", ".join(drugs) + ".")

    # Medical history is reported for every patient, whether or not any
    # medication was listed. Codes 2 and 3 count as a positive history
    # (presumably 1 = absent -- confirm against the data dictionary).
    histories = [
        phrase
        for col, phrase in medical_history_dict.items()
        if pd.notnull(row[col]) and (row[col] == 2 or row[col] == 3)
    ]
    if histories:
        parts.append(" The patient has a history of " + ", ".join(histories) + ".")
    else:
        parts.append(" The patient has no medical history.")

    # Leading space keeps the question separate from the preceding sentence.
    parts.append(f" What is the probability that this patient's {obj2y.replace('_2y', '')} will be remission at 2 years?")
    return "".join(parts)


# Generated prompts and their target values, aligned position by position
# with the rows of `data_all`.
texts = []
label = []
frames = []
for obj2y in obj:
    # Keep only rows where the target is known. `.assign` returns a new frame,
    # so tagging it with the target name never writes into a slice of the source.
    data_final = data_final_cl[data_final_cl[obj2y].notna()].assign(obj=obj2y)
    frames.append(data_final)

    label.extend(data_final[obj2y])
    texts.extend(_build_patient_text(row, obj2y) for _, row in data_final.iterrows())

# One concat at the end; the fresh 0..n-1 index lines up with `texts` / `label`.
data_all = pd.concat(frames, axis=0).reset_index(drop=True)
data_all.head()

Unnamed: 0,gender,age,duration,das28,das28crp,cdai,jhaq,pvas,crp,esr,...,tac,cdai_2y,"Angina pectoris, myocardial infarction","Cerebral hemorrhage, cerebral infarction, subarachnoid hemorrhage",high blood pressure,heart failure,interstitial pneumonia,cdai_2y_h_bin,cdai_2y_l_bin,obj
0,0,57,16,5.2,5.0,22.1,0.4,34,2.0,22.0,...,0,7.9,1,1,1,1,1,0,0,cdai_2y
1,1,44,9,2.2,2.4,6.7,0.0,24,0.0,3.3,...,0,12.4,1,1,1,1,1,0,0,cdai_2y
2,1,87,17,3.4,2.5,1.5,1.3,4,1.4,59.0,...,0,4.6,1,1,1,1,1,0,0,cdai_2y
3,0,59,14,1.3,1.4,0.0,0.0,0,0.2,6.0,...,0,0.0,1,1,1,1,1,0,1,cdai_2y
4,1,74,11,3.0,1.7,0.0,0.3,0,0.8,74.0,...,0,3.5,1,1,1,1,1,0,0,cdai_2y


In [5]:
# Seed NumPy's global generator so the shuffled order is the same on every run.
np.random.seed(0)

# Positions 0..len(texts)-1, shuffled in place.
n_samples = len(texts)
indices = np.arange(n_samples)
np.random.shuffle(indices)

# This sample script splits data at a ratio of 8:2:
# the first 80% of the shuffled positions train, the remainder test.
split_idx = int(0.8 * n_samples)
train_indices, test_indices = indices[:split_idx], indices[split_idx:]

def create_dataset(indices):
    """Build the list of JSON-ready records for the given sample positions.

    Reads the module-level `texts`, `label` and `data_all`. For each position
    in `indices` (in the order given) the record holds the task name from
    `data_all["obj"]`, the prompt text, the label converted to a string, and
    the position itself as a plain int.
    """
    tasks = data_all["obj"]
    return [
        {
            "task": tasks[pos],
            "input": texts[pos],
            "output": str(label[pos]),
            "index": int(pos),
        }
        for pos in indices
    ]

# Materialise both splits as lists of records.
train_data = create_dataset(train_indices)
test_data = create_dataset(test_indices)

# Write each split to its own JSON file; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \u escapes.
for out_path, records in (
    ('new_mtx/Training_sample.json', train_data),
    ('new_mtx/Test_sample.json', test_data),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False)

5
[2 0 1 3]
