In [11]:
import os

import numpy as np
import pandas as pd

In [12]:
def reformat_data(n, r, folder_name="../kde_ebm/datasets/data/synthetic", desired_order=None):
    """Convert a long-format synthetic dataset into a wide, header-prefixed CSV.

    Reads ``{folder_name}/{int(n*r)}|{n}.csv``, which must contain the columns
    ``participant``, ``biomarker``, ``measurement`` and ``diseased``. The data
    are pivoted to one row per participant with one column per biomarker,
    the ``diseased`` flag is appended as an integer in the last column, and
    the result is written to ``{folder_name}/{int(n*r)}|{n}_reformatted.csv``
    without column names or index. The first line of the output is
    ``n,<number of biomarker columns>,CN,AD``.

    Parameters
    ----------
    n : int
        Written verbatim as the first field of the header line and used as
        the second part of the input file name.
        (Presumably the participant count -- it is not checked against the data.)
    r : float
        Fraction of ``n``; ``int(n * r)`` is the first part of the file name.
    folder_name : str, optional
        Directory holding the input file and receiving the output file.
    desired_order : list of str, optional
        Biomarker names in the column order to write. Defaults to the five
        synthetic biomarkers. Biomarkers in the data that are not listed
        here are not written.

    Raises
    ------
    FileNotFoundError
        If the input CSV does not exist.
    ValueError
        Raised by ``DataFrame.pivot`` when a (participant, biomarker) pair
        occurs more than once.
    """
    if desired_order is None:
        desired_order = ['HIP-FCI (1)', 'PCC-FCI (2)', 'HIP-GMI (3)', 'FUS-GMI (4)', 'FUS-FCI (5)']

    comb_str = f"{int(n*r)}|{n}"
    df = pd.read_csv(f"{folder_name}/{comb_str}.csv")

    # participant -> diseased flag; if a participant appears with several
    # values, the last row wins (dict construction semantics).
    diseased_dic = dict(zip(df.participant, df.diseased))

    # values=[...] yields two-level columns: ('measurement', <biomarker>).
    dff = df.pivot(index=['participant'], columns='biomarker', values=['measurement'])
    # level=1 targets the biomarker level of those two-level columns.
    dff = dff.reindex(columns=desired_order, level=1)

    # Count the columns that are actually written so the header line always
    # agrees with the body, even if the data holds biomarkers that are not
    # part of desired_order.
    n_biomarkers = dff.shape[1]

    dff['diseased'] = [int(diseased_dic[x]) for x in dff.index]

    new_file_dir = f"{folder_name}/{comb_str}_reformatted.csv"
    # Write the header line and the table in one pass. newline="" is what
    # pandas expects for text handles; it terminates rows with os.linesep,
    # so the header line uses the same terminator.
    with open(new_file_dir, "w", newline="", encoding="utf-8") as file:
        file.write(f"{n},{n_biomarkers},CN,AD{os.linesep}")
        dff.to_csv(file, index=False, header=False)

In [13]:
def reformat_chen_data(n, folder_name="../kde_ebm/datasets/data/chen_data", desired_order=None):
    """Convert a long-format Chen dataset into a wide, header-prefixed CSV.

    Reads the columns ``participant``, ``biomarker``, ``measurement`` and
    ``diseased`` from ``{folder_name}/{int(n)}.csv`` (any other columns are
    ignored). The data are pivoted to one row per participant with one
    column per biomarker, the ``diseased`` flag is appended as an integer in
    the last column, and the result is written to
    ``{folder_name}/{int(n)}_reformatted.csv`` without column names or index.
    The first line of the output is ``n,<number of biomarker columns>,CN,AD``.

    Parameters
    ----------
    n : int
        Written verbatim as the first field of the header line and used
        (as ``int(n)``) for the input file name.
        (Presumably the participant count -- it is not checked against the data.)
    folder_name : str, optional
        Directory holding the input file and receiving the output file.
    desired_order : list of str, optional
        Biomarker names in the column order to write. Defaults to the five
        Chen biomarkers. Biomarkers in the data that are not listed here
        are not written.

    Raises
    ------
    FileNotFoundError
        If the input CSV does not exist.
    ValueError
        Raised by ``read_csv`` when a required column is missing, or by
        ``DataFrame.pivot`` when a (participant, biomarker) pair occurs
        more than once.
    """
    if desired_order is None:
        desired_order = ['FCI(HIP)-1', 'FCI(PCC)-2', 'GMI(HIP)-3', 'GMI(FUS)-4', 'FCI(Fusi)-5']

    cols_to_load = ["participant", "biomarker", "measurement", 'diseased']
    comb_str = f"{int(n)}"
    df = pd.read_csv(f"{folder_name}/{comb_str}.csv", usecols=cols_to_load)

    # participant -> diseased flag; if a participant appears with several
    # values, the last row wins (dict construction semantics).
    diseased_dic = dict(zip(df.participant, df.diseased))

    # values=[...] yields two-level columns: ('measurement', <biomarker>).
    dff = df.pivot(index=['participant'], columns='biomarker', values=['measurement'])
    # level=1 targets the biomarker level of those two-level columns.
    dff = dff.reindex(columns=desired_order, level=1)

    # Count the columns that are actually written so the header line always
    # agrees with the body, even if the data holds biomarkers that are not
    # part of desired_order.
    n_biomarkers = dff.shape[1]

    dff['diseased'] = [int(diseased_dic[x]) for x in dff.index]

    new_file_dir = f"{folder_name}/{comb_str}_reformatted.csv"
    # Write the header line and the table in one pass. newline="" is what
    # pandas expects for text handles; it terminates rows with os.linesep,
    # so the header line uses the same terminator.
    with open(new_file_dir, "w", newline="", encoding="utf-8") as file:
        file.write(f"{n},{n_biomarkers},CN,AD{os.linesep}")
        dff.to_csv(file, index=False, header=False)

In [14]:
# Reformat every synthetic dataset: one input file per (n, r) combination,
# where n selects the file's size label and r the fraction used in its name.
ns = [50, 200, 500]
rs = [0.1, 0.25, 0.5]
for n, r in ((size, ratio) for size in ns for ratio in rs):
    reformat_data(n, r)

In [15]:
# Reformat the Chen datasets stored as 144.csv and 500.csv.
chen_ns = [144, 500]
for n in chen_ns:
    reformat_chen_data(n)