In [21]:
import bz2
import cbor
import json
import os
import pathlib

In [22]:
import numpy as np
import pandas as pd
import torch

In [23]:
from tqdm import tqdm

In [24]:
import torchcde

In [28]:
def _observation_matrix(time_index, time_index_df, records):
    """Align one observation's records to the shared time grid.

    Parameters
    ----------
    time_index : list
        The dataset-wide time grid (``covid19_data['time_index']``).
    time_index_df : pandas.DataFrame
        Single-column frame (``'time_index'``) built from ``time_index``;
        passed in so it is constructed only once per file.
    records : list
        Rows for one observation; column 0 is used as the merge key against
        the time grid, the remaining columns are kept as features.

    Returns
    -------
    numpy.ndarray
        Columns are, in order: the time grid, the feature columns, and one
        cumulative count of non-NaN values per feature column.
    """
    x = pd.DataFrame(records).fillna(value=np.nan)
    # NOTE(review): merge_ordered defaults to an outer join, so record times
    # absent from time_index would add rows - presumably every record time is
    # on the grid; verify against the data producer.
    x = pd.merge_ordered(time_index_df, x, left_on='time_index', right_on=0, fill_method=None)
    x = x.drop(['time_index', 0], axis=1)
    # Convert to float explicitly: an all-null feature column is object dtype,
    # and the implicit object->float downcast that fillna used to perform is
    # deprecated in pandas 2.2. torch cannot consume object arrays.
    x = x.to_numpy(dtype=np.float64)
    # Running count of observed (non-NaN) values per feature, top to bottom.
    x_mask = np.cumsum(~np.isnan(x), axis=0)
    return pd.concat([pd.DataFrame(time_index), pd.DataFrame(x), pd.DataFrame(x_mask)], axis=1).to_numpy()


def processfile(datafile, outdir, interp = "cubic"):
    """Convert a bz2-compressed JSON dataset into per-patient coefficient files.

    Reads ``datafile`` (bz2-compressed JSON with the keys ``'time_index'``,
    ``'info'``, ``'outcome'`` and ``'data'``), builds one feature matrix per
    observation, computes torchcde interpolation coefficients per patient and
    writes them to ``<outdir>/<patient_id>.json``. An ``index.json`` mapping
    each written patient id to its list of ``[duration, event]`` pairs is
    written to ``outdir`` as well.

    Parameters
    ----------
    datafile : str or path-like
        Path of the ``.json.bz2`` input file.
    outdir : str or path-like
        Output directory; created (with parents) if it does not exist.
    interp : str, optional
        ``"linear"`` selects ``torchcde.linear_interpolation_coeffs``; any
        other value (default ``"cubic"``) selects
        ``torchcde.natural_cubic_coeffs``.

    Returns
    -------
    None
        Results are written to disk only.
    """
    outdir = pathlib.Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    with bz2.open(datafile, 'rt', encoding="utf-8") as f:
        covid19_data = json.load(f)

    index_data = dict()

    time_index = covid19_data['time_index']
    time_index_df = pd.DataFrame({'time_index': time_index})

    for patient_id in tqdm(sorted(covid19_data['info'].keys())):
        x_array = []
        y_array = []

        for observation_id in sorted(covid19_data['info'][patient_id].keys()):
            outcome = covid19_data['outcome'][observation_id]
            duration = outcome['time']
            event = outcome['outcome']
            # NOTE(review): `break` (not `continue`) drops every later
            # observation of this patient once a zero duration is seen -
            # confirm this is intended.
            if duration == 0:
                break
            y_array.append([duration, event])
            x_array.append(
                _observation_matrix(time_index, time_index_df, covid19_data['data'][observation_id])
            )

        # Patients without any usable observation produce no file and no
        # index entry.
        if len(y_array) != 0:
            # Stack in numpy first: building a tensor straight from a list of
            # ndarrays is the slow path PyTorch warns about. float32 matches
            # what torch.Tensor(...) produced.
            x_tensor = torch.as_tensor(np.stack(x_array), dtype=torch.float32)
            y_tensor = torch.Tensor(y_array)

            if interp == "linear":
                coeffs = torchcde.linear_interpolation_coeffs(x_tensor)
            else:
                coeffs = torchcde.natural_cubic_coeffs(x_tensor)

            index_data[patient_id] = y_tensor.numpy().tolist()
            with open(outdir / (patient_id + ".json"), 'wt', encoding="utf-8") as f:
                json.dump(coeffs.numpy().tolist(), f)

    with open(outdir / "index.json", 'wt', encoding="utf-8") as f:
        json.dump(index_data, f)

In [26]:
# Batch driver: calls processfile once per combination of i, j and
# interpolation kind, announcing each run first.
# NOTE(review): the arguments passed here are (int, int, str), which does not
# fit the processfile(datafile, outdir, interp) signature defined in this
# notebook. This cell was executed as In [26], before the definition cell
# In [28], so it presumably targets an earlier version of processfile -
# confirm and update or remove before relying on Restart & Run All.
for i in [30]:
    for j in [12]:
        for k in ["cubic", "linear"]:
            print(f"Processing file for {i} patients and {j} features with {k} interpolation\n")
            processfile(i, j, k)

  0%|          | 0/30 [00:00<?, ?it/s]

Processing file for 30 patients and 12 features with cubic interpolation



100%|██████████| 30/30 [00:19<00:00,  1.56it/s]


Processing file for 30 patients and 12 features with linear interpolation



100%|██████████| 30/30 [00:03<00:00,  8.95it/s]


In [29]:
# Process the "simple-0.25-0.05-0.85" dataset.
# `interp` is defined here so the cell runs on a fresh kernel, and it is used
# both for the output directory suffix and for the processfile call so the
# two cannot disagree.
interp = "cubic"
datafile = "../data/simple-0.25-0.05-0.85" ".json.bz2"
outdir = "../data/simple-0.25-0.05-0.85-" + interp
processfile(datafile, outdir, interp)

100%|██████████| 995/995 [44:23<00:00,  2.68s/it]  
