In [None]:
%load_ext autoreload
%autoreload 2

import os
import re
import shutil
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
import pyreadstat

from koafusion.datasets.oai import (prefix_var_to_visit_month,
                                    release_to_visit_month,
                                    side_code_to_str)

In [None]:
# TODO: set to project root directory, either by exporting the
# KOAFUSION_PROJECT_ROOT environment variable before starting the kernel or by
# replacing the fallback (current working directory) below.
DIR_PROJECT_ROOT = Path(os.environ.get("KOAFUSION_PROJECT_ROOT", Path.cwd()))
# All inputs and outputs of this notebook live under <project root>/data
DIR_DATA_ROOT = Path(DIR_PROJECT_ROOT, "data")

## 1. Read the meta info

In [None]:
def read_compose_asmts(fpaths, capitalize=False, verbose=False):
    """Read assessment tables and stack them into a single data frame.

    Every file is read on its own. The variable prefix of the file (``V``
    followed by two digits, e.g. ``V00``) is taken from the first column whose
    name matches that pattern, stripped from every column name that starts
    with it, and stored in a new ``PREFIX_VAR`` column. Files without such a
    column get the placeholder prefix ``VXX`` and keep their column names.

    Args:
        fpaths: Iterable of paths (``str`` or ``Path``) to pipe-separated
            ``.csv``/``.txt`` files or to ``.sas7bdat`` files.
        capitalize: If True, upper-case all column names before the prefix
            is detected.
        verbose: If True, print the input paths and the row counts.

    Returns:
        pandas.DataFrame: rows of all files concatenated along axis 0. The
        original per-file index is kept, so index values may repeat.

    Raises:
        ValueError: If `fpaths` is empty or a file has an unsupported
            extension.
    """
    # Materialize once: callers may pass a generator (e.g. `Path.glob`), and
    # plain strings must gain `.suffix`.
    fpaths = [Path(p) for p in fpaths]
    if verbose:
        print(fpaths)
    if not fpaths:
        raise ValueError("No assessment files to read: `fpaths` is empty")

    # Raw string: "\d" in a regular string literal is an invalid escape sequence
    pattern_prefixed = re.compile(r"V\d\d.*$")
    dfs = []

    for i, fpath in enumerate(fpaths):
        if fpath.suffix in (".csv", ".txt"):
            df = pd.read_csv(fpath, sep='|', index_col=False)
        elif fpath.suffix == ".sas7bdat":
            df, _ = pyreadstat.read_sas7bdat(fpath, user_missing=True)
        else:
            raise ValueError(f"Unsupported extension: {fpath.suffix}")

        if capitalize:
            # Capitalize all columns names
            df.columns = df.columns.str.upper()

        # Find release info: the first prefixed column defines the prefix
        prefix_var = next(
            (c[:3] for c in df.columns if pattern_prefixed.match(c)), 'VXX')

        # Remove prefix from column names and add corresponding column
        df.columns = [c[3:] if c.startswith(prefix_var) else c
                      for c in df.columns]
        df.loc[:, 'PREFIX_VAR'] = prefix_var

        if verbose:
            print(f'df idx: {i} num: {len(df)}')
        dfs.append(df)

    out = pd.concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]
    if verbose:
        print(f"num total: {len(out)}")
    return out

In [None]:
def read_compose_contents(paths):
    """Load the imaging inventory CSV files and stack them into one table.

    For each file the first component of every ``Folder`` value is looked up
    in ``release_to_visit_month`` to obtain ``visit_month``; ``visit`` is that
    label without its last character, converted to ``int``. The
    ``ParticipantID`` column is renamed to ``patient``.

    Args:
        paths: Iterable of paths to comma-separated inventory files.

    Returns:
        pandas.DataFrame: all files concatenated with a fresh index.
    """
    column_types = {
        "Folder": str,
        "ParticipantID": str,
        "StudyDate": int,
        "SeriesDescription": str,
    }

    tables = []
    for path in paths:
        table = pd.read_csv(path, index_col=False, sep=",", dtype=column_types)

        # The release is the leading path component of "Folder"
        releases = [folder.split("/")[0] for folder in table["Folder"].tolist()]
        table.loc[:, "visit_month"] = [release_to_visit_month[release]
                                       for release in releases]
        table.loc[:, "visit"] = [int(label[:-1])
                                 for label in table["visit_month"].tolist()]

        tables.append(table.rename(columns={"ParticipantID": "patient"}))

    return pd.concat(tables, axis=0, ignore_index=True)


def preproc_contents(df):
    """Split ``SeriesDescription`` into ``sequence`` and ``side`` columns.

    The input frame is not modified. Every series description must be one of
    the entries listed below; an unlisted value raises ``KeyError``.

    Args:
        df: Imaging inventory with a ``SeriesDescription`` column.

    Returns:
        pandas.DataFrame: copy of `df` with the side-agnostic ``sequence``
        name and the ``side`` ("LEFT", "RIGHT" or "OTHER") of every row.
    """
    # (series description, sequence, side)
    series_table = (
        ('MP_LOCATOR_LEFT', 'MP_LOCATOR', "LEFT"),
        ('MP_LOCATOR_RIGHT', 'MP_LOCATOR', "RIGHT"),
        ('COR_IW_TSE_LEFT', 'COR_IW_TSE', "LEFT"),
        ('COR_IW_TSE_RIGHT', 'COR_IW_TSE', "RIGHT"),
        ('PA Fixed Flexion Left Knee', 'PA Fixed Flexion Knee', "LEFT"),
        ('PA Fixed Flexion Right Knee', 'PA Fixed Flexion Knee', "RIGHT"),
        ('SAG_T2_CALC_LEFT', 'SAG_T2_CALC', "LEFT"),
        ('SAG_T2_CALC_RIGHT', 'SAG_T2_CALC', "RIGHT"),
        ('SAG_3D_DESS_LEFT', 'SAG_3D_DESS', "LEFT"),
        ('SAG_3D_DESS_RIGHT', 'SAG_3D_DESS', "RIGHT"),
        ('COR_MPR_LEFT', 'COR_MPR', "LEFT"),
        ('COR_MPR_RIGHT', 'COR_MPR', "RIGHT"),
        ('AX_MPR_LEFT', 'AX_MPR', "LEFT"),
        ('AX_MPR_RIGHT', 'AX_MPR', "RIGHT"),
        ('SAG_IW_TSE_LEFT', 'SAG_IW_TSE', "LEFT"),
        ('SAG_IW_TSE_RIGHT', 'SAG_IW_TSE', "RIGHT"),
        ('COR_T1_3D_FLASH_LEFT', 'COR_T1_3D_FLASH', "LEFT"),
        ('COR_T1_3D_FLASH_RIGHT', 'COR_T1_3D_FLASH', "RIGHT"),
        ('SAG_T2_MAP_LEFT', 'SAG_T2_MAP', "LEFT"),
        ('SAG_T2_MAP_RIGHT', 'SAG_T2_MAP', "RIGHT"),
        ('Bilateral PA Fixed Flexion Knee', 'Bilateral PA Fixed Flexion Knee', "OTHER"),
        ('Full Limb', 'Full Limb', "OTHER"),
        ('MP_LOCATOR_THIGH', 'MP_LOCATOR_THIGH', "OTHER"),
        ('AX_T1_THIGH', 'AX_T1_THIGH', "OTHER"),
        ('PRESCRIPTION_THIGH', 'PRESCRIPTION_THIGH', "OTHER"),
        ('Lateral Left Knee', 'Lateral Knee', "LEFT"),
        ('Lateral Right Knee', 'Lateral Knee', "RIGHT"),
        ('AP Pelvis', 'AP Pelvis', "OTHER"),
        ('PA Left Hand', 'PA Hand', "LEFT"),
        ('PA Right Hand', 'PA Hand', "RIGHT"),
        ('PA Bilateral Hand', 'PA Bilateral Hand', "OTHER"),
        ('OTHER', 'OTHER', "OTHER"),
    )
    sequence_of = {series: sequence for series, sequence, _ in series_table}
    side_of = {series: side for series, _, side in series_table}

    # Preprocess imaging inventory for easier merging
    descriptions = df["SeriesDescription"].tolist()
    sequences = [sequence_of[series] for series in descriptions]
    sides = [side_of[series] for series in descriptions]

    return df.copy().assign(sequence=sequences, side=sides)


dir_contents = Path(DIR_DATA_ROOT, "contents")
# `Path.glob` yields files lazily and in filesystem-dependent order. Sort them
# so that the row order of the composed inventory (which the later
# `drop_duplicates(keep="last")` rescan selection relies on) is reproducible,
# and so that the list can be reused after this cell.
paths_contents = sorted(dir_contents.glob("*.csv"))

df_contents = read_compose_contents(paths_contents)
display(df_contents.head())

# Add side-agnostic `sequence` and `side` columns for merging
df_contents_proc = preproc_contents(df_contents)
display(df_contents_proc.head())

In [None]:
def read_compose_clinical(paths):
    """Read the clinical assessment files and keep the variables of interest.

    Column names are upper-cased, ``ID`` becomes the string column
    ``patient``, and ``visit_month`` / ``visit`` are derived from
    ``PREFIX_VAR`` through ``prefix_var_to_visit_month``.

    Args:
        paths: Paths of the clinical files, forwarded to `read_compose_asmts`.

    Returns:
        pandas.DataFrame: one row per input record, restricted to the
        selected columns.
    """
    table = read_compose_asmts(paths)

    table.columns = table.columns.str.upper()
    table = table.rename(columns={"ID": "patient"}).astype({"patient": str})

    prefixes = table["PREFIX_VAR"].tolist()
    table.loc[:, "visit_month"] = [prefix_var_to_visit_month[prefix]
                                   for prefix in prefixes]
    # `visit` is the visit label without its last character, as an integer
    table.loc[:, "visit"] = [int(label[:-1])
                             for label in table["visit_month"].tolist()]

    columns_to_keep = [
        "patient", "PREFIX_VAR", "visit_month", "visit",
        "AGE", "P01BMI", "LKDEFCV", "RKDEFCV",
        "P01INJR", "P01INJR1", "P01INJR2", "P01INJR3",
        "P01KSURGR", "P01KRSR", "P01KRSRA",
        "P01ARTR", "P01ARTR1", "P01ARTR2", "P01ARTR3", "P01ARTRINJ",
        "P01MENR", "P01MENR1", "P01MENR2", "P01MENR3", "P01MENRINJ",
        "P01LRR", "P01LRR1", "P01LRR2", "P01LRR3", "P01OTSURGR",
        "P01OTSR1", "P01OTSR2", "P01OTSR3", "P01OTSRINJ",
        "P01INJL", "P01INJL1", "P01INJL2", "P01INJL3",
        "P01KSURGL", "P01KRSL", "P01KRSLA",
        "P01ARTL", "P01ARTL1", "P01ARTL2", "P01ARTL3", "P01ARTLINJ",
        "P01MENL", "P01MENL1", "P01MENL2", "P01MENL3", "P01MENLINJ",
        "P01LRL", "P01LRL1", "P01LRL2", "P01LRL3",
        "P01OTSURGL", "P01OTSL1", "P01OTSL2", "P01OTSL3", "P01OTSLINJ",
        "P01RHBE", "P01LHBE", "RKPFCRE", "LKPFCRE",
        "WSRKN1", "WSRKN2", "WSLKN1", "WSLKN2",
        "KOOSYMR", "KOOSYML", "KOOSKPL", "KOOSKPR",
        "KPA30CV", "KPACDCV", "KPACT30", "KPACTCV", "KPMED", "KPMEDCV",
        "KPNL12", "KPNR12", "KPNL12M", "KPNR12M", "KPL12CV", "KPR12CV",
        "KPL30CV", "KPR30CV", "KPRK20B", "KPLK20B", "KPLK20D", "KPRK20D",
        "KPLKN1", "KPLKN2", "KPLKN3", "KPRKN1", "KPRKN2", "KPRKN3",
        "MISSWK", "PMLKRCV", "PMRKRCV", "LKSX", "RKSX",
        # WOMAC
        "WOMADLL", "WOMADLR", "WOMKPL", "WOMKPR", "WOMSTFL", "WOMSTFR", "WOMTSL", "WOMTSR",
        "WPLKN1", "WPLKN2", "WPLKN3", "WPLKN4", "WPLKN5",
        "WPRKN1", "WPRKN2", "WPRKN3", "WPRKN4", "WPRKN5",
        "P01SVLKOST", "P01SVRKOST",
        # knee replacement since last visit, right and left
        "KRSR12", "KRSL12",
    ]
    return table.loc[:, columns_to_keep]


def print_unique(df, fields):
    """Print each column name in `fields` followed by its distinct values."""
    for name in fields:
        distinct = pd.unique(df[name])
        print(name, distinct)


def preproc_clinical(df):
    """Harmonize the clinical variables and reshape them to one row per knee.

    Steps, in order:
      1. ``PMLKRCV`` / ``PMRKRCV``: missing values become -1 and the listed
         textual or numeric codes are mapped to the integers 0-10.
      2. The yes/no fields listed below: missing values become -1, the
         listed codes are mapped to 0 / 1.
      3. The WOMAC fields: blanks and missing values become -1 and the
         columns are cast to float.
      4. ``WOMKPL`` / ``WOMKPR``: positive values are multiplied by 5.
      5. Every row is duplicated, once with ``side == "LEFT"`` and once with
         ``side == "RIGHT"``; each left/right column pair is merged into one
         side-agnostic column (named with ``-`` in place of the side letter)
         and the two source columns are dropped.

    Args:
        df: Clinical table containing the columns referenced below.

    Returns:
        pandas.DataFrame: new frame with twice as many rows as `df` and a
        fresh index.
    """
    # Harmonize the values and fill the missing
    for field in ("PMLKRCV", "PMRKRCV"):
        # Same value may be stored as float, numeric string or labelled string
        dict_fix = {
            np.nan: -1, ' ': -1,
            '.: Missing Form/Incomplete Workbook': -1,
            0.0: 0, '0': 0, '0: No pain': 0,
            1.0: 1, '1': 1, '1: 1': 1,
            2.0: 2, '2': 2, '2: 2': 2,
            3.0: 3, '3': 3, '3: 3': 3,
            4.0: 4, '4': 4, '4: 4': 4,
            5.0: 5, '5': 5, '5: 5': 5,
            6.0: 6, '6': 6, '6: 6': 6,
            7.0: 7, '7': 7, '7: 7': 7,
            8.0: 8, '8': 8, '8: 8': 8,
            9.0: 9, '9': 9, '9: 9': 9,
            10.0: 10, '10': 10, '10: Pain as bad as you can imagine': 10,
        }
        df = df.fillna({field: -1}, axis=0)
        df = df.replace({field: dict_fix})

    # Yes/no fields: 1 = yes, 0 = no, -1 = missing
    for field in ("KPL30CV", "KPR30CV", "KRSL12", "KRSR12",
                  "P01INJL", "P01INJR", "P01KSURGL", "P01KSURGR",
                  "P01KRSL", "P01KRSR",
                  "P01ARTL", "P01ARTR", "P01ARTLINJ", "P01ARTRINJ",
                  "P01MENL", "P01MENR", "P01MENLINJ", "P01MENRINJ",
                  "P01LRL", "P01LRR",
                  "P01OTSURGL", "P01OTSURGR", "P01OTSLINJ", "P01OTSRINJ",
                 ):
        dict_fix = {
            '1: Yes': 1, 1.0: 1, '1': 1,
            '0: No': 0, 0.0: 0, '0': 0,
            '.: Missing Form/Incomplete Workbook': -1, ' ': -1, np.nan: -1,
        }
        df = df.fillna({field: -1}, axis=0)
        df = df.replace({field: dict_fix})

    # WOMAC scores: blanks / missing -> -1, then force a float dtype
    for field in ("WOMADLL", "WOMADLR", "WOMKPL", "WOMKPR",
                  "WOMSTFL", "WOMSTFR", "WOMTSL", "WOMTSR"):
        df = df.replace({field: {" ": -1, np.nan: -1}})
        df = df.fillna({field: -1}, axis=0)
        df = df.astype({field: float})

    # Normalize the value ranges
    for field in ("WOMKPL", "WOMKPR"):  # WOMAC pain, range [-1, 0-20] -> [-1, 0-100]
        # Only positive values are scaled, so the -1 missing marker is preserved
        df.loc[:, field] = df[field].apply(lambda x: x * 5 if x > 0 else x)

    # Melt "side"-specific columns
    # Each input row appears twice: first block LEFT, second block RIGHT
    df = pd.concat([df.assign(**{"side": "LEFT"}),
                    df.assign(**{"side": "RIGHT"})],
                    axis="index",
                    ignore_index=True)

    for f_left, f_right, f_out in [
        ("PMLKRCV", "PMRKRCV", "PM-KRCV"),
        ("KPL30CV", "KPR30CV", "KP-30CV"),

        ("WOMADLL", "WOMADLR", "WOMADL-"),  # WOMAC physical disability, range [-1, 0-68]
        ("WOMKPL", "WOMKPR", "WOMKP-"),  # WOMAC pain
        ("WOMSTFL", "WOMSTFR", "WOMSTF-"),  # WOMAC stiffness, range [-1, 0-8]
        ("WOMTSL", "WOMTSR", "WOMTS-"),  # WOMAC total score, range [-1, 0-96]
        ("KRSL12", "KRSR12", "KRS-12"),  # knee replacement surgery

        ("P01INJL", "P01INJR", "P01INJ-"),  # injury with loss of ability to walk
        ("P01KSURGL", "P01KSURGR", "P01KSURG-"),  # surgery or arthroscopy
        ("P01KRSL", "P01KRSR", "P01KRS-"),  # part or whole joint replacement

        ("P01ARTL", "P01ARTR", "P01ART-"),  # arthroscopy
        ("P01ARTLINJ", "P01ARTRINJ", "P01ART-INJ"),  # arthroscopy after injury
        ("P01MENL", "P01MENR", "P01MEN-"),  # meniscectomy
        ("P01MENLINJ", "P01MENRINJ", "P01MEN-INJ"),  # meniscectomy after injury

        ("P01LRL", "P01LRR", "P01LR-"),  # ligament repair surgery
        ("P01OTSURGL", "P01OTSURGR", "P01OTSURG-"),  # any other surgery
        ("P01OTSLINJ", "P01OTSRINJ", "P01OTS-INJ"),  # any other surgery after injury
    ]:
        # Placeholder column, filled per side from the matching source column
        df = df.assign(**{f_out: ""})

        df.loc[df["side"] == "LEFT", f_out] = df[f_left]
        df.loc[df["side"] == "RIGHT", f_out] = df[f_right]

        # Restore the dtype of the source column (the placeholder was created from "")
        df = df.astype({f_out: df[f_left].dtype})
        df = df.drop(columns=[f_left, f_right])
    # ---

    return df


# Locate the clinical tables, then read and harmonize them in one go
dir_clinical = DIR_DATA_ROOT / "OAI_general/OAI_CompleteData_ASCII"
paths_clinical = sorted(dir_clinical.glob("AllClinical??.txt"))

df_clinical = preproc_clinical(read_compose_clinical(paths_clinical))
print(len(df_clinical))
df_clinical.head()

In [None]:
def read_enrollees(path):
    """Read the enrollees table and keep the subject-level variables.

    ``ID`` is read as a string and renamed to ``patient``; the ``P02SEX``
    labels "1: Male" / "2: Female" are replaced with "MALE" / "FEMALE".

    Args:
        path: Path to the pipe-separated enrollees file.

    Returns:
        pandas.DataFrame with the columns ``patient``, ``P02SEX``,
        ``P02RACE`` and ``V00SITE``.
    """
    sex_labels = {"1: Male": "MALE", "2: Female": "FEMALE"}
    columns_to_keep = ["patient", "P02SEX", "P02RACE", "V00SITE"]

    enrollees = (
        pd.read_csv(path, index_col=False, sep="|", dtype={"ID": str})
        .rename(columns={"ID": "patient"})
        .replace({"P02SEX": sex_labels})
    )
    return enrollees.loc[:, columns_to_keep]


# Subject-level information (one file for the whole cohort)
dir_enrollees = DIR_DATA_ROOT / "OAI_general/OAI_CompleteData_ASCII"
paths_enrollees = dir_enrollees / "Enrollees.txt"

df_enrollees = read_enrollees(paths_enrollees)
print(len(df_enrollees))
df_enrollees.head()

In [None]:
def read_compose_xr_sq(paths):
    """Read the semi-quantitative X-ray reading files.

    Column names are upper-cased by `read_compose_asmts`; ``ID`` / ``SIDE``
    are renamed to ``patient`` / ``side``, the side codes are translated with
    ``side_code_to_str`` and ``visit_month`` / ``visit`` are derived from
    ``PREFIX_VAR`` through ``prefix_var_to_visit_month``.

    Args:
        paths: Paths of the reading files, forwarded to `read_compose_asmts`.

    Returns:
        pandas.DataFrame restricted to the identifiers, the KL grade and the
        OARSI feature grades listed below.
    """
    df = read_compose_asmts(paths, capitalize=True)

    df = df.rename(columns={"ID": "patient",
                            "SIDE": "side", })

    df = df.astype({"patient": str})

    # Assign whole columns with `df[col] = ...` instead of `df.loc[:, col] = ...`.
    # The `.loc` form writes into the existing column in place; when string
    # labels are written into an existing numeric column (presumably the case
    # for the side codes - confirm against the data) recent pandas versions
    # deprecate, and later reject, the implicit upcast. Plain assignment
    # replaces the column and gives the same values.
    df["side"] = [side_code_to_str[c] for c in df["side"].tolist()]
    df["visit_month"] = [prefix_var_to_visit_month[p]
                         for p in df["PREFIX_VAR"].tolist()]
    # `visit` is the visit label without its last character, as an integer
    df["visit"] = [int(p[:-1]) for p in df["visit_month"].tolist()]

    sel = ["patient", "side", "PREFIX_VAR", "visit_month", "visit",
           "XRKL",
           "XROSFL", "XROSFM",  # osteophytes, femur (OARSI grades)
           "XROSTL", "XROSTM",  # osteophytes, tibia (OARSI grades)
           "XRJSL", "XRJSM",  # joint-space narrowing (OARSI grades)
           "XRSCFL", "XRSCFM",  # sclerosis, femur (OARSI grades)
           "XRSCTL", "XRSCTM",  # sclerosis, tibia (OARSI grades)
           "XRATTL", "XRATTM",  # attrition (OARSI grades)
          ]
    df = df.loc[:, sel]
    return df


def preproc_xr_sq(df):
    """Encode the missing-value codes of the X-ray readings as numbers.

    ``XRKL`` becomes an integer column where 5 marks a prosthesis / knee
    replacement and -1 marks missing data. The OARSI feature grades use -1
    for every missing-value code; all are cast to ``int`` except the
    joint-space narrowing grades (``XRJSL``, ``XRJSM``), which are cast to
    ``float``. Finally only the first record per (patient, side, visit) is
    kept.

    Args:
        df: Readings table as returned by `read_compose_xr_sq`.

    Returns:
        pandas.DataFrame: cleaned copy of `df`.
    """
    kl_codes = {
        "P": 5,  # .P - data missing due to a prosthesis/knee replacement
        "T": -1,  # .T - data missing due to technical reasons (e.g. poor image quality)
        # .A - data not expected (e.g.: some V00XR...may be missing if
        #      participant has KLG<2 in both knees at all time points);
        #      no mapping is defined for it here
        np.nan: -1,
    }
    feature_codes = {"P": -1, "T": -1, "A": -1, "M": -1, np.nan: -1}

    feature_fields = ("XROSFL", "XROSFM", "XROSTL", "XROSTM",
                      "XRJSL", "XRJSM",
                      "XRSCFL", "XRSCFM", "XRSCTL", "XRSCTM",
                      "XRATTL", "XRATTM")
    float_fields = ("XRJSL", "XRJSM")

    # Column-wise replacement, KL grade first, then the feature grades
    codes_by_field = {"XRKL": kl_codes}
    codes_by_field.update({name: feature_codes for name in feature_fields})
    for name, codes in codes_by_field.items():
        df = df.replace({name: codes})

    # Integer grades everywhere except joint-space narrowing
    target_types = {name: int for name in codes_by_field
                    if name not in float_fields}
    df = df.astype(target_types)
    df = df.astype({name: float for name in float_fields})

    # Keep only one assessment for (subject, knee, follow-up)
    return df.drop_duplicates(subset=["patient", "side", "visit"])

# Post-20210721
dir_xr_sq = DIR_DATA_ROOT / "OAI_general/OAI_CompleteData_SAS"
paths_xr_sq = sorted(dir_xr_sq.glob("kxr_sq_bu??.sas7bdat"))

# Read the readings, then encode the missing-value codes
df_xr_sq = preproc_xr_sq(read_compose_xr_sq(paths_xr_sq))
print(len(df_xr_sq))
df_xr_sq.head()

In [None]:
def read_compose_outcomes(paths):
    """Read the outcomes file(s) and keep the knee-replacement variables.

    Column names are upper-cased and ``ID`` becomes the string column
    ``patient``.

    Args:
        paths: Paths of the outcome files, forwarded to `read_compose_asmts`.

    Returns:
        pandas.DataFrame with ``patient`` and the left/right outcome
        variables listed below.
    """
    outcomes = read_compose_asmts(paths)
    outcomes.columns = outcomes.columns.str.upper()
    outcomes = (
        outcomes
        .rename(columns={"ID": "patient"})
        .astype({"patient": str})
    )

    columns_to_keep = [
        "patient",
        # OAI visit follow-up TKR self-reported at (left, right knee)
        "ELKVSRP", "ERKVSRP",
        # closest OAI visit with knee XR after follow-up TKR (left, right knee)
        "ELKXRAF", "ERKXRAF",
        # closest OAI visit with knee XR prior to follow-up TKR (left, right knee)
        "ELKXRPR", "ERKXRPR",
        # not selected: "ELKTLPR" / "ERKTLPR", total or partial follow-up
        # knee replacement (left, right knee)
        # knee replacement seen on baseline OAI XR (left, right knee)
        "ELKBLRP", "ERKBLRP",
        # knee replacement seen on follow-up OAI XR (left, right knee)
        "ELKRPSN", "ERKRPSN",
    ]
    return outcomes.loc[:, columns_to_keep]


def preproc_outcomes(df):
    """Encode the outcome variables as integers and reshape to one row per knee.

    Steps, in order:
      1. Visit-valued fields: labelled visits are mapped to their month
         number (0, 12, ..., 108); missing values become -1.
      2. ``ELKBLRP`` / ``ERKBLRP``: mapped to 0 / 1, missing values to -1.
      3. ``ELKRPSN`` / ``ERKRPSN``: mapped to 0 / 1 / 2, missing values to -1.
      4. Every row is duplicated, once with ``side == "LEFT"`` and once with
         ``side == "RIGHT"``; each left/right column pair is merged into one
         ``E-K...`` column and the two source columns are dropped.

    Args:
        df: Outcomes table as returned by `read_compose_outcomes`.

    Returns:
        pandas.DataFrame: new frame with twice as many rows as `df` and a
        fresh index.
    """
    # Harmonize the values and fill the missing
    for field in ("ELKVSRP", "ERKVSRP",
                  "ELKXRAF", "ERKXRAF",
                  "ELKXRPR", "ERKXRPR",
                 ):
        # Visit label -> month of the visit
        dict_fix = {
            "0: Baseline": 0,
            "1: 12-month": 12,
            "2: 18-month": 18,
            "3: 24-month": 24,
            "4: 30-month": 30,
            "5: 36-month": 36,
            "6: 48-month": 48,
            "7: 60-month": 60,
            "8: 72-month": 72,
            "9: 84-month": 84,
            "10: 96-month": 96,
            "11: 108-month": 108,
            '.: Missing Form/Incomplete Workbook': -1,
            np.nan: -1, ' ': -1,
        }
        df = df.fillna({field: -1}, axis=0)
        df = df.replace({field: dict_fix})

    # Yes/no fields: 1 = yes, 0 = no, -1 = missing
    for field in ("ELKBLRP", "ERKBLRP"):
        dict_fix = {
            "0: No": 0,
            "1: Yes": 1,
            '.: Missing Form/Incomplete Workbook': -1,
        }
        df = df.fillna({field: -1}, axis=0)
        df = df.replace({field: dict_fix})

    # Three-valued fields; 2 marks knees without follow-up X-rays
    for field in ("ELKRPSN", "ERKRPSN"):
        dict_fix = {
            '.: Missing Form/Incomplete Workbook': -1,
            '0: No replacement seen on any FU xray': 0,
            '1: Yes, replacement seen on FU xray': 1,
            '2: No FU xrays of this knee (or hip)': 2,
        }
        df = df.fillna({field: -1}, axis=0)
        df = df.replace({field: dict_fix})

    # Melt "side"-specific columns
    # Each input row appears twice: first block LEFT, second block RIGHT
    df = pd.concat([df.assign(**{"side": "LEFT"}),
                    df.assign(**{"side": "RIGHT"})],
                    axis="index",
                    ignore_index=True)

    for f_left, f_right, f_out in [
        ("ELKVSRP", "ERKVSRP", "E-KVSRP"),
        ("ELKXRAF", "ERKXRAF", "E-KXRAF"),
        ("ELKXRPR", "ERKXRPR", "E-KXRPR"),
#         ("ELKTLPR", "ERKTLPR", "E-KTLPR"),
        ("ELKBLRP", "ERKBLRP", "E-KBLRP"),
        ("ELKRPSN", "ERKRPSN", "E-KRPSN"),
    ]:
        # Placeholder column, filled per side from the matching source column
        df = df.assign(**{f_out: ""})

        df.loc[df["side"] == "LEFT", f_out] = df[f_left]
        df.loc[df["side"] == "RIGHT", f_out] = df[f_right]

        # Restore the dtype of the source column (the placeholder was created from "")
        df = df.astype({f_out: df[f_left].dtype})
        df = df.drop(columns=[f_left, f_right])
    # ---

    return df


# Knee-replacement outcomes
dir_outcomes = DIR_DATA_ROOT / "OAI_general/OAI_CompleteData_ASCII"
paths_outcomes = sorted(dir_outcomes.glob("Outcomes99.txt"))

df_outcomes = preproc_outcomes(read_compose_outcomes(paths_outcomes))
print(len(df_outcomes))
df_outcomes.head()

In [None]:
def read_preproc_tiulpin2019(path):
    """Read the prior-art progression labels and align them with this notebook.

    Columns are renamed to the local naming scheme, ``patient`` is cast to
    string, the side codes "L" / "R" become "LEFT" / "RIGHT", and every row
    gets ``visit == 0``.

    Args:
        path: Path to the comma-separated labels file.

    Returns:
        pandas.DataFrame with the columns ``patient``, ``side``, ``visit``,
        ``tiulpin2019_kl_diff`` and ``tiulpin2019_prog``.
    """
    column_names = {
        "ID": "patient",
        "Side": "side",
        "Prog_increase": "tiulpin2019_kl_diff",
        "Progressor": "tiulpin2019_prog",
    }
    side_names = {"L": "LEFT", "R": "RIGHT"}

    labels = (
        pd.read_csv(path)
        .rename(columns=column_names)
        .astype({"patient": str})
        .replace({"side": side_names})
    )
    labels["visit"] = 0

    columns_to_keep = ["patient", "side", "visit",
                       "tiulpin2019_kl_diff", "tiulpin2019_prog"]
    return labels.loc[:, columns_to_keep]

# Labels from prior art, used later for comparison
path_tiulpin2019 = DIR_DATA_ROOT / "tiulpin2019multimodal__labels.csv"

df_tiulpin2019 = read_preproc_tiulpin2019(path_tiulpin2019)

print(len(df_tiulpin2019))
df_tiulpin2019.head()

## 2. Merge the meta info

In [None]:
print(f"Records _enrollees: {len(df_enrollees)}")
print(f"Records _clinical: {len(df_clinical)}")
print(f"Records _xr_sq: {len(df_xr_sq)}")
print(f"Records _outcomes: {len(df_outcomes)}")
# print(f"Records _contents: {len(df_contents)}")
print()

# Start from the subject-level table
df_t = df_enrollees.copy()
t_num_uniq_pat = len(df_t.drop_duplicates(subset=["patient",]))
print(f"Records merge (+enrollees): {len(df_t)}, "
      f"subjects: {t_num_uniq_pat}")

# Then join the remaining tables one after another:
# (name used in the report, table, join keys, join type)
merge_steps = (
    ("clinical", df_clinical, "patient", "inner"),
    ("xr_sq", df_xr_sq,
     ["patient", "side", "visit", "visit_month", "PREFIX_VAR"], "inner"),
    ("outcomes", df_outcomes, ["patient", "side"], "left"),
)
for t_step_name, t_step_df, t_step_keys, t_step_how in merge_steps:
    df_t = df_t.merge(t_step_df, on=t_step_keys, how=t_step_how)

    t_num_uniq_pat = len(df_t.drop_duplicates(subset=["patient",]))
    t_num_uniq_knees = len(df_t.drop_duplicates(subset=["patient", "side"]))
    print(f"Records merge (+{t_step_name}): {len(df_t)}, "
          f"subjects: {t_num_uniq_pat}, knees: {t_num_uniq_knees}")

df_merge = df_t
display(df_merge.head())

In [None]:
# Merge in prior art
print(f"Records _merge: {len(df_merge)}")
print(f"Records _tiulpin2019: {len(df_tiulpin2019)}")
print()

df_t = df_merge.copy()
print(f"Records merge: {len(df_t)}, "
      f"subjects: {len(pd.unique(df_t['patient']))}")

df_t = df_t.merge(df_tiulpin2019,
                  on=["patient", "side", "visit"],
                  how="left",
                  indicator="tiulpin2019_sel")
# `indicator` adds a categorical column with the values "left_only",
# "right_only" and "both"; "right_only" cannot occur in a `left` merge.
# Convert it to a plain 0/1 integer flag with a comparison instead of
# `replace`: replacing values of a categorical column is deprecated in recent
# pandas and would leave a categorical dtype behind.
df_t["tiulpin2019_sel"] = (df_t["tiulpin2019_sel"] == "both").astype(int)
print(f"Records merge (+tiulpin2019): {len(df_t)}, "
      f"subjects: {len(pd.unique(df_t['patient']))}")

# Fill missing with -1 to ease downstream processing
for field in ("tiulpin2019_kl_diff", "tiulpin2019_prog"):
    df_t = df_t.replace({field: {" ": -1, np.nan: -1}})
    df_t = df_t.fillna({field: -1}, axis=0)
    df_t = df_t.astype({field: int})

df_merge = df_t
display(df_merge.head())

## 3. Select the subjects, define the targets

In [None]:
def crit_prog_kl_from_to_panfilov(d, visit_to, visit_from=0):
    """Derive the KL-grade progression label of one knee for one interval.

    Progression is a worsening in KL grade, excluding KL0->KL1 (grades 0 and
    1 are merged before the comparison).

    Args:
        d: Records of a single knee with the columns ``visit`` and ``XRKL``
            (-1 marks a missing grade, 5 marks a knee replacement).
        visit_to: Last visit of the interval (inclusive).
        visit_from: Baseline visit of the interval (inclusive).

    Returns:
        tuple ``(indicator, criterion, reason)``: ``indicator`` is truthy
        when the knee is selected, ``criterion`` is 1 for progression, 0 for
        no progression and -1 when the knee is excluded, and ``reason`` is
        either "9: ok" or the numbered exclusion reason.
    """
    excluded = -1

    # Chronological order, records without a grade removed
    exams = d.sort_values(by="visit", axis="index").copy()
    exams = exams[exams["XRKL"] != -1]

    # Subject is present at baseline
    ok = visit_from in exams["visit"].tolist()
    if not ok:
        return ok, excluded, "0: not_present_at_baseline"

    kl_baseline = exams[exams["visit"] == visit_from]["XRKL"].iloc[0]
    # No KL=4 at baseline
    ok = ok & (kl_baseline != 4)
    if not ok:
        return ok, excluded, "1: KLG4_at_baseline"
    # No TKR at baseline
    ok = ok & (kl_baseline != 5)
    if not ok:
        return ok, excluded, "2: TKR_at_baseline"

    # Exclude TKR records
    exams = exams[exams["XRKL"] != 5]

    # Merge KL0 and KL1
    merged = exams.copy()
    merged.loc[exams["XRKL"] == 0, "XRKL"] = 1

    # Consider interval
    in_interval = (merged["visit"] >= visit_from) & (merged["visit"] <= visit_to)
    window = merged[in_interval]
    visits_window = window["visit"].tolist()

    # At least 1 exam besides baseline
    ok = ok & (len(visits_window) >= 2)
    if not ok:
        return ok, excluded, "3: no_followups"

    # Persistent non-decrease of KL within interval
    kl_steps = np.diff(window["XRKL"].to_numpy())
    ok = ok & np.all(kl_steps >= 0)
    if not ok:
        return ok, excluded, "4: KLG_decrease"

    def kl_at(visit):
        # Merged grade recorded at the given visit
        return merged.loc[exams["visit"] == visit, "XRKL"].values[0]

    if (visit_to in visits_window) and (kl_at(visit_to) == kl_at(visit_from)):
        # Present and not progressed at end of interval
        return ok & True, 0, "9: ok"
    if np.any(kl_steps > 0):
        # Progressed before/at end of interval
        return ok & True, 1, "9: ok"
    # Neither progressed nor examined at the end of the interval
    return ok & False, excluded, "5: insufficient_followups"

In [None]:
# Create progressions labels
df_t = df_merge.copy()

# Follow-up horizons (visit numbers) for which a label is derived
t_horizons = (12, 24, 36, 48, 72, 96)

dict_labels = defaultdict(list)

for (t_patient, t_side), gb_df in tqdm(df_t.groupby(["patient", "side"], sort=False)):
    dict_labels["patient"].append(t_patient)
    dict_labels["side"].append(t_side)
    # Derived for baseline data only
    dict_labels["visit"].append(0)

    # (indicator, criterion, reason) per horizon
    t_results = {
        h: crit_prog_kl_from_to_panfilov(gb_df, visit_to=h, visit_from=0)
        for h in t_horizons
    }

    # Three passes keep the column order: all selections, all labels, all reasons
    for h in t_horizons:
        dict_labels[f"panfilov_sel_kl_{h}"].append(int(t_results[h][0]))
    for h in t_horizons:
        dict_labels[f"prog_kl_{h}"].append(t_results[h][1])
    for h in t_horizons:
        dict_labels[f"reason_kl_{h}"].append(t_results[h][2])

df_labels = pd.DataFrame.from_dict(dict_labels)

In [None]:
# Exclusion reasons, in the order in which they are applied when the
# remaining subjects / knees are counted. The strings must match the reasons
# returned by `crit_prog_kl_from_to_panfilov`; "9: ok" is not an exclusion.
exclusion_reasons_order = (
    "0: not_present_at_baseline",
    "1: KLG4_at_baseline",
    "2: TKR_at_baseline",
    "3: no_followups",
    "4: KLG_decrease",
    "5: insufficient_followups",
#     "9: ok",
)

In [None]:
# One exclusion report per progression target
target_reasons = (
    "reason_kl_12",
    "reason_kl_24",
    "reason_kl_36",
    "reason_kl_48",
    "reason_kl_72",
    "reason_kl_96",
)

for target_reason in target_reasons:
    display(f"---- {target_reason} ----")
    # Init df
    t_df = df_labels.copy()

    # Count the number of cases by reason
    display(t_df[target_reason].value_counts().sort_index(ascending=True))

    t_pat = len(t_df.drop_duplicates(subset=["patient",]))
    t_knees = len(t_df.drop_duplicates(subset=["patient", "side"]))
    # (the original message carried a stray apostrophe: "Starting with':")
    print(f"Starting with: {t_pat} subjects, {t_knees} knees")

    # Count the number of subjects and knee after exclusion decrementally
    t_df_to_keep = t_df

    for reason in exclusion_reasons_order:
        t_df_to_excl = t_df_to_keep[t_df_to_keep[target_reason] == reason]
        t_pat_to_excl = len(t_df_to_excl.drop_duplicates(subset=["patient",]))
        t_knees_to_excl = len(t_df_to_excl.drop_duplicates(subset=["patient", "side"]))
        print(f"Excluded {t_pat_to_excl:>4d} subjects, {t_knees_to_excl:>4d} knees, as '{reason}'")

        t_df_to_keep = t_df_to_keep[t_df_to_keep[target_reason] != reason]
        t_pat_to_keep = len(t_df_to_keep.drop_duplicates(subset=["patient",]))
        t_knees_to_keep = len(t_df_to_keep.drop_duplicates(subset=["patient", "side"]))
        print(f"Remaining: {t_pat_to_keep:>4d} subjects, {t_knees_to_keep:>4d} knees")

    # The label column shares the suffix of the reason column
    target = target_reason.replace("reason", "prog")

    # Cases: label 1
    t_df_pos = t_df_to_keep[t_df_to_keep[target] == 1]
    t_pos_pat = len(t_df_pos.drop_duplicates(subset=["patient",]))
    t_pos_knees = len(t_df_pos.drop_duplicates(subset=["patient", "side"]))

    # Controls: label 0
    t_df_neg = t_df_to_keep[t_df_to_keep[target] == 0]
    t_neg_pat = len(t_df_neg.drop_duplicates(subset=["patient",]))
    t_neg_knees = len(t_df_neg.drop_duplicates(subset=["patient", "side"]))

    print(f"Controls: {t_neg_pat:>4d} subjects, {t_neg_knees:>4d} knees")
    print(f"Cases: {t_pos_pat:>4d} subjects, {t_pos_knees:>4d} knees")

In [None]:
# Persist the progression labels (one row per knee)
path_out = DIR_DATA_ROOT / "prog_labels.csv"
df_labels.to_csv(path_out)

In [None]:
# Merge meta with labels
print(f"Records _merge: {len(df_merge)}")
print(f"Records _labels: {len(df_labels)}")
print()

df_t = df_merge.copy()
print(f"Records merge: {len(df_t)}, "
      f"subjects: {len(pd.unique(df_t['patient']))}")

# Labels exist for visit 0 only; the rows of other visits get no match
df_t = df_t.merge(df_labels, how="left", on=["patient", "side", "visit"])
print(f"Records merge (+labels): {len(df_t)}, "
      f"subjects: {len(pd.unique(df_t['patient']))}")

# Fill missing with -1 to ease downstream processing
t_label_fields = (
    "panfilov_sel_kl_12", "panfilov_sel_kl_24", "panfilov_sel_kl_36",
    "panfilov_sel_kl_48", "panfilov_sel_kl_72", "panfilov_sel_kl_96",
    "prog_kl_12", "prog_kl_24", "prog_kl_36",
    "prog_kl_48", "prog_kl_72", "prog_kl_96",
)
for field in t_label_fields:
    df_t = (
        df_t
        .replace({field: {" ": -1, np.nan: -1}})
        .fillna({field: -1}, axis=0)
        .astype({field: int})
    )

df_merge = df_t
display(df_merge.head())

In [None]:
# Write the merged meta info together with the labels
dir_out = DIR_DATA_ROOT / "OAI_Clin_prep"
dir_out.mkdir(parents=True, exist_ok=True)

path_out = dir_out / "meta_base.csv"
df_merge.to_csv(path_out)

## 4. Build index of imaging data

In [None]:
# Select the scans to be copied
df_t_asmts = df_merge.copy()
df_t_imgs = df_contents_proc.copy()

def print_selection_stats(df):
    """Print the number of records, unique knees and unique subjects in `df`."""
    print("Records: ", len(df), ", ",
          "knees: ", len(df.drop_duplicates(subset=["patient", "side"])), ", ",
          "subjects: ", len(pd.unique(df["patient"])))

print("Clinical / assessments:")
print_selection_stats(df_t_asmts)
# - Only the baseline scans
df_t_asmts = df_t_asmts[df_t_asmts["visit"] == 0]
print_selection_stats(df_t_asmts)

print("Imaging:")
df_t_imgs = df_t_imgs.sort_values(by=["patient", "side", "visit", "sequence"])
print_selection_stats(df_t_imgs)

# - Only the baseline scans
print("Only baseline:")
df_t_imgs = df_t_imgs[df_t_imgs["visit"] == 0]
print_selection_stats(df_t_imgs)

# - Only the following imaging protocols
# TODO: uncomment one at a time
t_sequences = [
    "SAG_3D_DESS",
#     "COR_IW_TSE",
#     "SAG_T2_MAP",       # Incidence Knee only
]
df_t_imgs = df_t_imgs[df_t_imgs["sequence"].isin(t_sequences)]
print_selection_stats(df_t_imgs)

# - In case of rescan, exclude all the previous scans
print("No rescans:")
df_t_imgs = df_t_imgs.drop_duplicates(
    subset=["patient", "visit", "side", "sequence"],
    keep="last",
    ignore_index=True)
print_selection_stats(df_t_imgs)


# NOTE: the intersection with the clinical data is currently disabled (see
# the commented block below), so the imaging selection is used as is.
print("Intersect clinical and imaging:")
df_extract = df_t_imgs.copy()
# print("Intersect clinical and imaging:")
# selectors = ["patient", "visit_month", "visit", "side"]
# df_extract = df_t_asmts.merge(df_t_imgs, on=selectors, how="inner")
# print_selection_stats(df_extract)
# # - Only the knees with _all_ considered protocols
# df_extract = (
#     df_extract
#     .groupby(["patient", "side"], sort=False)
#     .filter(lambda x: all(e in x["sequence"].tolist() for e in t_sequences))
# )
print_selection_stats(df_extract)

# Save to a .csv file
# The file is written to the project root and named after the selected
# protocol(s), e.g. "meta_extract__sag_3d_dess.csv". This replaces a
# hard-coded absolute path that had to be edited together with `t_sequences`.
t_sequences_tag = "__".join(s.lower() for s in t_sequences)
path_out = Path(DIR_PROJECT_ROOT, f"meta_extract__{t_sequences_tag}.csv")

df_extract.to_csv(path_out, index=False)
# Preview only; the complete table is in the file written above
display(df_extract.head())

## 5. Extract individual modality data from OAI

In [None]:
# Number of scans to copy; None copies everything
NUM_SAMPLES_TO_COPY = None
# NUM_SAMPLES_TO_COPY = 4

# TODO: uncomment one at a time
SEQUENCES_TO_COPY = [
    "SAG_3D_DESS",
#     "COR_IW_TSE",
#     "SAG_T2_MAP",
]

df_to_copy = df_extract.copy()
print(f"Init, to copy: {len(df_to_copy)}")

# OPTION: Only one sequence to copy
t_sel_sequence = df_to_copy["sequence"].isin(SEQUENCES_TO_COPY)
df_to_copy = df_to_copy[t_sel_sequence]

# Select number of samples to copy
df_to_copy = df_to_copy.iloc[:NUM_SAMPLES_TO_COPY, :]

print(f"Selected, to copy: {len(df_to_copy)}")
display(df_to_copy.head())

In [None]:
# MR
def copy_scans_from_oai(path_root_source, path_root_target, df, n_jobs=1, dry_run=False):
    """Copy the scan folders listed in `df` from the source to the target tree.

    Every row is copied from
    ``<path_root_source>/<visit_month without its first character>/<Folder>``
    to the same relative location under `path_root_target`.

    Args:
        path_root_source: Root directory of the original images.
        path_root_target: Root directory to copy the scans to.
        df: Table with the columns ``visit_month`` and ``Folder``.
        n_jobs: Number of parallel joblib workers.
        dry_run: If True, only check that the sources exist; copy nothing.

    Returns:
        pandas.DataFrame: the rows of `df` whose source folder exists.
        Errors raised while copying are not suppressed.
    """
    from joblib import delayed, Parallel

    def silent_copy(p_from, p_to, dry_run):
        # Returns whether the source exists; reports the missing ones
        if not p_from.exists():
            print(f"Missing: {p_from}")
            return False
        if not dry_run:
            # `dirs_exist_ok=True` (Python 3.8+): without it, `copytree` raises
            # FileExistsError as soon as a target folder is already present,
            # i.e. whenever the cell is re-run or a previous run was interrupted.
            shutil.copytree(p_from, p_to, dirs_exist_ok=True)
        return True

    tasks = []
    print(f"Total: {len(df)}")

    for _, r in df.iterrows():
        # Source and target share the same layout below their roots
        path_rel = Path(r['visit_month'][1:], r['Folder'])
        path_tmp_from = Path(path_root_source, path_rel)
        path_tmp_to = Path(path_root_target, path_rel)

        tasks.append(delayed(silent_copy)(path_tmp_from, path_tmp_to, dry_run))

    ret = Parallel(n_jobs=n_jobs, verbose=5)(tasks)
    # Exclude missing data (boolean mask, one entry per row of `df`)
    return df.loc[ret, :]

path_root_source = DIR_DATA_ROOT / "OAIBaselineImages"
# TODO: uncomment one at a time
path_root_target = DIR_DATA_ROOT / "OAI_SAG_3D_DESS_raw"
# path_root_target = DIR_DATA_ROOT / "OAI_COR_IW_TSE_raw"
# path_root_target = DIR_DATA_ROOT / "OAI_SAG_T2_MAP_raw"

path_root_target.mkdir(parents=True, exist_ok=True)

# Pass `dry_run=True` to only check which source folders are present
df_copied = copy_scans_from_oai(
    path_root_source=path_root_source,
    path_root_target=path_root_target,
    df=df_to_copy,
    n_jobs=4,
)

# Index of the scans that were found, stored next to the copied data
df_copied.to_csv(path_root_target / 'meta_base.csv')