## Code to generate manifest.csv
- Based on this [schema](https://nipoppy.readthedocs.io/en/latest/schemas/index.html#manifest-file)

### QPN naming convention
- `visit`: timepoint at which **ANY** clinical data (e.g. UPDRS, MoCA, Neuropsy) were collected
    - e.g. V01, V02 etc. 
- `session`: timepoint of MRI collection
    - e.g. ses-01, ses-02 etc.
- `event`: timepoint relative to a consensus baseline (used for inter-modality harmonization, i.e. MRI vs. clinical data)
    - e.g. baseline, m06, m12 etc. 

In [None]:
import pandas as pd
import numpy as np

### Paths

In [None]:
# Root folder that holds one sub-folder per data release
releases_dir = "/home/nikhil/projects/Parkinsons/qpn/releases/"

previous_release = "June_2024"
current_release = "Oct_2024" # No new data yet (20 March 2024)

# Manifest CSVs of the previous and the current release
previous_manifest_csv = releases_dir + previous_release + "/manifest.csv"
current_manifest_csv = releases_dir + current_release + "/manifest.csv"

# Recruitment inputs of the current release share one folder
_recruitment_dir = releases_dir + current_release + "/tabular/recruitment/"

# Recruitment tracking spreadsheet
current_recruit_manifest_xls = _recruitment_dir + "Suivi_RPQ.xlsx"

# Listing of available DICOMs
current_dicom_list_csv = _recruitment_dir + "DICOM_availability.csv"

### Read manifest from previous release

In [None]:
# Load the manifest of the previous release; its IDs are the already-known participants
previous_recruit_manifest_df = pd.read_csv(previous_manifest_csv)

# Trim surrounding whitespace from the IDs before de-duplicating them
_previous_ids = previous_recruit_manifest_df["participant_id"].str.strip()
previous_recruit_manifest_df["participant_id"] = _previous_ids

nipoppy_participants_previous = _previous_ids.dropna().unique()
n_nipoppy_participants_previous = len(nipoppy_participants_previous)
print(f"number of participants from previous nipoppy release: {n_nipoppy_participants_previous}")

previous_recruit_manifest_df.head()

### Read latest recruitment manifest

In [None]:
# Load the recruitment tracking sheet, keep only participants with at least one
# MRI flag set, and normalise their participant_id values.

# Excel column span to load from the sheet
col_range = "A:R"

# Raw spreadsheet headers (they contain embedded newlines) -> clean column names
col_rename_dict = {
    "subj_id":"participant_id",
    "IRM01\n(J-M-A)":"IRM01_date", "#IRM 1\n PD":"IRM01_PD", "#IRM 1\n CTRL":"IRM01_CTRL", 
    "# IRM 1\n RBD":"IRM01_RBD", "# IRM 1\nOTHER":"IRM01_OTHER",
    "IRM 2 \n(J-M-A)":"IRM02_date", "#IRM 2\n PD":"IRM02_PD", "#IRM 2\n CTRL":"IRM02_CTRL", 
    "# IRM 2\n RBD":"IRM02_RBD", "# IRM 2 OTHER":"IRM02_OTHER",
    "IRM 3\n(J-M-A)":"IRM03_date", "#IRM 3\n PD":"IRM03_PD", "#IRM 3\n CTRL":"IRM03_CTRL", 
    "# IRM 3\n RBD":"IRM03_RBD", "# IRM 3 OTHER":"IRM03_OTHER"
    }

# A real list (not a dict view) so it can be used directly as a column indexer
useful_cols = list(col_rename_dict.values())

suivi_df = pd.read_excel(current_recruit_manifest_xls, sheet_name="En cours", engine="openpyxl", usecols=col_range)
suivi_df = suivi_df.rename(columns=col_rename_dict)[useful_cols].copy()

# remove the row with tally
suivi_df = suivi_df.drop([0])

# remove rows without participant_id
suivi_df = suivi_df.dropna(axis=0, subset=["participant_id"])
suivi_df = suivi_df[~suivi_df["participant_id"].astype(str).isin(["0"])]

# Cast to str BEFORE stripping: the .str accessor returns NaN for non-string cells,
# and a subsequent astype(str) would turn those into the literal ID "nan".
suivi_df["participant_id"] = suivi_df["participant_id"].astype(str).str.strip()

# remove subjects without imaging data: keep a row when any per-visit MRI flag equals 1
imaging_flag_cols = [col for col in useful_cols if col.startswith("IRM") and not col.endswith("_date")]
suivi_df = suivi_df[suivi_df[imaging_flag_cols].eq(1).any(axis=1)]

# fix participant_id formatting issues
# Some rows have Dx in participant_id and one participant with two IDs with "="
possible_delimiters = [" ", "(", "=", "\n"]
for delim in possible_delimiters:
    # keep only the text before the first delimiter; .str[0] also works on an empty frame
    suivi_df["participant_id"] = suivi_df["participant_id"].str.strip().str.split(pat=delim, n=1).str[0]

# nipoppy_participants_current
nipoppy_participants_current = suivi_df["participant_id"].dropna().unique()

suivi_df

### Set date columns and check visit order

In [None]:
# Convert the three MRI date columns to datetime, then flag participants whose
# visits are not in chronological order.
# The columns hold mixed types; automatic format detection with errors="coerce"
# turns anything unparseable into NaT.
# NOTE(review): the source headers read "(J-M-A)", presumably day-month-year, while
# to_datetime is called without dayfirst - confirm ambiguous text dates parse as intended.
mri_date_cols = ["IRM01_date", "IRM02_date", "IRM03_date"]
for mri_date_col in mri_date_cols:
    # explicitly set 0 to nan to avoid origin issues (date: 0 is 1970-01-01)
    without_zero = suivi_df[mri_date_col].replace(0, np.nan)
    suivi_df[mri_date_col] = pd.to_datetime(without_zero, errors="coerce")

# Check visit orders: days elapsed between consecutive MRI visits
interval_specs = [
    ("visit_interval (V2-V1) in days", "IRM02_date", "IRM01_date"),
    ("visit_interval (V3-V2) in days", "IRM03_date", "IRM02_date"),
]
for interval_col, later_col, earlier_col in interval_specs:
    suivi_df[interval_col] = (suivi_df[later_col] - suivi_df[earlier_col]).dt.days

# A negative interval means a later visit is dated before an earlier one
interval_cols = [spec[0] for spec in interval_specs]
has_negative_interval = suivi_df[interval_cols].lt(0).any(axis=1)
visits_with_wrong_order_df = suivi_df[has_negative_interval]

print(f"Participants with wrong visit order: ({len(visits_with_wrong_order_df)}) :{visits_with_wrong_order_df['participant_id'].values}")


## PD00849 has wrong visit order

In [None]:
# Show the tracking-sheet row(s) of the participant flagged for visit order
suivi_df.loc[suivi_df["participant_id"].eq("PD00849")]

### Get new participants per session
This is based on the per-visit MRI flag columns (`IRM0X_PD`, `IRM0X_CTRL`, `IRM0X_RBD`, `IRM0X_OTHER` equal to 1), not on the MRI visit date.

In [None]:
# Participants present in the current tracking sheet but absent from the previous manifest
previously_known_ids = set(nipoppy_participants_previous)
currently_known_ids = set(nipoppy_participants_current)
total_partcipants_additions = list(currently_known_ids - previously_known_ids)
print(f"number of new participants: {len(total_partcipants_additions)}")

# For each MRI visit, collect the IDs of rows where any of the four flag columns equals 1
flag_suffixes = ["PD", "CTRL", "RBD", "OTHER"]
visit_participant_dict = {}
for visit_number in (1, 2, 3):
    visit_flag_cols = [f"IRM0{visit_number}_{suffix}" for suffix in flag_suffixes]
    scanned_at_visit = suivi_df[visit_flag_cols].eq(1).any(axis=1)
    visit_participant_dict[f"MRI_v{visit_number}"] = (
        suivi_df[scanned_at_visit]["participant_id"].dropna().unique()
    )

MRI_v1_participants = visit_participant_dict["MRI_v1"]
MRI_v2_participants = visit_participant_dict["MRI_v2"]
MRI_v3_participants = visit_participant_dict["MRI_v3"]

print(f"MRI_v1_participants: {len(MRI_v1_participants)}, MRI_v2_participants: {len(MRI_v2_participants)}, MRI_v3_participants: {len(MRI_v3_participants)}")


### Check DICOM availability

In [None]:
# Load the DICOM availability listing and summarise it per session.
current_dicom_df = pd.read_csv(current_dicom_list_csv)

# Rows without a participant_id cannot be matched to anyone, so drop them up front
# (same rule as for the tracking sheet). Stripping first and casting afterwards would
# turn missing IDs into the literal string "nan", which dropna() no longer removes.
current_dicom_df = current_dicom_df.dropna(subset=["participant_id"]).copy()
# Cast before stripping so a purely numeric ID column does not break the .str accessor
current_dicom_df["participant_id"] = current_dicom_df["participant_id"].astype(str).str.strip()

n_dicom_participants = current_dicom_df["participant_id"].nunique()
print(f"number of participants with dicom data: {n_dicom_participants}")
n_dicom_sessions = current_dicom_df["session"].nunique()
print(f"number of sessions with dicom data: {n_dicom_sessions}")

# Unique participant IDs per session label
dicom_participants_by_session = {
    session_label: current_dicom_df.loc[
        current_dicom_df["session"] == session_label, "participant_id"
    ].dropna().unique()
    for session_label in ("ses-01", "ses-02", "ses-03")
}
dicom_ses_1_participants = dicom_participants_by_session["ses-01"]
dicom_ses_2_participants = dicom_participants_by_session["ses-02"]
dicom_ses_3_participants = dicom_participants_by_session["ses-03"]

print(f"dicom_ses_1_participants: {len(dicom_ses_1_participants)}, dicom_ses_2_participants: {len(dicom_ses_2_participants)}, dicom_ses_3_participants: {len(dicom_ses_3_participants)}")

current_dicom_df.head()

In [None]:
# Show DICOM listing rows that have no participant_dicom_dir value
current_dicom_df.loc[current_dicom_df["participant_dicom_dir"].isna()]

In [None]:
# Compare first-visit participants from the tracking sheet with ses-01 participants in the DICOM list
dicom_ids_ses_1 = set(dicom_ses_1_participants)
suivi_ids_v1 = set(MRI_v1_participants)

suivi_dicom_intersection = dicom_ids_ses_1 & suivi_ids_v1   # in both sources
suivi_minus_dicom = suivi_ids_v1 - dicom_ids_ses_1          # tracking sheet only
dicom_minus_suivi = dicom_ids_ses_1 - suivi_ids_v1          # DICOM list only
print(f"suivi_dicom_intersection: {len(suivi_dicom_intersection)}, suivi_minus_dicom: {len(suivi_minus_dicom)}, dicom_minus_suivi: {len(dicom_minus_suivi)}")

print(f"suivi_minus_dicom: {suivi_minus_dicom}")
print(f"dicom_minus_suivi: {dicom_minus_suivi}")

### Generate current_manifest_df

- Populate manifest with available clinical visits / MRI sessions
- Datatypes: Assuming QPN has all 4 BIDS datatypes: ["anat","dwi","fmap","func"]

- Sample `manifest.csv`

| participant_id | participant_dicom_dir | visit | session | datatype                     | bids_id |
|----------------|-----------------------|-------|---------|------------------------------|---------|
| 001            | MyStudy_001_V2021      | V01   | ses-01  | ["anat","dwi","fmap","func"] | sub-001 |
| 001            | MyStudy_001_V2022      | V02   | ses-02  | ["anat"]                     | sub-001 |
| 002            | MyStudy_002_V2021      | V01   | ses-01  | ["anat","dwi"]               | sub-002 |
| 002            | MyStudy_002_V2024      | V03   | ses-03  | ["anat","dwi"]               | sub-002 |

In [None]:
# Build the manifest: one row per (participant, MRI visit), indexed by participant ID.
visit_labels = visit_participant_dict.keys()
visit_session_dict = {"MRI_v1": "ses-01", "MRI_v2": "ses-02", "MRI_v3": "ses-03"}

manifest_cols = ["visit","session","datatype","dicom_availabilty"]
avail_datatypes = "['anat','dwi','fmap','func']"

# Collect the per-visit frames and concatenate once at the end instead of growing
# a DataFrame inside the loop (avoids repeated copies and concat-with-empty warnings).
per_visit_frames = []

for visit_label in visit_labels:
    session = visit_session_dict[visit_label]
    participants_with_dicoms = current_dicom_df[current_dicom_df["session"] == session]["participant_id"].values

    # add participants with dicoms to the visit list (sometimes Suivi is out of sync with dicom data);
    # sorted so the row order is reproducible between runs (set iteration order is not)
    visit_participant_ids = sorted(
        set(visit_participant_dict[visit_label]).union(participants_with_dicoms), key=str
    )

    print(f"visit_id: {visit_label}, n_participants: {len(visit_participant_ids)}")

    _df = pd.DataFrame(index=visit_participant_ids, columns=manifest_cols)
    _df["visit"] = visit_label
    _df["session"] = session
    _df["datatype"] = avail_datatypes

    # check dicom availability: "yes" when the ID is listed for this session, otherwise "no"
    _df["dicom_availabilty"] = np.where(_df.index.isin(participants_with_dicoms), "yes", "no")

    per_visit_frames.append(_df)

if per_visit_frames:
    current_manifest_df = pd.concat(per_visit_frames, axis=0)
else:
    # no visits configured: keep an empty frame with the expected columns
    current_manifest_df = pd.DataFrame(columns=manifest_cols)

current_manifest_df

### Save updated CSV

In [None]:
# Set to True to write the manifest of the current release to disk
save_current_manifest = False
if save_current_manifest:
    print(f"Saving new nipoppy manifest here: {current_manifest_csv}")
    # Promote the ID index to a "participant_id" column only once, so re-running
    # this cell does not insert a second participant_id column.
    if "participant_id" not in current_manifest_df.columns:
        current_manifest_df = current_manifest_df.rename_axis("participant_id").reset_index()
    current_manifest_df.to_csv(current_manifest_csv, index=False)

## Generate inventory of all BIDS data

In [None]:
from bids import BIDSLayout

In [None]:
# Directory of the QPN BIDS dataset
qpn_bids_dir = "/home/nikhil/projects/Parkinsons/qpn/bids"
# Create the BIDSLayout object for that directory
layout = BIDSLayout(qpn_bids_dir)

In [None]:
# Export the layout as a table and show only the rows whose extension is ".nii.gz"
layout_df = layout.to_df()
layout_df.loc[layout_df["extension"].eq(".nii.gz")]

### Sanity check for QPN BIDS data

In [None]:
# Sanity-check the exported BIDS layout table and restrict it to QPN participants.
# NOTE(review): the release folder is hard-coded here; keep it in sync with
# `current_release` when moving to a new release.
tabular_data_release_dir = "/home/nikhil/projects/Parkinsons/qpn/releases/Oct_2024/tabular/"
bids_layout_export_file = f"{tabular_data_release_dir}/bids_layout_export.csv"
demographics_file = f"{tabular_data_release_dir}/demographics.csv"


def _print_layout_counts(layout_table):
    """Print and return the number of distinct subjects and sessions (missing values ignored)."""
    subject_count = layout_table["subject"].nunique()
    session_count = layout_table["session"].nunique()
    print(f"number of participants in BIDS layout: {subject_count}")
    print(f"number of sessions in BIDS layout: {session_count}")
    return subject_count, session_count


bids_layout_df = pd.read_csv(bids_layout_export_file)

# counts before filtering
n_participants, n_sessions = _print_layout_counts(bids_layout_df)

## Filter QPN subjects
demo_df = pd.read_csv(demographics_file)
# Drop missing IDs: isin() matches NaN to NaN, so a NaN here would keep every
# layout row that has no subject at all.
qpn_participants = demo_df.loc[demo_df["recruitment_cohort"] == "QPN", "participant_id"].dropna().unique()

n_qpn_participants = len(qpn_participants)
print(f"number of QPN participants: {n_qpn_participants}")

bids_layout_df = bids_layout_df[bids_layout_df["subject"].isin(qpn_participants)]

# counts after filtering
n_participants, n_sessions = _print_layout_counts(bids_layout_df)

# save filtered bids layout
bids_layout_df.to_csv(f"{tabular_data_release_dir}/bids_layout_export_qpn.csv", index=False)

bids_layout_df.head()