### Notebook to collate the normed data from the different assessments in the neuropsych battery

In [None]:
import json
import os

import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Root of the QPN dataset on the local machine
dataset_dir = "/home/nikhil/projects/Parkinsons/qpn"

# manifest (list of available participants / sessions)
tabular_dir = dataset_dir + "/tabular"
manifest_file = tabular_dir + "/manifest.csv"

# neuropsych norming outputs
# NOTE: neuropych_dir ends with "/", so the two paths derived from it contain "//"
neuropych_dir = tabular_dir + "/assessments/neuropysch/RPQ_neuropsych_norming/"
neuropych_json_dir = neuropych_dir + "/JSONs"
normed_scores_dir = neuropych_dir + "/normed_scores"

# column holding the participant id in the normed score sheets
participant_id_col = "Patient #"


### Read manifest (available subjects)

In [None]:
# Load the manifest and collect the distinct participants and sessions it lists
manifest = pd.read_csv(manifest_file)
qpn_participants, sessions = (
    manifest[column].unique() for column in ("participant_id", "session")
)

print(f"n_participants: {len(qpn_participants)}, unique sessions: {sessions}")
manifest.head()

### Check available normed assessments and their config

### Fixed issues:
The "normed_data" file names were corrected (old --> new) for:

    i. Stroop_DKefs_Cond_3_INK_Time_sec.xlsx --> Stroop_DKefs_Cond_3_INK_Time_Normed.xlsx
    ii. Stroop_DKefs_Cond_1_COLORS_Time_sec.xlsx --> Stroop_DKefs_Cond_1_COLORS_Time_Normed.xlsx
    iii. Stroop_DKefs_Cond_3_Total_errors.xlsx --> Stroop_DKefs_Cond_3_Total_errors_Normed.xlsx


In [None]:
# Keep only the JSON configs: os.listdir returns every directory entry in
# arbitrary order, so stray non-JSON files would break the collation loop and
# the row order of the collated output would not be reproducible.
json_files = sorted(
    file_name
    for file_name in os.listdir(neuropych_json_dir)
    if file_name.lower().endswith(".json")
)
print(f"n_json_files: {len(json_files)}")

In [None]:
def get_assessment_info(json_path):
    """Read one assessment config and return the fields needed for collation.

    The config is parsed with the standard ``json`` module rather than
    ``pd.read_json``: the file is a nested configuration object, not a table,
    and forcing it into a DataFrame fails on configs that mix objects and
    lists at the top level and can hide missing keys behind NaN values.

    Args:
        json_path: Path to a JSON config containing an "instrument" object
            (with "norming_procedure", "raw_score_name" and
            "normed_score_name") and a "data_paths" object (with
            "normed_data").

    Returns:
        Tuple ``(raw_score_col, normed_score_col, normed_file_name,
        norming_procedure)``, where ``normed_file_name`` is the base name of
        the "normed_data" path.

    Raises:
        KeyError: If one of the expected keys is missing from the config.
        ValueError: If the file is not valid JSON.
    """
    with open(json_path, encoding="utf-8") as config_file:
        info = json.load(config_file)

    instrument = info["instrument"]
    data_paths = info["data_paths"]

    norming_procedure = instrument["norming_procedure"]
    raw_score_col = instrument["raw_score_name"]
    normed_score_col = instrument["normed_score_name"]

    # Only the file name is kept; the caller joins it onto its own directory.
    normed_file_name = os.path.basename(data_paths["normed_data"])

    return raw_score_col, normed_score_col, normed_file_name, norming_procedure

In [None]:
# Fix participant ids
participant_id_replace_dict = {"PD00119/T1":"PD00119"}
participant_id_drop_list = ["PD00119/T2","PD00"]

# Collate normed scores: one long-form frame with a row per
# (participant, assessment) holding the raw and normed score.
scores_df_list = []
for json_file in json_files:
    print(f"assessment: {json_file}")
    if json_file in ["TMT_AB_contrast_config.json"]:
        print(f"Ignoring contrast instrument with two raw score cols: {json_file}")
        continue

    json_path = f"{neuropych_json_dir}/{json_file}"
    raw_score_col, normed_score_col, normed_file_name, norming_procedure = get_assessment_info(json_path)
    normed_data_file = f"{normed_scores_dir}/{normed_file_name}"

    # Work on an explicit copy so the column assignments below never write
    # into a view of the sheet that was read.
    _df = pd.read_excel(normed_data_file)
    _df = _df[[participant_id_col, raw_score_col, normed_score_col]].copy()

    # Normalise ids BEFORE applying the fixes: the replace / drop lookups and
    # the de-duplication compare exact strings, so whitespace-padded ids would
    # otherwise slip through and later clash as duplicates when pivoting.
    _df[participant_id_col] = _df[participant_id_col].astype(str).str.strip()
    _df[participant_id_col] = _df[participant_id_col].replace(participant_id_replace_dict)
    _df = _df[~_df[participant_id_col].isin(participant_id_drop_list)]

    ## drop duplicates (keep first i.e. baseline assessment for each participant)
    _df = _df.drop_duplicates(subset=[participant_id_col], keep="first").copy()

    _df = _df.rename(columns={normed_score_col: "normed_score", raw_score_col: "raw_score"})
    _df["norming_procedure"] = norming_procedure
    # assessment name = config file name up to its first "."
    _df["assessment"] = json_file.split(".")[0]
    scores_df_list.append(_df)

scores_df = pd.concat(scores_df_list, axis=0)

scores_df.head()
    

### Plots

In [None]:
# Reshape to one row per (participant, assessment, score type) for plotting
plot_df = scores_df.melt(
    id_vars=[participant_id_col, "assessment", "norming_procedure"],
    value_vars=["normed_score", "raw_score"],
    var_name="score_type",
    value_name="score",
)
plot_df = plot_df.sort_values(by=["score_type", "norming_procedure", "assessment"])

# raw scores in the left panel, normed scores in the right panel
col_order = ["raw_score","normed_score"]
sns.set(font_scale=1.5)
with sns.axes_style("whitegrid"):
    # one strip plot per score type; x axes are independent (sharex=False)
    g = sns.catplot(
        data=plot_df,
        x="score",
        y="assessment",
        col="score_type",
        col_order=col_order,
        hue="norming_procedure",
        palette="Set1",
        kind="strip",
        height=10,
        sharex=False,
    )


### Find missing participants per assessment

In [None]:
# Non-null counts per assessment: participant ids, raw scores and normed scores.
# The id column is looked up through participant_id_col (instead of repeating
# the literal column name) so the rename keeps working if that setting changes.
score_availability_df = (
    scores_df.groupby(["assessment"])
    .count()
    .reset_index()
    .drop(columns=["norming_procedure"])
    .rename(
        columns={
            participant_id_col: "recruitment_count",
            "raw_score": "raw_score_count",
            "normed_score": "normed_score_count",
        }
    )
)

score_availability_df.head()

In [None]:
# Wide layout shared by both tables: one row per participant, one column per assessment
_wide_layout = {"index": participant_id_col, "columns": "assessment"}
raw_score_wide_df = scores_df.pivot(values="raw_score", **_wide_layout)
normed_score_wide_df = scores_df.pivot(values="normed_score", **_wide_layout)

In [None]:
# Write every collated table next to the norming inputs.
# The wide tables keep their index (participant id); the others do not.
for _frame, _file_name, _keep_index in (
    (scores_df, "collated_scores_longform.csv", False),
    (raw_score_wide_df, "raw_score_wideform.csv", True),
    (normed_score_wide_df, "normed_score_wideform.csv", True),
    (score_availability_df, "score_availability.csv", False),
):
    _frame.to_csv(f"{neuropych_dir}/{_file_name}", index=_keep_index)

### Compare with manifest

In [None]:
# Distinct participant ids that have at least one collated score row
neurospych_participants = scores_df.loc[:, participant_id_col].unique()
print(
    f"n_neurospych_participants: {len(neurospych_participants)}"
)
scores_df.head()

In [None]:
# Compare the participants with neuropsych scores against the manifest
_neuropsych_ids = set(neurospych_participants)
_manifest_ids = set(qpn_participants)

neuropsy_and_qpn = _neuropsych_ids.intersection(_manifest_ids)  # in both
not_in_qpn = _neuropsych_ids.difference(_manifest_ids)          # scores but not in manifest
not_in_neuropsy = _manifest_ids.difference(_neuropsych_ids)     # in manifest but no scores

print(f"n_neuropsy_and_qpn: {len(neuropsy_and_qpn)}, n_not_in_qpn: {len(not_in_qpn)}, n_not_in_neuropsy: {len(not_in_neuropsy)}")