In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Penn Observer - Neuro Subset

In [0]:
# Load the Penn Observer cohort in one chain: normalise the MMSE column name
# to the lowercase "mmse" used by the other datasets, and add a "Label"
# column filled with pd.NA.
penn_neuro_pts = (
    pd.read_excel("../data/cognitive_test_scores.xlsx")
    .rename(columns={"MMSE": "mmse"})
    .assign(Label=pd.NA)
)
penn_neuro_pts.shape

In [0]:
# One bin per whole-point MMSE score: edges at -0.5, 0.5, ..., 30.5 so every
# integer in [0, 30] sits at the centre of its own bin.
# The previous np.linspace(0, 30, 15) edges were ~2.14 apart, so some bins
# captured three integer scores and others only two, which produced
# artificial peaks in the histogram.
bins = np.arange(-0.5, 31.5, 1.0)

fig, ax = plt.subplots()
ax.hist(penn_neuro_pts["mmse"], bins=bins, edgecolor="k")
ax.set_title("MMSE Scores for Penn Observer Cohort")
ax.set_xlabel("MMSE Score")
ax.set_ylabel("Frequency")
ax.set_xlim([-1, 31])
ax.grid()
plt.show()

### ADReSS

In [0]:
# ADReSS training metadata is split across two ';'-delimited files; the regex
# separator also swallows the whitespace padding around each ';'.
# Rows from the cc file are labelled 0 and rows from the cd file are labelled 1.
ctrl_grp = pd.read_csv(
    "/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSS-IS2020/train/cc_meta_data.txt",
    sep=r"\s*;\s*",
    engine="python",
).assign(Label=0)
expr_grp = pd.read_csv(
    "/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSS-IS2020/train/cd_meta_data.txt",
    sep=r"\s*;\s*",
    engine="python",
).assign(Label=1)

# Stack both groups and key the combined frame by participant ID.
adress_trn_pts = pd.concat([ctrl_grp, expr_grp]).set_index("ID")
adress_trn_pts.shape

In [0]:
# ADReSS test metadata: same ';'-delimited layout as the training files,
# indexed by participant ID.
adress_tst_pts = pd.read_csv(
    "/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSS-IS2020/test/meta_data.txt",
    sep=r"\s*;\s*",
    engine="python",
).set_index("ID")
adress_tst_pts.shape

In [0]:
# One bin per whole-point MMSE score: edges at -0.5, 0.5, ..., 30.5 so every
# integer in [0, 30] sits at the centre of its own bin.
# The previous np.linspace(0, 30, 15) edges were ~2.14 apart, so some bins
# captured three integer scores and others only two, which produced
# artificial peaks in the histogram.
bins = np.arange(-0.5, 31.5, 1.0)

fig, ax = plt.subplots()
ax.hist(
    [adress_trn_pts["mmse"], adress_tst_pts["mmse"]],
    bins=bins,
    label=["Train", "Test"],
    edgecolor="k",
)
ax.set_title("MMSE Scores for ADReSS Dataset")
ax.set_xlabel("MMSE Score")
ax.set_ylabel("Frequency")
ax.legend(loc="upper left")
ax.set_xlim([-1, 31])
ax.grid()
plt.show()

### ADReSSo

In [0]:
# ADReSSo training set: rename the unnamed leading column and the file-name
# column, then derive a binary Label from the diagnosis code
# ("cn" -> 0, "ad" -> 1; any other value maps to NaN).
adresso_trn_pts = (
    pd.read_csv("/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSSo-IS2021/diagnosis/train/adresso-train-mmse-scores-diagnoses.csv")
    .rename(columns={"Unnamed: 0": "ID Number", "adressfname": "ID"})
    .assign(Label=lambda frame: frame["dx"].map({"cn": 0, "ad": 1}))
)
adresso_trn_pts.shape

In [0]:
# ADReSSo test set ships MMSE scores and diagnoses in separate CSVs.
mmse_scores = pd.read_csv("/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSSo-IS2021/diagnosis/test/adresso-test-mmse-scores.csv")
diagnoses = pd.read_csv("/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/adresso/ADReSSo-IS2021/diagnosis/test/adresso-test-diagnoses.csv")

# Inner-join on participant ID (rows present in only one file are dropped),
# align column names with the training frame, and derive a binary Label
# ("Control" -> 0, "ProbableAD" -> 1; any other value maps to NaN).
adresso_tst_pts = (
    mmse_scores.merge(diagnoses, on="ID", how="inner")
    .rename(columns={"MMSE": "mmse", "Dx": "dx"})
    .assign(Label=lambda frame: frame["dx"].map({"Control": 0, "ProbableAD": 1}))
)
adresso_tst_pts.shape

In [0]:
# One bin per whole-point MMSE score: edges at -0.5, 0.5, ..., 30.5 so every
# integer in [0, 30] sits at the centre of its own bin.
# The previous np.linspace(0, 30, 15) edges were ~2.14 apart, so some bins
# captured three integer scores and others only two, which produced
# artificial peaks in the histogram.
bins = np.arange(-0.5, 31.5, 1.0)

fig, ax = plt.subplots()
ax.hist(
    [adresso_trn_pts["mmse"], adresso_tst_pts["mmse"]],
    bins=bins,
    label=["Train", "Test"],
    edgecolor="k",
)
ax.set_title("MMSE Scores for ADReSSo Dataset")
ax.set_xlabel("MMSE Score")
ax.set_ylabel("Frequency")
ax.legend(loc="upper left")
ax.set_xlim([-1, 31])
ax.grid()
plt.show()

### Summary

In [0]:
# One bin per whole-point MMSE score: edges at -0.5, 0.5, ..., 30.5 so every
# integer in [0, 30] sits at the centre of its own bin.
# The previous np.linspace(0, 30, 15) edges were ~2.14 apart, so some bins
# captured three integer scores and others only two, which produced
# artificial peaks in the histogram.
bins = np.arange(-0.5, 31.5, 1.0)

# Wide figure: five datasets are drawn side by side within every bin.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(
    [
        penn_neuro_pts["mmse"],
        adress_trn_pts["mmse"],
        adress_tst_pts["mmse"],
        adresso_trn_pts["mmse"],
        adresso_tst_pts["mmse"],
    ],
    bins=bins,
    label=["Penn Observer", "ADReSS-train", "ADReSS-test", "ADReSSo-train", "ADReSSo-test"],
    edgecolor="k",
)
ax.set_title("MMSE Scores per Dataset")
ax.set_xlabel("MMSE Score")
ax.set_ylabel("Frequency")
ax.legend(loc="upper left")
ax.set_xlim([-1, 31])
ax.grid()
plt.show()

In [0]:
def summarize(df):
    """Summarise cohort size, AD prevalence and MMSE distribution for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Label"`` column (1 = AD, 0 = control, missing =
        unknown) and a numeric ``"mmse"`` column.

    Returns
    -------
    dict
        Keys ``"n"``, ``"pct AD"``, ``"avg MMSE"``, ``"std MMSE"``,
        ``"min MMSE"`` and ``"max MMSE"``.

        ``"pct AD"`` is the fraction of AD cases among rows with a known
        label, and NaN when no row is labelled. Previously the NA-skipping
        sum was divided by the total row count, so a fully unlabelled cohort
        was reported as 0% AD.

        ``"min MMSE"`` / ``"max MMSE"`` are ints, or NaN when every MMSE
        value is missing (previously ``int(nan)`` raised ``ValueError``).
    """
    # Only rows with a known diagnosis contribute to the prevalence estimate.
    known_labels = df["Label"].dropna()
    if len(known_labels):
        pct_ad = known_labels.astype(float).mean()
    else:
        pct_ad = float("nan")

    mmse = df["mmse"]
    lowest, highest = mmse.min(), mmse.max()

    return {
        "n": df.shape[0],
        "pct AD": pct_ad,
        "avg MMSE": mmse.mean(),
        "std MMSE": mmse.std(),
        "min MMSE": lowest if pd.isna(lowest) else int(lowest),
        "max MMSE": highest if pd.isna(highest) else int(highest),
    }

# Pair each display name with its frame so the row labels cannot drift out of
# step with the order in which the datasets are summarised.
cohorts = {
    "Penn Observer": penn_neuro_pts,
    "ADReSS-train": adress_trn_pts,
    "ADReSS-test": adress_tst_pts,
    "ADReSSo-train": adresso_trn_pts,
    "ADReSSo-test": adresso_tst_pts,
}

summ = [summarize(frame) for frame in cohorts.values()]

# One row per dataset, one column per statistic.
summary = pd.DataFrame(summ, index=list(cohorts))
summary.round(2)