## Combbine all diagnosis

This method take diagnosis from images, clinical, and diagnosis sheet, and creates one ground truth (where all three agree) and one majority vote (where two agree) diagnosis files. 

In [1]:
import pandas as pd
import math
clinical = pd.read_csv("ADSP_PHC_COGN.csv").rename(columns={"PHASE":"Phase"})
#this file is the metadata file that one can get from downloading MRI images from ADNI
img = pd.read_csv("metadata.csv")
comb = pd.read_csv("DXSUM_PDXCONV_ADNIALL.csv").rename(columns={"PHASE":"Phase"})
comb = comb[["RID", "PTID" , "Phase"]]

In [2]:
def read_diagnose(file_path: str = 'DXSUM_PDXCONV_ADNIALL.csv', verbose=False):
    # Read diagnostic summary
    diagnostic_summary = pd.read_csv(file_path, index_col='PTID').rename(columns={"PHASE":"Phase"})
    diagnostic_summary = diagnostic_summary.sort_values(by=["update_stamp"], ascending=True)
    # Create dictionary
    diagnostic_dict: dict = {}
    for key, data in diagnostic_summary.iterrows():
        # Iterate for each row of the document
        phase: str = data['Phase']
        diagnosis: float = -1.
        if phase == "ADNI1":
            diagnosis = data['DIAGNOSIS']
        elif phase == "ADNI2" or phase == "ADNIGO":
            diagnosis = data['DIAGNOSIS']
        elif phase == "ADNI3":
            diagnosis = data['DIAGNOSIS']
        elif phase == "ADNI4":
            diagnosis = data['DIAGNOSIS']
        else:
            print(f"ERROR: Not recognized study phase {phase}")
            exit(1)
        # Update dictionary
        if not math.isnan(diagnosis):
            diagnostic_dict[key] = diagnosis
    if verbose:
        print_diagnostic_dict_summary(diagnostic_dict)
    return diagnostic_dict


def print_diagnostic_dict_summary(diagnostic_dict: dict):
    print(f"Number of diagnosed patients: {len(diagnostic_dict.items())}\n")
    n_NL = 0
    n_MCI = 0
    n_AD = 0
    for (key, data) in diagnostic_dict.items():
        if data == 1:
            n_NL += 1
        if data == 2:
            n_MCI += 1
        if data == 3:
            n_AD += 1
    print(f"Number of NL patients: {n_NL}\n"
          f"Number of MCI patients: {n_MCI}\n"
          f"Number of AD patients: {n_AD}\n")

In [3]:
d = read_diagnose()
print_diagnostic_dict_summary(d)

Number of diagnosed patients: 3227

Number of NL patients: 1225
Number of MCI patients: 1069
Number of AD patients: 933



In [4]:
new = pd.DataFrame.from_dict(d, orient='index').reset_index()
print(new)

            index    0
0      011_S_0002  2.0
1      011_S_0003  3.0
2      011_S_0005  1.0
3      011_S_0008  2.0
4      022_S_0007  3.0
...           ...  ...
3222  016_S_10324  1.0
3223  114_S_10321  1.0
3224  123_S_10292  2.0
3225  035_S_10329  3.0
3226  082_S_10219  3.0

[3227 rows x 2 columns]


In [5]:
clinical.head()

Unnamed: 0,RID,SUBJECT_KEY,Phase,VISCODE,VISCODE2,EXAMDATE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,AGE,DX,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP,update_stamp
0,2,ADNI_011_S_0002,ADNI2,v06,m72,2011-09-19,1,16.0,2,5,80.4682,1.0,0.09,0.002,0.46,0.264,2021-04-14 13:05:55.0
1,2,ADNI_011_S_0002,ADNI2,v21,m96,2013-09-09,1,16.0,2,5,82.4422,1.0,0.288,-0.495,0.46,,2021-04-14 13:05:56.0
2,2,ADNI_011_S_0002,ADNI1,m36,m36,2008-08-27,1,16.0,2,5,77.4073,1.0,0.344,0.419,0.535,,2021-04-14 13:05:56.0
3,2,ADNI_011_S_0002,ADNI2,v41,m120,2015-09-22,1,16.0,2,5,84.4764,1.0,0.382,-0.549,0.595,-0.333,2021-04-14 13:05:56.0
4,2,ADNI_011_S_0002,ADNIGO,m60,m60,2010-09-22,1,16.0,2,5,79.4771,1.0,0.101,0.066,0.62,0.264,2021-04-14 13:05:56.0


In [6]:
clinical["year"] = clinical["EXAMDATE"].str[:4]

In [7]:
clinical["Subject"] = clinical["SUBJECT_KEY"].str.replace("ADNI_", "").str.replace("s", "S")

In [8]:
c = comb.merge(clinical, on = ["RID", "Phase"])

In [9]:
c = c.drop("Subject", axis =1)

In [10]:
c = c.rename(columns = {"PTID":"Subject"})

In [11]:
img["year"] = img["EXAMDATE"].str[5:].str.replace("/", "")

In [12]:
img = img.replace(["CN", "MCI", "AD"], [ 0, 1, 2])

In [13]:
c["DX"] = c["DX"] -1

In [14]:
new[0] = new[0].astype(int) -1
print(new)

            index  0
0      011_S_0002  1
1      011_S_0003  2
2      011_S_0005  0
3      011_S_0008  1
4      022_S_0007  2
...           ... ..
3222  016_S_10324  0
3223  114_S_10321  0
3224  123_S_10292  1
3225  035_S_10329  2
3226  082_S_10219  2

[3227 rows x 2 columns]


In [15]:
new = new.rename(columns = {"index":"Subject", 0:"GroupN"})
print(new)

          Subject  GroupN
0      011_S_0002       1
1      011_S_0003       2
2      011_S_0005       0
3      011_S_0008       1
4      022_S_0007       2
...           ...     ...
3222  016_S_10324       0
3223  114_S_10321       0
3224  123_S_10292       1
3225  035_S_10329       2
3226  082_S_10219       2

[3227 rows x 2 columns]


In [16]:
img = img.rename(columns = {"PTID":"Subject", "RECNO":"Group"})

In [17]:
m = new.merge(c, on = "Subject", how = "outer").merge(img, on = "Subject", how = "outer")
print(m)

            Subject  GroupN  RID_x  Phase      SUBJECT_KEY VISCODE_x  \
0        011_S_0002     1.0    2.0  ADNI1  ADNI_011_S_0002       m36   
1        011_S_0002     1.0    2.0  ADNI1  ADNI_011_S_0002        bl   
2        011_S_0002     1.0    2.0  ADNI1  ADNI_011_S_0002       m06   
3        011_S_0002     1.0    2.0  ADNI1  ADNI_011_S_0002       m36   
4        011_S_0002     1.0    2.0  ADNI1  ADNI_011_S_0002        bl   
...             ...     ...    ...    ...              ...       ...   
227139  941_S_10007     NaN    NaN    NaN              NaN       NaN   
227140  023_S_10126     NaN    NaN    NaN              NaN       NaN   
227141  037_S_10131     NaN    NaN    NaN              NaN       NaN   
227142  005_S_10240     NaN    NaN    NaN              NaN       NaN   
227143  305_S_10311     NaN    NaN    NaN              NaN       NaN   

       VISCODE2_x  EXAMDATE_x  PTGENDER  PTEDUCAT  ...  FLDSTRNGTH       ID  \
0             m36  2008-08-27       1.0      16.0  ...  

In [18]:
m[["GroupN", "DX", "Group"]]

Unnamed: 0,GroupN,DX,Group
0,1.0,0.0,
1,1.0,0.0,
2,1.0,0.0,
3,1.0,0.0,
4,1.0,0.0,
...,...,...,...
227139,,,
227140,,,
227141,,,
227142,,,


In [19]:
m = m[["Subject", "GroupN", "Group", "DX", "Phase"]].drop_duplicates()

In [20]:
m = m.dropna(subset = ["GroupN", "Group", "DX"], how="all").drop_duplicates()
m

Unnamed: 0,Subject,GroupN,Group,DX,Phase
0,011_S_0002,1.0,,0.0,ADNI1
9,011_S_0002,1.0,,0.0,ADNIGO
10,011_S_0002,1.0,,0.0,ADNI2
13,011_S_0002,1.0,,1.0,ADNI2
15,011_S_0002,1.0,,,ADNI2
...,...,...,...,...,...
227129,021_S_2296,,1.0,,
227130,128_S_2314,,1.0,,
227131,029_S_2370,,1.0,,
227132,022_S_2382,,1.0,,


In [21]:
m.loc[m["DX"].isna() & m["Group"].isna(), "Group"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]
m.loc[m["DX"].isna() & m["Group"].isna(), "DX"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]
m

Unnamed: 0,Subject,GroupN,Group,DX,Phase
0,011_S_0002,1.0,,0.0,ADNI1
9,011_S_0002,1.0,,0.0,ADNIGO
10,011_S_0002,1.0,,0.0,ADNI2
13,011_S_0002,1.0,,1.0,ADNI2
15,011_S_0002,1.0,1.0,,ADNI2
...,...,...,...,...,...
227129,021_S_2296,,1.0,,
227130,128_S_2314,,1.0,,
227131,029_S_2370,,1.0,,
227132,022_S_2382,,1.0,,


In [22]:
m1 = m[m["GroupN"] == m["Group"]]
m3 = m[m["GroupN"] == m["DX"]]
m4 = m[m["Group"] == m["DX"]]
m2 = m1[m1["Group"] == m1["DX"]]

In [23]:
m1 = m1[["Subject", "GroupN", "Group", "DX", "Phase"]]
m1

Unnamed: 0,Subject,GroupN,Group,DX,Phase
15,011_S_0002,1.0,1.0,,ADNI2
140,100_S_0015,1.0,1.0,0.0,ADNI1
265,100_S_0015,1.0,1.0,1.0,ADNI2
1259,100_S_0035,1.0,1.0,0.0,ADNI1
1384,100_S_0035,1.0,1.0,1.0,ADNIGO
...,...,...,...,...,...
227104,016_S_10324,0.0,0.0,,
227105,114_S_10321,0.0,0.0,,
227106,123_S_10292,1.0,1.0,,
227107,035_S_10329,2.0,2.0,,


In [24]:
m1.loc[m1["DX"].isna(), "DX"] = m1.loc[m1["DX"].isna(), "Group"]

In [25]:
m3 = m3[["Subject", "GroupN", "Group", "DX", "Phase"]]
m3

Unnamed: 0,Subject,GroupN,Group,DX,Phase
13,011_S_0002,1.0,,1.0,ADNI2
40,011_S_0002,1.0,,1.0,ADNI3
41,011_S_0003,2.0,,2.0,ADNI1
57,011_S_0005,0.0,,0.0,ADNI1
108,011_S_0008,1.0,,1.0,ADNI2
...,...,...,...,...,...
226471,013_S_6725,1.0,,1.0,ADNI3
226478,168_S_6851,1.0,,1.0,ADNI3
226500,035_S_6841,0.0,,0.0,ADNI3
226503,032_S_6855,2.0,,2.0,ADNI3


In [26]:
m3.loc[m3["Group"].isna(), "Group"] = m3.loc[m3["Group"].isna(), "GroupN"]

In [27]:
m4 = m4[["Subject", "GroupN", "Group", "DX", "Phase"]]
m4

Unnamed: 0,Subject,GroupN,Group,DX,Phase
265,100_S_0015,1.0,1.0,1.0,ADNI2
288,023_S_0030,2.0,1.0,1.0,ADNI1
1384,100_S_0035,1.0,1.0,1.0,ADNIGO
1455,067_S_0029,2.0,2.0,2.0,ADNI1
2175,023_S_0061,2.0,1.0,1.0,ADNI2
...,...,...,...,...,...
217385,051_S_5285,2.0,1.0,1.0,ADNI3
217707,100_S_2351,1.0,1.0,1.0,ADNIGO
217720,051_S_5294,1.0,1.0,1.0,ADNI3
217744,041_S_5026,2.0,1.0,1.0,ADNI2


In [28]:
m4[m4["GroupN"] != m4["DX"]]

Unnamed: 0,Subject,GroupN,Group,DX,Phase
288,023_S_0030,2.0,1.0,1.0,ADNI1
2175,023_S_0061,2.0,1.0,1.0,ADNI2
5156,131_S_0123,2.0,1.0,1.0,ADNI2
7250,032_S_0187,2.0,1.0,1.0,ADNI1
9170,136_S_0195,2.0,1.0,1.0,ADNI1
...,...,...,...,...,...
216850,002_S_4171,2.0,1.0,1.0,ADNI2
217361,051_S_5285,2.0,1.0,1.0,ADNI2
217385,051_S_5285,2.0,1.0,1.0,ADNI3
217744,041_S_5026,2.0,1.0,1.0,ADNI2


In [29]:
m2[["Subject", "GroupN", "Group", "DX", "Phase"]]

Unnamed: 0,Subject,GroupN,Group,DX,Phase
265,100_S_0015,1.0,1.0,1.0,ADNI2
1384,100_S_0035,1.0,1.0,1.0,ADNIGO
1455,067_S_0029,2.0,2.0,2.0,ADNI1
2495,067_S_0056,1.0,1.0,1.0,ADNI2
2605,067_S_0056,1.0,1.0,1.0,ADNI3
...,...,...,...,...,...
216784,135_S_5273,1.0,1.0,1.0,ADNI2
216986,032_S_5263,1.0,1.0,1.0,ADNI2
217063,027_S_5277,1.0,1.0,1.0,ADNI3
217707,100_S_2351,1.0,1.0,1.0,ADNIGO


In [30]:
m5 = pd.concat([m1,m3,m4])
i = m5[m5["Group"] == m5["GroupN"]]
i = i[i["Group"] == i["DX"]]

In [31]:
i = i.drop_duplicates()

In [32]:
i

Unnamed: 0,Subject,GroupN,Group,DX,Phase
15,011_S_0002,1.0,1.0,1.0,ADNI2
265,100_S_0015,1.0,1.0,1.0,ADNI2
1384,100_S_0035,1.0,1.0,1.0,ADNIGO
1455,067_S_0029,2.0,2.0,2.0,ADNI1
2495,067_S_0056,1.0,1.0,1.0,ADNI2
...,...,...,...,...,...
226471,013_S_6725,1.0,1.0,1.0,ADNI3
226478,168_S_6851,1.0,1.0,1.0,ADNI3
226500,035_S_6841,0.0,0.0,0.0,ADNI3
226503,032_S_6855,2.0,2.0,2.0,ADNI3


In [33]:
i[["Subject", "Group", "Phase"]].to_csv("ground_truth.csv")

In [34]:
m.update(m5[~m5.index.duplicated(keep='first')])

In [35]:
indexes = m.index

In [36]:
#if none of the three diagnosis agree, then we set the value to -1
m["GROUP"] = -1

In [37]:
for i in indexes:
    row = m.loc[i]
    if (row["GroupN"] == row["Group"]):
        val = row["GroupN"]
        
        m.loc[i, "GROUP"] = val
    elif (row["GroupN"] == row["DX"]):
        val = row["GroupN"]
        m.loc[i, "GROUP"] = val
        
    elif (row["Group"] == row["DX"]):
        val = row["Group"]
        m.loc[i, "GROUP"] = val
        

In [38]:
m5 = m5[~m5.index.duplicated(keep='first')]
m5

Unnamed: 0,Subject,GroupN,Group,DX,Phase
15,011_S_0002,1.0,1.0,1.0,ADNI2
140,100_S_0015,1.0,1.0,0.0,ADNI1
265,100_S_0015,1.0,1.0,1.0,ADNI2
1259,100_S_0035,1.0,1.0,0.0,ADNI1
1384,100_S_0035,1.0,1.0,1.0,ADNIGO
...,...,...,...,...,...
216850,002_S_4171,2.0,1.0,1.0,ADNI2
217361,051_S_5285,2.0,1.0,1.0,ADNI2
217385,051_S_5285,2.0,1.0,1.0,ADNI3
217744,041_S_5026,2.0,1.0,1.0,ADNI2


In [39]:
m[m["GROUP"] != -1]

Unnamed: 0,Subject,GroupN,Group,DX,Phase,GROUP
13,011_S_0002,1.0,1.0,1.0,ADNI2,1
15,011_S_0002,1.0,1.0,1.0,ADNI2,1
40,011_S_0002,1.0,1.0,1.0,ADNI3,1
41,011_S_0003,2.0,2.0,2.0,ADNI1,2
57,011_S_0005,0.0,0.0,0.0,ADNI1,0
...,...,...,...,...,...,...
227104,016_S_10324,0.0,0.0,0.0,,0
227105,114_S_10321,0.0,0.0,0.0,,0
227106,123_S_10292,1.0,1.0,1.0,,1
227107,035_S_10329,2.0,2.0,2.0,,2


In [40]:
m[["Subject", "GroupN", "Group", "DX", "GROUP", "Phase"]].to_csv("diagnosis_full.csv")