## Combine all diagnoses

This notebook takes the diagnoses from the image metadata, the clinical data, and the diagnosis summary sheet, and creates two diagnosis files: a ground-truth file (where all three sources agree) and a majority-vote file (where two of them agree).

In [1]:
import pandas as pd
import math
# Clinical table with a per-visit DX column; PHASE is renamed so the column is
# spelled "Phase", the name the merge with `comb` joins on.
clinical = pd.read_csv("ADSP_PHC_COGN.csv")
clinical = clinical.rename(columns={"PHASE": "Phase"})

# Metadata file that one gets when downloading MRI images from ADNI.
img = pd.read_csv("metadata.csv")

# Diagnosis summary sheet, reduced to the columns that link RID to PTID per phase.
comb = (
    pd.read_csv("DXSUM_PDXCONV_ADNIALL.csv")
    .rename(columns={"PHASE": "Phase"})
    .loc[:, ["RID", "PTID", "Phase"]]
)

In [2]:
def read_diagnose(file_path: str = 'DXSUM_PDXCONV_ADNIALL.csv', verbose=False):
    """Build a ``{PTID: diagnosis}`` dictionary from a diagnostic summary CSV.

    Rows are processed in ascending ``update_stamp`` order, so when a PTID has
    several rows the value kept comes from the row that sorts last among those
    with a non-missing ``DIAGNOSIS``. Rows sharing the same ``update_stamp``
    keep their file order (stable sort), which makes the result reproducible.

    Args:
        file_path: Path to the CSV. It must provide the columns ``PTID``,
            ``PHASE`` (or ``Phase``), ``DIAGNOSIS`` and ``update_stamp``.
        verbose: When true, print the per-class counts of the result.

    Returns:
        dict mapping each PTID to its retained diagnosis code. Keys appear in
        the order in which each PTID first received a non-missing diagnosis.

    Raises:
        ValueError: If a row belongs to a study phase that is not recognised.
            (Raised instead of calling ``exit``, which would take down the
            notebook kernel rather than report the problem.)
    """
    # Every recognised phase stores its diagnosis in the same DIAGNOSIS column,
    # so a membership check replaces the per-phase branches.
    known_phases = {"ADNI1", "ADNIGO", "ADNI2", "ADNI3", "ADNI4"}

    diagnostic_summary = (
        pd.read_csv(file_path, index_col='PTID')
        .rename(columns={"PHASE": "Phase"})
        # mergesort is stable: ties on update_stamp are resolved by file order.
        .sort_values(by=["update_stamp"], ascending=True, kind="mergesort")
    )

    diagnostic_dict: dict = {}
    rows = zip(
        diagnostic_summary.index,
        diagnostic_summary["Phase"],
        diagnostic_summary["DIAGNOSIS"],
    )
    for ptid, phase, diagnosis in rows:
        if phase not in known_phases:
            raise ValueError(f"Not recognized study phase {phase}")
        # Later rows overwrite earlier ones; missing diagnoses never overwrite.
        if not math.isnan(diagnosis):
            diagnostic_dict[ptid] = diagnosis

    if verbose:
        print_diagnostic_dict_summary(diagnostic_dict)
    return diagnostic_dict


def print_diagnostic_dict_summary(diagnostic_dict: dict):
    """Print the size of ``diagnostic_dict`` and how many values equal 1, 2 and 3.

    The three codes are reported under the labels NL, MCI and AD respectively;
    values matching none of them only contribute to the total.
    """
    codes = list(diagnostic_dict.values())
    print(f"Number of diagnosed patients: {len(codes)}\n")

    # One pass per code keeps each tally independent of the others.
    n_NL = sum(1 for code in codes if code == 1)
    n_MCI = sum(1 for code in codes if code == 2)
    n_AD = sum(1 for code in codes if code == 3)

    print(f"Number of NL patients: {n_NL}\n"
          f"Number of MCI patients: {n_MCI}\n"
          f"Number of AD patients: {n_AD}\n")

In [3]:
# verbose=True makes read_diagnose print the per-class summary itself before returning.
d = read_diagnose(verbose=True)

Number of diagnosed patients: 3227

Number of NL patients: 1225
Number of MCI patients: 1069
Number of AD patients: 933



In [4]:
new = pd.DataFrame.from_dict(d, orient='index').reset_index()
print(new)

            index    0
0      011_S_0002  2.0
1      011_S_0003  3.0
2      011_S_0005  1.0
3      011_S_0008  2.0
4      022_S_0007  3.0
...           ...  ...
3222  016_S_10324  1.0
3223  114_S_10321  1.0
3224  123_S_10292  2.0
3225  035_S_10329  3.0
3226  082_S_10219  3.0

[3227 rows x 2 columns]


In [5]:
clinical.head()

Unnamed: 0,RID,SUBJECT_KEY,Phase,VISCODE,VISCODE2,EXAMDATE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,AGE,DX,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP,update_stamp
0,2,ADNI_011_S_0002,ADNI2,v06,m72,2011-09-19,1,16.0,2,5,80.4682,1.0,0.09,0.002,0.46,0.264,2021-04-14 13:05:55.0
1,2,ADNI_011_S_0002,ADNI2,v21,m96,2013-09-09,1,16.0,2,5,82.4422,1.0,0.288,-0.495,0.46,,2021-04-14 13:05:56.0
2,2,ADNI_011_S_0002,ADNI1,m36,m36,2008-08-27,1,16.0,2,5,77.4073,1.0,0.344,0.419,0.535,,2021-04-14 13:05:56.0
3,2,ADNI_011_S_0002,ADNI2,v41,m120,2015-09-22,1,16.0,2,5,84.4764,1.0,0.382,-0.549,0.595,-0.333,2021-04-14 13:05:56.0
4,2,ADNI_011_S_0002,ADNIGO,m60,m60,2010-09-22,1,16.0,2,5,79.4771,1.0,0.101,0.066,0.62,0.264,2021-04-14 13:05:56.0


In [6]:
# EXAMDATE values in this table start with the four-digit year (e.g. "2011-09-19").
clinical["year"] = clinical["EXAMDATE"].str.slice(stop=4)

In [7]:
# Derive a participant ID from SUBJECT_KEY by stripping the "ADNI_" prefix and
# turning every lowercase "s" into "S" (keys look like "ADNI_011_S_0002").
# NOTE(review): this column is dropped again right after the merge with `comb`,
# where PTID is renamed to "Subject" instead, so it looks unused - confirm.
clinical["Subject"] = clinical["SUBJECT_KEY"].str.replace("ADNI_", "").str.replace("s", "S")

In [8]:
# Attach PTID to the clinical rows via (RID, Phase). This is the default inner
# join, so rows whose key is missing on either side are discarded.
# NOTE(review): `comb` was only reduced to three columns, not de-duplicated, so
# presumably repeated (RID, Phase) keys multiply the clinical rows here and the
# copies are only removed by the later drop_duplicates() - confirm.
c = comb.merge(clinical, on = ["RID", "Phase"])

In [9]:
c = c.drop("Subject", axis =1)

In [10]:
c = c.rename(columns = {"PTID":"Subject"})

In [11]:
# Keeps everything from the 6th character of EXAMDATE onward and removes "/".
# NOTE(review): the clinical "year" column takes the FIRST four characters instead.
# Presumably the image dates use another layout (e.g. M/D/YYYY); if so this slice
# only yields a clean year for some date widths - verify against metadata.csv.
img["year"] = img["EXAMDATE"].str[5:].str.replace("/", "")

In [12]:
img = img.replace(["CN", "MCI", "AD"], [ 0, 1, 2])

In [13]:
# Shift DX down by one so its codes line up with the 0/1/2 encoding used for the
# image labels (CN/MCI/AD). Presumably DX is coded 1/2/3 on input - confirm.
# Not idempotent: re-running this cell shifts the codes again.
c["DX"] = c["DX"].sub(1)

In [14]:
# Diagnosis-sheet codes 1/2/3 (NL/MCI/AD in the summary printer) become integer
# 0/1/2, matching the CN/MCI/AD -> 0/1/2 encoding applied to the image labels.
# Not idempotent: re-running this cell shifts the codes again.
new[0] = new[0].astype(int).sub(1)
print(new)

            index  0
0      011_S_0002  1
1      011_S_0003  2
2      011_S_0005  0
3      011_S_0008  1
4      022_S_0007  2
...           ... ..
3222  016_S_10324  0
3223  114_S_10321  0
3224  123_S_10292  1
3225  035_S_10329  2
3226  082_S_10219  2

[3227 rows x 2 columns]


In [15]:
new = new.rename(columns = {"index":"Subject", 0:"GroupN"})
print(new)

          Subject  GroupN
0      011_S_0002       1
1      011_S_0003       2
2      011_S_0005       0
3      011_S_0008       1
4      022_S_0007       2
...           ...     ...
3222  016_S_10324       0
3223  114_S_10321       0
3224  123_S_10292       1
3225  035_S_10329       2
3226  082_S_10219       2

[3227 rows x 2 columns]


In [16]:
img = img.rename(columns = {"PTID":"Subject", "RECNO":"Group"})

In [17]:
# Outer join on Subject: a subject present in only one of the two sources is
# kept, with NaN in the columns coming from the other source.
m = new.merge(c, on = "Subject", how = "outer")
print(m)

          Subject  GroupN    RID  Phase SUBJECT_KEY VISCODE VISCODE2  \
0      002_S_0295     0.0  295.0  ADNI1         NaN      bl       bl   
1      002_S_0295     0.0  295.0  ADNI1         NaN     m06      m06   
2      002_S_0295     0.0  295.0  ADNI1         NaN     m12      m12   
3      002_S_0295     0.0  295.0  ADNI1         NaN     m24      m24   
4      002_S_0295     0.0  295.0  ADNI1         NaN     m36      m36   
...           ...     ...    ...    ...         ...     ...      ...   
50976  941_S_7074     0.0    NaN    NaN         NaN     NaN      NaN   
50977  941_S_7085     1.0    NaN    NaN         NaN     NaN      NaN   
50978  941_S_7087     0.0    NaN    NaN         NaN     NaN      NaN   
50979  941_S_7091     0.0    NaN    NaN         NaN     NaN      NaN   
50980  941_S_7106     1.0    NaN    NaN         NaN     NaN      NaN   

         EXAMDATE  PTGENDER  PTEDUCAT  PTETHCAT  PTRACCAT      AGE   DX  \
0      2006-05-09       1.0      18.0       2.0       5.0  8

In [18]:
m[["GroupN", "DX"]]

Unnamed: 0,GroupN,DX
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
50976,0.0,
50977,1.0,
50978,0.0,
50979,0.0,


In [19]:
# Collapse to one row per distinct (Subject, GroupN, DX, Phase) combination;
# all other columns of the merge are dropped here.
m = m.loc[:, ["Subject", "GroupN", "DX", "Phase"]].drop_duplicates()

In [20]:
# how="all" drops a row only when BOTH GroupN and DX are missing, so rows with a
# single available diagnosis are kept.
m = m.dropna(subset = ["GroupN", "DX"], how="all").drop_duplicates()
m

Unnamed: 0,Subject,GroupN,DX,Phase
0,002_S_0295,0.0,0.0,ADNI1
36,002_S_0295,0.0,0.0,ADNI2
40,002_S_0413,0.0,0.0,ADNI1
76,002_S_0413,0.0,0.0,ADNI2
101,002_S_0413,0.0,0.0,ADNI3
...,...,...,...,...
50976,941_S_7074,0.0,,
50977,941_S_7085,1.0,,
50978,941_S_7087,0.0,,
50979,941_S_7091,0.0,,


In [22]:

m

Unnamed: 0,Subject,GroupN,DX,Phase
0,002_S_0295,0.0,0.0,ADNI1
36,002_S_0295,0.0,0.0,ADNI2
40,002_S_0413,0.0,0.0,ADNI1
76,002_S_0413,0.0,0.0,ADNI2
101,002_S_0413,0.0,0.0,ADNI3
...,...,...,...,...
50976,941_S_7074,0.0,,
50977,941_S_7085,1.0,,
50978,941_S_7087,0.0,,
50979,941_S_7091,0.0,,


In [24]:

# Rows where both sources give the same label. NaN never compares equal, so rows
# missing either diagnosis are excluded.
m3 = m.loc[m["GroupN"].eq(m["DX"])]


In [26]:
m3 = m3[["Subject", "GroupN", "DX", "Phase"]]
m3

Unnamed: 0,Subject,GroupN,DX,Phase
0,002_S_0295,0.0,0.0,ADNI1
36,002_S_0295,0.0,0.0,ADNI2
40,002_S_0413,0.0,0.0,ADNI1
76,002_S_0413,0.0,0.0,ADNI2
101,002_S_0413,0.0,0.0,ADNI3
...,...,...,...,...
50950,941_S_6580,0.0,0.0,ADNI3
50960,941_S_6581,0.0,0.0,ADNI3
50964,941_S_6607,0.0,0.0,ADNI3
50965,941_S_6803,1.0,1.0,ADNI3


In [27]:
m5 = m3
i = m5

In [28]:
i = i.drop_duplicates()

In [29]:
i

Unnamed: 0,Subject,GroupN,DX,Phase
0,002_S_0295,0.0,0.0,ADNI1
36,002_S_0295,0.0,0.0,ADNI2
40,002_S_0413,0.0,0.0,ADNI1
76,002_S_0413,0.0,0.0,ADNI2
101,002_S_0413,0.0,0.0,ADNI3
...,...,...,...,...
50950,941_S_6580,0.0,0.0,ADNI3
50960,941_S_6581,0.0,0.0,ADNI3
50964,941_S_6607,0.0,0.0,ADNI3
50965,941_S_6803,1.0,1.0,ADNI3


In [30]:
# Write only the agreed label per subject and phase. The DataFrame index is written
# as the unnamed first column (to_csv default).
i.to_csv("ground_truth.csv", columns=["Subject", "GroupN", "Phase"])

In [None]:
# NOTE(review): m5 is the subset of m's rows where GroupN == DX, with the same
# columns. When the cells run top to bottom, this update() writes back values m
# already holds and appears to be a no-op - confirm and consider removing.
m.update(m5[~m5.index.duplicated(keep='first')])

In [38]:
indexes = m.index

In [35]:
# Initialise GROUP to -1, the marker for "the diagnoses do not agree"; rows whose
# GroupN and DX agree get their shared value as GROUP afterwards.
# NOTE(review): the original comment spoke of three diagnoses, but only the two
# columns GroupN and DX are compared in the visible cells.
print(m)
m["GROUP"] = -1

          Subject  GroupN   DX  Phase  GROUP
0      002_S_0295     0.0  0.0  ADNI1     -1
36     002_S_0295     0.0  0.0  ADNI2     -1
40     002_S_0413     0.0  0.0  ADNI1     -1
76     002_S_0413     0.0  0.0  ADNI2     -1
101    002_S_0413     0.0  0.0  ADNI3     -1
...           ...     ...  ...    ...    ...
50976  941_S_7074     0.0  NaN    NaN     -1
50977  941_S_7085     1.0  NaN    NaN     -1
50978  941_S_7087     0.0  NaN    NaN     -1
50979  941_S_7091     0.0  NaN    NaN     -1
50980  941_S_7106     1.0  NaN    NaN     -1

[4790 rows x 5 columns]


In [None]:
# GROUP is the shared label where the diagnosis-sheet value (GroupN) and the
# clinical value (DX) agree, and -1 everywhere else. NaN never compares equal,
# so rows missing either diagnosis stay at -1.
# Computed column-wise instead of looping over `indexes` with m.loc[i]: the loop
# did one label lookup and one write per row, and it rebound the name `i`, which
# earlier cells use for the ground-truth frame.
agree = m["GroupN"] == m["DX"]
m["GROUP"] = m["GroupN"].where(agree, -1).astype("int64")


Unnamed: 0,Subject,GroupN,DX,Phase,GROUP
0,002_S_0295,0.0,0.0,ADNI1,0
36,002_S_0295,0.0,0.0,ADNI2,0
40,002_S_0413,0.0,0.0,ADNI1,0
76,002_S_0413,0.0,0.0,ADNI2,0
101,002_S_0413,0.0,0.0,ADNI3,0
...,...,...,...,...,...
50976,941_S_7074,0.0,,,-1
50977,941_S_7085,1.0,,,-1
50978,941_S_7087,0.0,,,-1
50979,941_S_7091,0.0,,,-1


In [41]:
# Keep the first row for each index label.
# NOTE(review): m5 comes from boolean filtering of m, so its labels are presumably
# already unique and this filter changes nothing - confirm.
m5 = m5[~m5.index.duplicated(keep='first')]
m5

Unnamed: 0,Subject,GroupN,DX,Phase
0,002_S_0295,0.0,0.0,ADNI1
36,002_S_0295,0.0,0.0,ADNI2
40,002_S_0413,0.0,0.0,ADNI1
76,002_S_0413,0.0,0.0,ADNI2
101,002_S_0413,0.0,0.0,ADNI3
...,...,...,...,...
50950,941_S_6580,0.0,0.0,ADNI3
50960,941_S_6581,0.0,0.0,ADNI3
50964,941_S_6607,0.0,0.0,ADNI3
50965,941_S_6803,1.0,1.0,ADNI3


In [None]:
# Rows whose GROUP is not the -1 "no agreement" marker.
# NOTE(review): the saved output under this cell lists rows with GROUP == -1, i.e.
# the opposite selection. The cell was probably edited after it last ran; confirm
# which view is intended.
m[m["GROUP"] != -1]

Unnamed: 0,Subject,GroupN,DX,Phase,GROUP
176,002_S_0729,2.0,1.0,ADNI1,-1
283,002_S_0954,2.0,1.0,ADNI1,-1
328,002_S_1070,2.0,1.0,ADNI1,-1
435,002_S_1261,1.0,0.0,ADNI1,-1
460,002_S_1261,1.0,0.0,ADNIGO,-1
...,...,...,...,...,...
50976,941_S_7074,0.0,,,-1
50977,941_S_7085,1.0,,,-1
50978,941_S_7087,0.0,,,-1
50979,941_S_7091,0.0,,,-1


In [44]:
# Export both source labels plus the combined GROUP (-1 where they disagree). The
# DataFrame index is written as the unnamed first column (to_csv default).
m.to_csv("diagnosis_full.csv", columns=["Subject", "GroupN", "DX", "GROUP", "Phase"])