# Basic Modeling with the ADNI Clinical Features

Key Documentation PDF: https://adni.loni.usc.edu/wp-content/themes/freshnews-dev-v2/documents/bio/inst_about_data.pdf

and the ADNIMERGE_Methods PDF which is in the related Google Drive Folder

In [1]:
# Imports
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import sklearn as sk
import math
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [2]:
drive.mount('/content/drive')
# full_df = pd.read_csv("/content/drive/MyDrive/Data_Science_Alzheimers_ADNI/ADNIMERGE_08Feb2024.csv")
#running as Liezl
full_df = pd.read_csv("/content/drive/MyDrive/ADNIMERGE_08Feb2024.csv")
full_df.head()

Mounted at /content/drive


  full_df = pd.read_csv("/content/drive/MyDrive/ADNIMERGE_08Feb2024.csv")


Unnamed: 0,RID,COLPROT,ORIGPROT,PTID,SITE,VISCODE,EXAMDATE,DX_bl,AGE,PTGENDER,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,FBB_bl,Years_bl,Month_bl,Month,M,update_stamp
0,2,ADNI1,ADNI1,011_S_0002,11,bl,2005-09-08,CN,74.3,Male,...,,1.33615,,,,0.0,0.0,0,0,2023-07-07 04:59:40.0
1,3,ADNI1,ADNI1,011_S_0003,11,bl,2005-09-12,AD,81.3,Male,...,22.83,1.1086,,,,0.0,0.0,0,0,2023-07-07 04:59:40.0
2,3,ADNI1,ADNI1,011_S_0003,11,m06,2006-03-13,AD,81.3,Male,...,22.83,1.1086,,,,0.498289,5.96721,6,6,2023-07-07 04:59:40.0
3,3,ADNI1,ADNI1,011_S_0003,11,m12,2006-09-12,AD,81.3,Male,...,22.83,1.1086,,,,0.999316,11.9672,12,12,2023-07-07 04:59:40.0
4,3,ADNI1,ADNI1,011_S_0003,11,m24,2007-09-12,AD,81.3,Male,...,22.83,1.1086,,,,1.99863,23.9344,24,24,2023-07-07 04:59:40.0


In [3]:
# print column labels nicely
i = 1
for col in sorted(full_df.columns):
  print(col, end="  ")
  if i % 7 == 0:
    print('')
  i = i+1

ABETA  ABETA_bl  ADAS11  ADAS11_bl  ADAS13  ADAS13_bl  ADASQ4  
ADASQ4_bl  AGE  APOE4  AV45  AV45_bl  CDRSB  CDRSB_bl  
COLPROT  DIGITSCOR  DIGITSCOR_bl  DX  DX_bl  EXAMDATE  EXAMDATE_bl  
EcogPtDivatt  EcogPtDivatt_bl  EcogPtLang  EcogPtLang_bl  EcogPtMem  EcogPtMem_bl  EcogPtOrgan  
EcogPtOrgan_bl  EcogPtPlan  EcogPtPlan_bl  EcogPtTotal  EcogPtTotal_bl  EcogPtVisspat  EcogPtVisspat_bl  
EcogSPDivatt  EcogSPDivatt_bl  EcogSPLang  EcogSPLang_bl  EcogSPMem  EcogSPMem_bl  EcogSPOrgan  
EcogSPOrgan_bl  EcogSPPlan  EcogSPPlan_bl  EcogSPTotal  EcogSPTotal_bl  EcogSPVisspat  EcogSPVisspat_bl  
Entorhinal  Entorhinal_bl  FAQ  FAQ_bl  FBB  FBB_bl  FDG  
FDG_bl  FLDSTRENG  FLDSTRENG_bl  FSVERSION  FSVERSION_bl  Fusiform  Fusiform_bl  
Hippocampus  Hippocampus_bl  ICV  ICV_bl  IMAGEUID  IMAGEUID_bl  LDELTOTAL  
LDELTOTAL_BL  M  MMSE  MMSE_bl  MOCA  MOCA_bl  MidTemp  
MidTemp_bl  Month  Month_bl  ORIGPROT  PIB  PIB_bl  PTAU  
PTAU_bl  PTEDUCAT  PTETHCAT  PTGENDER  PTID  PTMARRY  PTRACCAT  
RAVLT_

In [4]:
# shows the possible values for visits
print(full_df['VISCODE'].unique())

['bl' 'm06' 'm12' 'm24' 'm18' 'm36' 'm48' 'm60' 'm03' 'm30' 'm84' 'm42'
 'm72' 'm54' 'm66' 'm78' 'm108' 'm96' 'm90' 'm120' 'm114' 'm102' 'm126'
 'm132' 'm144' 'm156' 'm168' 'm180' 'm174' 'm162' 'm186' 'm138' 'm150'
 'm0' 'm192' 'm198' 'm204']


In [5]:
# Diagnosis delta
dx_delta_df = full_df[full_df['DX_bl'] != full_df["DX"]]
print("   total changes: ",len(dx_delta_df),'\n' ,"unique patients:  ", len(dx_delta_df['PTID'].unique()))

   total changes:  13636 
 unique patients:   2392


In [6]:
# List Diagnosis Possibilities
print('Baseline Diagnosis Options: ', full_df['DX_bl'].unique())
print('Later Diagnosis Options   : ',full_df['DX'].unique())


Baseline Diagnosis Options:  ['CN' 'AD' 'LMCI' 'SMC' 'EMCI' nan]
Later Diagnosis Options   :  ['CN' 'Dementia' 'MCI' nan]


In [7]:
# How many patients changed from Cognitevely Normal (CN) to Dementia? (turns out 85 patients)
start_cn_patients = full_df[full_df['DX_bl'] == 'CN']
cn_to_ad_patients = start_cn_patients[start_cn_patients['DX'] == 'Dementia']
print(len(cn_to_ad_patients))

cn_to_ad_patient_ids = cn_to_ad_patients['PTID']

85


In [8]:
# How many patients changed from Cognitevely Normal (CN) to Mild Cognitive Impairment (MCI)? (turns out 271 patients)
start_cn_patients = full_df[full_df['DX_bl'] == 'CN']
cn_to_mci_patients = start_cn_patients[start_cn_patients['DX'] == 'MCI']
print(len(cn_to_mci_patients))

cn_to_mci_patients = cn_to_ad_patients['PTID']

271


## Begin Modeling based on the patient baseline visits

Note: With the help of Dr. Talbert, we have decided to limit our data just to the baseline visits and ignore the time dimensionality of our data

##  Data Cleaning
* encode values: any string values need to be **hot encoded,** and numeric values need to be **normalized**
*  narrow the columns to interesting ones for the SVM (Support Vector machine) to use for its predictions
* for now we want to keep the participant rosterID and the participantID to be able to look up their information later

In [9]:
# select the baseline entries for each patient (yields 2430 patients)
baseline_df = full_df[full_df['VISCODE'] == 'bl']


In [10]:
# which columns are the datatype of object? (because they will probably be categorical features
#     that need to be encoded, or need data cleaning attention)
baseline_df.select_dtypes(include=object).columns

Index(['COLPROT', 'ORIGPROT', 'PTID', 'VISCODE', 'EXAMDATE', 'DX_bl',
       'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'ABETA', 'TAU', 'PTAU',
       'FLDSTRENG', 'FSVERSION', 'DX', 'EXAMDATE_bl', 'FLDSTRENG_bl',
       'FSVERSION_bl', 'ABETA_bl', 'TAU_bl', 'PTAU_bl', 'update_stamp'],
      dtype='object')

### Columns We're Choosing To ignore:
* colprot -- because it is the protocol under which the data is collected
* origprot -- original protocol the subject entered the study under
* ptid -- keeping to identify the patient as long as possible, then throwing out during actual training
* viscode -- the visit code
* SITE
* fieldstreng and FLDSTRENG_bl -- 1.5 or 3 Tesla MRI strength -- ignoring for now
* FSVERSION and FSVERSION_bl -- something about FreeSurver Version 4.3, 5.1, or 6.0 -- ignore
* update_stamp -- a datetime object

In [11]:
# make a copy of the baseline_df (to avoid View vs copy issues)
baseline_df_cleaned = baseline_df.copy()

In [12]:
# Remove Alligators Function
def remove_alligators(val):
  if(isinstance(val, float)):
    # no problem return
    return val

  # try to replace the alligators
  val = val.replace('>','')
  val = val.replace('<','')

  return val

# Normalize Function (assumes numeric value)
def custom_normalize(val, abs_max):
  return val/abs_max


In [13]:
# Cleaning the 'ABETA' column which has numeric data with some problem values like >1700

baseline_df_cleaned['ABETA'] = baseline_df['ABETA'].apply(remove_alligators)

# try to cast the column to float
baseline_df_cleaned['ABETA'] = baseline_df_cleaned['ABETA'].astype(np.float64)

# calculate the mean
abeta_mean = baseline_df_cleaned['ABETA'].mean()

# replace nan with the mean
baseline_df_cleaned['ABETA'].fillna(value=abeta_mean, inplace=True)

# normalize
print("Before Normalization")
print(baseline_df_cleaned['ABETA'].describe())

abs_max = abs(baseline_df_cleaned['ABETA'].max())

baseline_df_cleaned['ABETA'] = baseline_df_cleaned.ABETA / abs_max

print("After Normalization")
print(baseline_df_cleaned['ABETA'].describe())


Before Normalization
count    2430.000000
mean      979.928642
std       323.299213
min       200.000000
25%       854.200000
50%       979.928642
75%       979.928642
max      1700.000000
Name: ABETA, dtype: float64
After Normalization
count    2430.000000
mean        0.576429
std         0.190176
min         0.117647
25%         0.502471
50%         0.576429
75%         0.576429
max         1.000000
Name: ABETA, dtype: float64


In [14]:
# clean ABETA_bl column
baseline_df_cleaned['ABETA_bl'] = baseline_df['ABETA_bl'].apply(remove_alligators)

# try to cast the column to float
baseline_df_cleaned['ABETA_bl'] = baseline_df_cleaned['ABETA_bl'].astype(np.float64)

# calculate the mean
abeta_mean = baseline_df_cleaned['ABETA_bl'].mean()

# replace nan with the mean
baseline_df_cleaned['ABETA_bl'].fillna(value=abeta_mean, inplace=True)

# normalize
abs_max = abs(baseline_df_cleaned['ABETA_bl'].max())
baseline_df_cleaned['ABETA_bl'] = baseline_df_cleaned.ABETA_bl / abs_max


In [15]:
# Clean 'TAU' column (problem like >1300)

baseline_df_cleaned['TAU'] = baseline_df['TAU'].apply(remove_alligators)

# try to cast the column to float
baseline_df_cleaned['TAU'] = baseline_df_cleaned['TAU'].astype(np.float64)

# get the mean
tau_mean = baseline_df_cleaned['TAU'].mean()

# replace nan with mean
baseline_df_cleaned['TAU'].fillna(value=tau_mean, inplace=True)

# normalize
abs_max = abs(baseline_df_cleaned['TAU'].max())
baseline_df_cleaned['TAU'] = baseline_df_cleaned.TAU / abs_max

In [16]:
# clean TAU_bl column

baseline_df_cleaned['TAU_bl'] = baseline_df['TAU_bl'].apply(remove_alligators)

# try to cast the column to float
baseline_df_cleaned['TAU_bl'] = baseline_df_cleaned['TAU_bl'].astype(np.float64)

# get the mean
tau_mean = baseline_df_cleaned['TAU_bl'].mean()

# replace nan with mean
baseline_df_cleaned['TAU_bl'].fillna(value=tau_mean, inplace=True)

# normalize
abs_max = abs(baseline_df_cleaned['TAU_bl'].max())
baseline_df_cleaned['TAU_bl'] = baseline_df_cleaned.TAU_bl / abs_max

In [17]:
# clean PTAU column
baseline_df_cleaned['PTAU'] = baseline_df['PTAU'].apply(remove_alligators)

# cast as float
baseline_df_cleaned['PTAU'] = baseline_df_cleaned['PTAU'].astype(np.float64)

#get mean
ptau_mean = baseline_df_cleaned['PTAU'].mean()

# replace nan with mean
baseline_df_cleaned['PTAU'].fillna(value=ptau_mean, inplace=True)

# normalize
abs_max = abs(baseline_df_cleaned['PTAU'].max())
baseline_df_cleaned['PTAU'] = baseline_df_cleaned.PTAU / abs_max

In [18]:
# clean PTAU_bl column
baseline_df_cleaned['PTAU_bl'] = baseline_df['PTAU_bl'].apply(remove_alligators)

# cast as float
baseline_df_cleaned['PTAU_bl'] = baseline_df_cleaned['PTAU_bl'].astype(np.float64)

#get mean
ptau_mean = baseline_df_cleaned['PTAU_bl'].mean()

# replace nan with mean
baseline_df_cleaned['PTAU_bl'].fillna(value=ptau_mean, inplace=True)

# normalize
abs_max = abs(baseline_df_cleaned['PTAU_bl'].max())
baseline_df_cleaned['PTAU_bl'] = baseline_df_cleaned.PTAU_bl / abs_max

In [19]:
# Hot Encode Function
def hot_encode(categories):
  my_dict = {}

  i=0
  for c in categories:
    my_dict[c] = i
    i = i+1

  return my_dict

In [20]:
# # Hot Encode Patient Gender
# categories = baseline_df_cleaned['PTGENDER'].unique()
# my_dict = hot_encode(categories)

# baseline_df_cleaned['PTGENDER'] = baseline_df_cleaned['PTGENDER'].replace(my_dict)

# baseline_df_cleaned['PTGENDER'].unique()

In [21]:
# encode the diagnosis as has alzheimers/MCI(value of 1) or not (value of 0)
dx_dict = {
    'CN':0,
    'Dementia':1,
    'AD':1,
    'MCI':1
}

baseline_df_cleaned['DX'] = baseline_df_cleaned['DX'].replace(dx_dict)

# drop nan rows where there isn't a diagnosis
baseline_df_cleaned = baseline_df_cleaned.dropna(subset=['DX'])

In [22]:
# combine normalized columns and encoded columns (all of the cleaning)

# select the columns that contain categorical values and encode them
# Baseline Diagnosis(DX_bl), Diagnosis(DX), Gender(PTGENDER), Education (PTEDUCAT), Ethnicity(PTETHCAT),
# Race(PTRACCAT), Marital Status at BL (PTMARRY)

some_columns = ['DX','ABETA', 'TAU', 'PTAU', 'ABETA_bl', 'TAU_bl', 'PTAU_bl']
baseline_df_all_cleaned = baseline_df_cleaned[some_columns]

In [23]:
# split data into training and test
train, test = train_test_split(baseline_df_all_cleaned, random_state=104, test_size=0.30, shuffle=True)

In [24]:
# The SVM requires cleaned, encoded, normalized data

# Create a Support Vector Machine for Binary Classification of Dimentia vs CN

# input everything but the diagnosis code
inputs = train.drop(['DX'], axis=1)

# predict the diagnosis code
outputs = train['DX']

clf = SVC()
clf.fit(inputs,outputs)
SVC()

In [25]:
# then test the predictions on the test dataset
test_inputs = test.drop(['DX'], axis=1)

# prediction
test['prediction'] = clf.predict(test_inputs)

In [26]:
print(len(test))

723


In [27]:
# number of incorrect answers
incorrect = test[test['DX'] != test['prediction']]
print("Wrong: ", len(incorrect))

# difference (num correct)
num_correct = len(test)-len(incorrect)

# percent
print("Percent Right:", num_correct/len(test))

Wrong:  270
Percent Right: 0.6265560165975104
