## Part1: Data visualization

### Step 1: Import pandas and numpy

In [115]:
import pandas as pd
import numpy as np

### Step 2: Load region names (Data/ADNI/List_NamesOfRegions.csv)

In [116]:
df_region_name = pd.read_csv("./Data/ADNI/List_NamesOfRegions.csv")

In [117]:
df_region_name.head()

Unnamed: 0,ROI_INDEX,ROI_NAME
0,702,ICV
1,701,TOTALBRAIN
2,601,GM
3,604,WM
4,606,GM_L


### Step 3: Load brain volumes (Data/ADNI/ADNI_BrainVolumes.csv)

In [118]:
df_brain_volumes = pd.read_csv("./Data/ADNI/ADNI_BrainVolumes.csv")

In [119]:
df_brain_volumes.head()

Unnamed: 0,ID,702,701,601,604,606,607,613,614,501,...,198,199,200,201,202,203,204,205,206,207
0,1,1483889.0,1242659.0,584259.696786,593055.78656,287082.643644,289687.720591,287688.033995,292001.70392,11366.362049,...,8057.808857,7321.637315,4164.959054,4079.529406,7783.590231,6707.809467,2134.686535,2577.655085,908.085527,986.132367
1,2,1397874.0,1193265.0,611098.838194,528088.857146,299281.430413,258060.149961,303685.791972,259086.357766,10942.349419,...,9586.025433,11162.778434,7183.45464,5957.911505,6452.558433,7376.461864,2268.625889,2143.11846,1103.199758,733.005576
2,3,1571713.0,1300427.0,599035.861981,637024.633868,296004.241173,312828.608566,295158.38218,312072.397971,12123.627331,...,7297.379513,6589.684521,7574.7622,5597.224032,8418.511819,9862.378354,2534.412917,2129.4131,988.241741,1014.608916
3,4,1785075.0,1395203.0,671843.252806,582753.668777,336478.502221,288573.470894,325163.798096,281681.077954,12499.119928,...,9438.412306,10744.11735,7041.104096,6455.751673,6937.744569,7258.37004,2168.440686,2208.51887,1077.892209,1530.353811
4,5,1386439.0,1153623.0,539974.138209,547297.880759,263924.731522,267504.337255,266393.752447,268997.773236,10795.770268,...,7243.586381,7402.844032,4052.105253,3701.949359,6714.133795,6586.516737,1783.474748,2357.224165,414.491766,672.889941


### Step 4: Load demog&clin info (Data/ADNI/ADNI_DemogClin.csv)

In [120]:
df_demog = pd.read_csv("./Data/ADNI/ADNI_DemogClin.csv")

In [121]:
df_demog.head()

Unnamed: 0,SCID,Age,Sex,Dx
0,1,84.8,M,CN
1,2,76.3,F,CN
2,3,79.3,M,CN
3,4,77.5,M,AD
4,5,89.6,F,CN


### Step 5: Display diagnosis (Dx) types

In [122]:
df_demog.groupby(df_demog.Dx).describe()

Unnamed: 0_level_0,SCID,SCID,SCID,SCID,SCID,SCID,SCID,SCID,Age,Age,Age,Age,Age,Age,Age,Age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Dx,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
AD,191.0,402.848168,238.262401,4.0,193.0,408.0,603.5,802.0,191.0,75.295288,7.429516,55.1,70.8,75.8,80.7,90.9
CN,228.0,400.758772,243.22345,1.0,177.5,417.5,605.5,812.0,228.0,75.871053,5.031316,59.9,72.275,75.6,78.5,89.6
MCI,396.0,414.65404,229.817933,6.0,225.75,404.5,616.25,815.0,396.0,74.74596,7.405667,54.4,70.175,75.1,80.4,89.3


### Step 6: Select only AD (Alzheimer Disease) and CN (Normal Control) subjects

In [123]:
select_group = ["AD","CN"]
select_AD_CN_df = df_demog.loc[df_demog.Dx.isin(select_group)]
select_AD_CN_df.head()

Unnamed: 0,SCID,Age,Sex,Dx
0,1,84.8,M,CN
1,2,76.3,F,CN
2,3,79.3,M,CN
3,4,77.5,M,AD
4,5,89.6,F,CN


### Step 7: Display summary stats of AD and CN demographics (functions groupby() and describe())

In [124]:
select_AD_CN_df.describe()

Unnamed: 0,SCID,Age
count,419.0,419.0
mean,401.711217,75.608592
std,240.68952,6.238545
min,1.0,55.1
25%,188.5,71.6
50%,417.0,75.7
75%,604.5,79.8
max,812.0,90.9


### Step 8: Replace "Sex" by a numeric var "IsF"

In [125]:
select_AD_CN_df = select_AD_CN_df.rename(columns={"Sex": "IsF"})
select_AD_CN_df['IsF'] = select_AD_CN_df['IsF'].apply(lambda x: 1 if x == "F" else 0)
select_AD_CN_df.head()

Unnamed: 0,SCID,Age,IsF,Dx
0,1,84.8,0,CN
1,2,76.3,1,CN
2,3,79.3,0,CN
3,4,77.5,0,AD
4,5,89.6,1,CN


### Step 9: Replace "Dx" by a numeric var "IsAD"

In [126]:
select_AD_CN_df['Dx'] = select_AD_CN_df['Dx'].apply(lambda x: 1 if x == "AD" else 0)
select_AD_CN_df = select_AD_CN_df.rename(columns={"Dx": "IsAD"})
select_AD_CN_df.head()

Unnamed: 0,SCID,Age,IsF,IsAD
0,1,84.8,0,0
1,2,76.3,1,0
2,3,79.3,0,0
3,4,77.5,0,1
4,5,89.6,1,0


### Step 10: Select few regions: Total brain, GM (gray matter), WM (white matter), Hippocampus 

In [68]:
Select_group = ["TOTALBRAIN","GM","WM","Right Hippocampus"]
df_region_name_select = df_region_name.loc[df_region_name.ROI_NAME.isin(Select_group)]
df_region_name_select.head()

Unnamed: 0,ROI_INDEX,ROI_NAME
1,701,TOTALBRAIN
2,601,GM
3,604,WM
128,47,Right Hippocampus


### Step 11: Merge selected volumes with Demographic data for AD/CN subjects ( function merge())

In [129]:
merage_df = df_brain_volumes.merge(select_AD_CN_df, left_on='ID', right_on='SCID')
merage_df.head()

Unnamed: 0,ID,702,701,601,604,606,607,613,614,501,...,202,203,204,205,206,207,SCID,Age,IsF,IsAD
0,1,1483889.0,1242659.0,584259.696786,593055.78656,287082.643644,289687.720591,287688.033995,292001.70392,11366.362049,...,7783.590231,6707.809467,2134.686535,2577.655085,908.085527,986.132367,1,84.8,0,0
1,2,1397874.0,1193265.0,611098.838194,528088.857146,299281.430413,258060.149961,303685.791972,259086.357766,10942.349419,...,6452.558433,7376.461864,2268.625889,2143.11846,1103.199758,733.005576,2,76.3,1,0
2,3,1571713.0,1300427.0,599035.861981,637024.633868,296004.241173,312828.608566,295158.38218,312072.397971,12123.627331,...,8418.511819,9862.378354,2534.412917,2129.4131,988.241741,1014.608916,3,79.3,0,0
3,4,1785075.0,1395203.0,671843.252806,582753.668777,336478.502221,288573.470894,325163.798096,281681.077954,12499.119928,...,6937.744569,7258.37004,2168.440686,2208.51887,1077.892209,1530.353811,4,77.5,0,1
4,5,1386439.0,1153623.0,539974.138209,547297.880759,263924.731522,267504.337255,266393.752447,268997.773236,10795.770268,...,6714.133795,6586.516737,1783.474748,2357.224165,414.491766,672.889941,5,89.6,1,0


### Step 12: Display total brain volume across age for CN subjects (seaborn regplot or lmplot)

In [130]:
import seaborn

### Step 13: Select only female subjects

In [141]:
select_AD_CN_df_F = select_AD_CN_df.loc[select_AD_CN_df["IsF"] == 1]
select_AD_CN_df_F.head()

Unnamed: 0,SCID,Age,IsF,IsAD
1,2,76.3,1,0
4,5,89.6,1,0
8,9,82.2,1,1
10,11,78.2,1,1
11,12,70.7,1,1


### Step 14: Display total brain volume across age for female subjects for CN/AD

### Step 15: Display GM volume across age for female subjects for CN/AD

## Part 2: Machine Learning - A simple classification model

### Step 16: Create a dataframe with only the ID column for AD and CN subjects

In [144]:
IDList = select_AD_CN_df["SCID"].tolist()

### Step 17: Create a data frame with the IDs of selected subjects (function merge())

In [148]:
df_brain_volumes_select = df_brain_volumes.loc[df_brain_volumes.ID.isin(IDList)]
df_brain_volumes_select.head()

Unnamed: 0,ID,702,701,601,604,606,607,613,614,501,...,198,199,200,201,202,203,204,205,206,207
0,1,1483889.0,1242659.0,584259.696786,593055.78656,287082.643644,289687.720591,287688.033995,292001.70392,11366.362049,...,8057.808857,7321.637315,4164.959054,4079.529406,7783.590231,6707.809467,2134.686535,2577.655085,908.085527,986.132367
1,2,1397874.0,1193265.0,611098.838194,528088.857146,299281.430413,258060.149961,303685.791972,259086.357766,10942.349419,...,9586.025433,11162.778434,7183.45464,5957.911505,6452.558433,7376.461864,2268.625889,2143.11846,1103.199758,733.005576
2,3,1571713.0,1300427.0,599035.861981,637024.633868,296004.241173,312828.608566,295158.38218,312072.397971,12123.627331,...,7297.379513,6589.684521,7574.7622,5597.224032,8418.511819,9862.378354,2534.412917,2129.4131,988.241741,1014.608916
3,4,1785075.0,1395203.0,671843.252806,582753.668777,336478.502221,288573.470894,325163.798096,281681.077954,12499.119928,...,9438.412306,10744.11735,7041.104096,6455.751673,6937.744569,7258.37004,2168.440686,2208.51887,1077.892209,1530.353811
4,5,1386439.0,1153623.0,539974.138209,547297.880759,263924.731522,267504.337255,266393.752447,268997.773236,10795.770268,...,7243.586381,7402.844032,4052.105253,3701.949359,6714.133795,6586.516737,1783.474748,2357.224165,414.491766,672.889941


### Step 18: Create X (brain volumes of selected subjects) and y (Dx labels of selected subjects)

### Step 19: Load sklearn packages

In [18]:
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing



### Step 20: Split data into train/test (60/40%)

### Step 21: Scale data ( sklearn.preprocessing.StandardScaler)

### Step 22: Train an SVM classifier using a linear kernel; train it using scaled training data

### Step 23: Calculate predictions for the test set

### Step 24: Calculate prediction accuracy