In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


# Import data

In [2]:
brain_data = pd.read_csv('./data/eig_centrality.csv', header=None)
brain_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,285,286,287,288,289,290,291,292,293,294
0,1,-0.067894,-0.062832,0,0,0,-0.03124,-0.071004,0,0,...,-0.11276,-0.10607,0.0,0.0,-0.028195,-0.024616,-0.042535,-0.087451,0.003145,-0.047196
1,1,0.031793,0.025996,0,0,0,0.12203,0.10922,0,0,...,0.078364,0.049836,0.0,6.938900000000001e-18,-0.076947,-0.078418,-0.064815,0.095082,0.054585,0.076347
2,1,-0.005251,-0.015806,0,0,0,-0.12404,-0.11702,0,0,...,0.021125,0.01447,0.0,-1.7347000000000001e-18,0.068705,0.084321,0.047107,0.004649,-0.14276,-0.12945
3,1,0.12315,0.096576,0,0,0,-0.045236,-0.005143,0,0,...,-0.043127,-0.024063,5.5511000000000006e-17,-1.1102e-16,-0.06451,-0.10038,-0.11295,-0.099919,-0.006036,-0.004228
4,1,-0.059846,-0.0245,0,0,0,-0.022105,0.022153,0,0,...,-0.026404,-0.004397,-4.3367999999999994e-19,0.0,-0.068379,0.006091,-0.050172,0.056009,0.077894,0.0165


# Data preprocessing

In [3]:
# Optional: drop zero columns
brain_data = brain_data.loc[:, (brain_data != 0).any(axis=0)]
brain_data.head()

Unnamed: 0,0,1,2,6,7,10,12,13,14,15,...,285,286,287,288,289,290,291,292,293,294
0,1,-0.067894,-0.062832,-0.03124,-0.071004,-0.030888,0.009534,0.04466,0.043163,0.02544,...,-0.11276,-0.10607,0.0,0.0,-0.028195,-0.024616,-0.042535,-0.087451,0.003145,-0.047196
1,1,0.031793,0.025996,0.12203,0.10922,-0.005733,0.017223,-0.072822,-0.077072,-0.12665,...,0.078364,0.049836,0.0,6.938900000000001e-18,-0.076947,-0.078418,-0.064815,0.095082,0.054585,0.076347
2,1,-0.005251,-0.015806,-0.12404,-0.11702,-0.017355,-0.035984,0.10695,0.011851,0.077032,...,0.021125,0.01447,0.0,-1.7347000000000001e-18,0.068705,0.084321,0.047107,0.004649,-0.14276,-0.12945
3,1,0.12315,0.096576,-0.045236,-0.005143,-0.017959,-0.060064,0.008232,-0.07691,-0.042061,...,-0.043127,-0.024063,5.5511000000000006e-17,-1.1102e-16,-0.06451,-0.10038,-0.11295,-0.099919,-0.006036,-0.004228
4,1,-0.059846,-0.0245,-0.022105,0.022153,0.079495,-0.013115,0.019867,0.032033,0.012232,...,-0.026404,-0.004397,-4.3367999999999994e-19,0.0,-0.068379,0.006091,-0.050172,0.056009,0.077894,0.0165


# PCA

This might not be the best approach because we have the one hot encoded categorical variable (binary classification)

By applying PCA, we lose some of the variance (i.e., information). By reducing the dimensionality of the data, PCA will reduce the size of the data.
 - This will improve the performance of machine learning algorithms.
 - This will reduce hardware requirements and speed up the training process.
 - This will allow us to easily understand the underlying structure of the data.
 - This will allow us to visualize the data on a 2d or 3d plot (if we choose the number of principal components as 2 or 3).

## Exploratory work

In [4]:
# Get feature matrix
X = brain_data.iloc[:,1:].values
y = brain_data.iloc[:,0].values
print(X)
print(y)

[[-6.7894e-02 -6.2832e-02 -3.1240e-02 ... -8.7451e-02  3.1445e-03
  -4.7196e-02]
 [ 3.1793e-02  2.5996e-02  1.2203e-01 ...  9.5082e-02  5.4585e-02
   7.6347e-02]
 [-5.2506e-03 -1.5806e-02 -1.2404e-01 ...  4.6495e-03 -1.4276e-01
  -1.2945e-01]
 ...
 [ 2.6920e-03 -1.4674e-02  1.0568e-01 ...  3.7322e-02  8.6381e-02
   9.1878e-02]
 [-6.3993e-02 -2.5404e-02  4.5130e-02 ...  6.4920e-05 -5.0832e-03
  -1.0966e-02]
 [-2.8025e-03 -6.5748e-02  1.1062e-01 ... -1.5098e-02  1.2126e-01
   8.5409e-02]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [5]:
# Optional: Standardize the features
from sklearn.preprocessing import StandardScaler

## Create object
scaler = StandardScaler()
## Calculate mean and std
scaler.fit(X)
## Transform the values
X_scaled = scaler.transform(X)

In [6]:
X_scaled

array([[-0.99040396, -1.04567513, -0.2574618 , ..., -1.68464325,
         0.11618804, -0.65112483],
       [ 0.49798482,  0.43177739,  1.95851501, ...,  2.0075361 ,
         0.93079808,  1.4537202 ],
       [-0.05509912, -0.26350417, -1.59916368, ...,  0.17831613,
        -2.19435075, -2.05251482],
       ...,
       [ 0.06348883, -0.24467591,  1.7221268 , ...,  0.83919789,
         1.43431847,  1.71832725],
       [-0.93215961, -0.42314515,  0.84669524, ...,  0.08558171,
        -0.01410554, -0.03386173],
       [-0.01854746, -1.09417618,  1.79354929, ..., -0.22112568,
         1.98666115,  1.60811265]])

In [7]:
# apply PCA to all dimensions
from sklearn.decomposition import PCA
pca_60 = PCA(n_components=60, random_state=42)
pca_60.fit(X_scaled)
X_pca_60 = pca_60.transform(X_scaled)
print('Variance explained by all 60 principal components = ', sum(pca_60.explained_variance_ratio_ * 100))

Variance explained by all 60 principal components =  99.99999999999997


In [8]:
# The explained_variance_ratio_ attribute of the PCA() class returns
# a one-dimensional numpy array which contains the values of the
# percentage of variance explained by each of the selected components.
pca_60.explained_variance_ratio_ * 100

array([2.59975473e+01, 1.27918210e+01, 9.35485108e+00, 6.63703361e+00,
       4.78961028e+00, 3.67486653e+00, 2.81412581e+00, 2.47085484e+00,
       2.25369908e+00, 1.95699706e+00, 1.72252182e+00, 1.45942950e+00,
       1.33707248e+00, 1.29905339e+00, 1.18769281e+00, 1.15058586e+00,
       1.03287243e+00, 9.48065441e-01, 9.20433277e-01, 8.70535634e-01,
       8.33650147e-01, 8.03077815e-01, 7.84734702e-01, 7.41623817e-01,
       6.61276893e-01, 6.42233980e-01, 6.23357752e-01, 5.97073433e-01,
       5.91577460e-01, 5.46454314e-01, 5.40319052e-01, 5.06626816e-01,
       4.85049167e-01, 4.55310525e-01, 4.27236085e-01, 4.05024750e-01,
       4.02204079e-01, 3.85540119e-01, 3.80616470e-01, 3.70435621e-01,
       3.34178155e-01, 3.17632003e-01, 3.08623537e-01, 2.94555314e-01,
       2.69555043e-01, 2.65071036e-01, 2.40554622e-01, 2.34906840e-01,
       2.24758010e-01, 2.17130061e-01, 2.05723039e-01, 1.85459986e-01,
       1.79234819e-01, 1.66913132e-01, 1.56353157e-01, 1.52358589e-01,
      

In [10]:
# or look at the cumulutive sum as we add more componenets
np.cumsum(pca_60.explained_variance_ratio_ * 100)

array([ 25.99754727,  38.78936828,  48.14421936,  54.78125296,
        59.57086325,  63.24572978,  66.05985559,  68.53071043,
        70.78440951,  72.74140657,  74.46392839,  75.9233579 ,
        77.26043038,  78.55948377,  79.74717659,  80.89776245,
        81.93063487,  82.87870032,  83.79913359,  84.66966923,
        85.50331937,  86.30639719,  87.09113189,  87.83275571,
        88.4940326 ,  89.13626658,  89.75962433,  90.35669777,
        90.94827523,  91.49472954,  92.03504859,  92.54167541,
        93.02672457,  93.4820351 ,  93.90927119,  94.31429594,
        94.71650001,  95.10204013,  95.4826566 ,  95.85309223,
        96.18727038,  96.50490238,  96.81352592,  97.10808123,
        97.37763628,  97.64270731,  97.88326193,  98.11816878,
        98.34292678,  98.56005685,  98.76577988,  98.95123987,
        99.13047469,  99.29738782,  99.45374098,  99.60609957,
        99.74692909,  99.8844851 , 100.        , 100.        ])

In [11]:
'''
It's incorrect to use PCA on the Y variable
'''
# # Leave X and Y in and don't standardize
# X = brain_data.values
# print(X.shape)

# # apply PCA to all dimensions
# pca_60 = PCA(n_components=60, random_state=42)
# pca_60.fit(X)
# X_pca_60 = pca_60.transform(X)
# print('Variance explained by all 60 principal components = ', sum(pca_60.explained_variance_ratio_ * 100))
# print(pca_60.explained_variance_ratio_ * 100)

"\nIt's incorrect to use PCA on the Y variable\n"

In [12]:
'''
Already Done above
'''
# # What if we just use the X and leave out the categorical variable?
# X = brain_data.iloc[:,1:].values
# print(X.shape)
# ## Create object
# scaler = StandardScaler()
# ## Calculate mean and std
# scaler.fit(X)
# ## Transform the values
# X_scaled = scaler.transform(X)

# # apply PCA to all dimensions
# pca_60_justx = PCA(n_components=60, random_state=42)
# pca_60_justx.fit(X_scaled)
# X_pca_60 = pca_60_justx.transform(X_scaled)
# print('Variance explained by all 60 principal components = ', sum(pca_60_justx.explained_variance_ratio_ * 100))
# print(pca_60_justx.explained_variance_ratio_ * 100)

'\nAlready Done above\n'

In [13]:
# What if we just use the X and not scale 
X = brain_data.iloc[:,1:].values
print(X.shape)
# apply PCA to all dimensions
pca_60_justx = PCA(n_components=60, random_state=42)
pca_60_justx.fit(X)
X_pca_60 = pca_60_justx.transform(X)
print('Variance explained by all 60 principal components = ', sum(pca_60_justx.explained_variance_ratio_ * 100))
print(pca_60_justx.explained_variance_ratio_ * 100)
np.cumsum(pca_60_justx.explained_variance_ratio_ * 100)

(60, 273)
Variance explained by all 60 principal components =  100.00000000000001
[3.08401247e+01 1.36832266e+01 8.86041715e+00 6.73224718e+00
 3.92570628e+00 3.25716309e+00 2.62305179e+00 2.10346986e+00
 1.95442231e+00 1.69346556e+00 1.62997228e+00 1.33043129e+00
 1.19194568e+00 1.13889867e+00 1.11098998e+00 9.89226434e-01
 9.23090410e-01 8.79672312e-01 8.04543058e-01 7.66966814e-01
 7.32158085e-01 6.95247262e-01 6.58202356e-01 6.39522245e-01
 6.13096709e-01 5.95445835e-01 5.60188733e-01 5.43589574e-01
 5.13235675e-01 4.60903066e-01 4.54490528e-01 4.47794175e-01
 4.21259323e-01 4.01435707e-01 3.80009387e-01 3.61660059e-01
 3.55708575e-01 3.39745462e-01 3.36965106e-01 3.21493102e-01
 3.04312190e-01 2.77058158e-01 2.72082934e-01 2.57085629e-01
 2.39867803e-01 2.31756902e-01 2.20688301e-01 2.14052121e-01
 2.07297836e-01 1.91406679e-01 1.80472512e-01 1.68442029e-01
 1.62971832e-01 1.54763021e-01 1.46707797e-01 1.39108601e-01
 1.29629620e-01 1.28302743e-01 1.02810922e-01 6.08214216e-30]


array([ 30.84012466,  44.52335128,  53.38376843,  60.11601561,
        64.04172189,  67.29888497,  69.92193676,  72.02540662,
        73.97982892,  75.67329448,  77.30326676,  78.63369805,
        79.82564373,  80.9645424 ,  82.07553238,  83.06475882,
        83.98784923,  84.86752154,  85.6720646 ,  86.43903141,
        87.1711895 ,  87.86643676,  88.52463911,  89.16416136,
        89.77725807,  90.3727039 ,  90.93289263,  91.47648221,
        91.98971788,  92.45062095,  92.90511148,  93.35290565,
        93.77416497,  94.17560068,  94.55561007,  94.91727013,
        95.2729787 ,  95.61272416,  95.94968927,  96.27118237,
        96.57549456,  96.85255272,  97.12463565,  97.38172128,
        97.62158908,  97.85334599,  98.07403429,  98.28808641,
        98.49538424,  98.68679092,  98.86726344,  99.03570546,
        99.1986773 ,  99.35344032,  99.50014811,  99.63925671,
        99.76888633,  99.89718908, 100.        , 100.        ])

## Reducing the number of features meaningfully

In [14]:
# Get X values
X = brain_data.iloc[:,1:].values

pca_95per = PCA(n_components=0.95, random_state=42)
pca_95per.fit(X)
X_pca_95per = pca_95per.transform(X)
print('Number of principal Componenets', len(pca_95per.explained_variance_ratio_))
print(pca_95per.explained_variance_ratio_ * 100)
np.cumsum(pca_95per.explained_variance_ratio_ * 100)

Number of principal Componenets 37
[30.84012466 13.68322661  8.86041715  6.73224718  3.92570628  3.25716309
  2.62305179  2.10346986  1.95442231  1.69346556  1.62997228  1.33043129
  1.19194568  1.13889867  1.11098998  0.98922643  0.92309041  0.87967231
  0.80454306  0.76696681  0.73215808  0.69524726  0.65820236  0.63952224
  0.61309671  0.59544583  0.56018873  0.54358957  0.51323567  0.46090307
  0.45449053  0.44779417  0.42125932  0.40143571  0.38000939  0.36166006
  0.35570857]


array([30.84012466, 44.52335128, 53.38376843, 60.11601561, 64.04172189,
       67.29888497, 69.92193676, 72.02540662, 73.97982892, 75.67329448,
       77.30326676, 78.63369805, 79.82564373, 80.9645424 , 82.07553238,
       83.06475882, 83.98784923, 84.86752154, 85.6720646 , 86.43903141,
       87.1711895 , 87.86643676, 88.52463911, 89.16416136, 89.77725807,
       90.3727039 , 90.93289263, 91.47648221, 91.98971788, 92.45062095,
       92.90511148, 93.35290565, 93.77416497, 94.17560068, 94.55561007,
       94.91727013, 95.2729787 ])

## For Fun: Logistic Regression and SVM


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm

# Get X and y values
X = brain_data.iloc[:,1:].values
y = brain_data.iloc[:,0].values
print(X.shape)
print(y.shape)

# Create train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
## Verify
print(X_train.shape)
print(y_train.shape)

# Initialize pca, logistic regression model, and SVM
pca = PCA(n_components=0.99, random_state=42)
lr = LogisticRegression(multi_class='auto', solver='liblinear')
clf = svm.SVC(kernel='rbf')

# Fit and transform data
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
lr.fit(X_train_pca, y_train)
clf.fit(X_train_pca, y_train)

# Get Results
print('-------------')
print('Logistic Regression')
train_score = lr.score(X_train_pca, y_train)
print(f'Train Accuracy: {train_score}')
test_score = lr.score(X_test_pca, y_test)
print(f'Test Accuracy: {test_score}')
print('-------------')
print('SVM')
train_score = clf.score(X_train_pca, y_train)
print(f'Train Accuracy: {train_score}')
test_score = clf.score(X_test_pca, y_test)
print(f'Test Accuracy: {test_score}')
print('--------------')



(60, 273)
(60,)
(48, 273)
(48,)
-------------
Logistic Regression
Train Accuracy: 0.7916666666666666
Test Accuracy: 0.4166666666666667
-------------
SVM
Train Accuracy: 0.9375
Test Accuracy: 0.4166666666666667
--------------


# Model Testing

In [16]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import time

# Get X and y values
X = brain_data.iloc[:,1:].values
y = brain_data.iloc[:,0].values
print(X.shape)
print(y.shape)

# Create train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2021)
## Verify
print(X_train.shape)
print(y_train.shape)

# Initialize pca, logistic regression model, and SVM
pca = PCA(n_components=0.95, random_state=42)
models = [AdaBoostClassifier(),
         GradientBoostingClassifier(),
         RandomForestClassifier(), 
         DecisionTreeClassifier(),
         svm.SVC(kernel='rbf')]

# Fit and transform data
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print('#### PCA Performance ####')
for model in models:
    print('-----------------')
    print(f'{model} being used')
    model.fit(X_train_pca, y_train)
    train_score = model.score(X_train_pca, y_train)
    print(f'Train Accuracy: {train_score}')
    test_score = model.score(X_test_pca, y_test)
    print(f'Test Accuracy: {test_score}')
    time.sleep(3)

print('#### Full data Performance ####')
for model in models:
    print('-----------------')
    print(f'{model} being used')
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    print(f'Train Accuracy: {train_score}')
    test_score = model.score(X_test, y_test)
    print(f'Test Accuracy: {test_score}')
    time.sleep(3)

(60, 273)
(60,)
(54, 273)
(54,)
#### PCA Performance ####
-----------------
AdaBoostClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.5
-----------------
GradientBoostingClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.5
-----------------
RandomForestClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.5
-----------------
DecisionTreeClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.5
-----------------
SVC() being used
Train Accuracy: 0.9074074074074074
Test Accuracy: 0.8333333333333334
#### Full data Performance ####
-----------------
AdaBoostClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.3333333333333333
-----------------
GradientBoostingClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.5
-----------------
RandomForestClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.8333333333333334
-----------------
DecisionTreeClassifier() being used
Train Accuracy: 1.0
Test Accuracy: 0.3333333333333333
-----------------

## Cross validation

In [19]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold, cross_validate
import time

# Get X and y values
X = brain_data.iloc[:,1:].values
y = brain_data.iloc[:,0].values
print(X.shape)
print(y.shape)

# Initialize pca, logistic regression model, and SVM
pca = PCA(n_components=0.95, random_state=42)
models = [AdaBoostClassifier(),
         GradientBoostingClassifier(),
         RandomForestClassifier(), 
         DecisionTreeClassifier(),
         ExtraTreesClassifier(),
         svm.SVC(kernel='rbf'),
         SGDClassifier()
         ]

# Fit and transform data
X_pca = pca.fit_transform(X)

# Initialize Folds
k_fold = KFold( n_splits=10,
                shuffle=True,
                random_state= 2021)

print('#### PCA Performance ####')
for model in models:
    print('-----------------')
    print(f'{model} being used')
    results = cross_validate(model, X_pca, y, cv=k_fold, return_train_score=True)
    train_score = results['train_score']
    test_score = results['test_score']
    print(f'train scores: {train_score}')
    print(f'test scores: {test_score}')
    del results
    del model
    time.sleep(2)
print('---------------------------------')
print('#### Full data Performance ####')
for model in models:
    print('-----------------')
    print(f'{model} being used')
    results = cross_validate(model, X, y, cv=k_fold, return_train_score=True)
    train_score = results['train_score']
    test_score = results['test_score']
    print(f'train scores: {train_score}')
    print(f'test scores: {test_score}')
    del results
    del model
    time.sleep(2)

ModuleNotFoundError: No module named 'xgboost'

# PLS

# LDA

# Random Projection

# Auto Encoder