# Standard Machine Learning methods

Here, we examine the performance of standard machine learning methods from the `scikit-learn` toolkit on our dataset of functional connectivities only.

The goal is to examine performance on these shallow methods to get a set of baseline results.

In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load data

In [3]:
DATA_FOLDER = '../data'
PICKLE_FOLDER = '../pickles'

In [4]:
with open(f'{PICKLE_FOLDER}/fc-pearson.pickle', 'rb') as f:
    fc_pearson = pickle.load(f)

In [5]:
with open(f'{PICKLE_FOLDER}/fc-spearman.pickle', 'rb') as f:
    fc_spearman = pickle.load(f)

In [6]:
with open(f'{PICKLE_FOLDER}/fc-partial-pearson.pickle', 'rb') as f:
    fc_partial_pearson = pickle.load(f)

In [7]:
total_samples, total_brain_regions, _ = fc_pearson.shape

print(f'Subjects: {total_samples}')
print(f'Brain regions: {total_brain_regions}')

Subjects: 190
Brain regions: 90


In [8]:
df_metadata = pd.read_csv(f'{DATA_FOLDER}/patients-cleaned.csv', index_col=0)

In [9]:
df_metadata.head(3)

Unnamed: 0,age,sex,target
0,24.75,1,0
1,27.667,1,0
2,34.167,1,0


## Split data

In [10]:
with open(f'{PICKLE_FOLDER}/test-indices.pickle', 'rb') as f:
    test_indices = pickle.load(f)

In [11]:
# Select dataset.
fc = fc_pearson.copy()

In [12]:
X_test = fc[test_indices]
y_test = df_metadata.iloc[test_indices]["target"].reset_index(drop=True)

In [13]:
train_indices = ~np.isin(np.arange(total_samples), test_indices)
X_train = fc[train_indices]
y_train = df_metadata.iloc[train_indices]["target"].reset_index(drop=True)

In [14]:
# Flatten FC matrices for ML models.
X_test_full = np.reshape(X_test, (-1, total_brain_regions * total_brain_regions))
X_train_full = np.reshape(X_train, (-1, total_brain_regions * total_brain_regions))

In [15]:
# Flatten FC matrices for ML models, but disregard diagonal and lower triangle.
X_test_triag = [np.hstack([row[i+1:] for i, row in enumerate(sample)]) for sample in X_test]
X_train_triag = [np.hstack([row[i+1:] for i, row in enumerate(sample)]) for sample in X_train]

In [16]:
# Flatten FC matrices for ML models, but disregard diagonal and lower triangle. Take absolute value.
X_test_triag_abs = np.abs(X_test_triag)
X_train_triag_abs = np.abs(X_train_triag)

## Metrics

In [17]:
def confusion_matrix_custom(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print(f"Accuracy: {(tp+tn)/(fn+tp+tn+fp):.2f}")
    print(f"Recall: {tp}/{fn+tp} ({tp/(fn+tp):.2f})")
    print(f"Precision: {tp}/{tp+fp} ({tp/(tp+fp):.2f})")
    print()

## Training

In [36]:
# Cross validation using 7 folds: 140 = 120 + 20.
# Per `GridSearchCV` documentation `StratifiedKFold` is used to get balanced folds.
NUM_FOLDS = 7
grid_kwargs = {
    "cv": NUM_FOLDS,
    "n_jobs": -1,
    "scoring": {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
    },
    "refit": "accuracy",
}


In [52]:
def cross_validate(estimator, estimator_params, grid_kwargs=grid_kwargs, X_data=X_train_triag_abs):
    """
    Performs cross-validation using train dataset on estimator.
    Optionally takes grid search settings.
    """
    grid = GridSearchCV(estimator, estimator_params, **grid_kwargs)
    grid.fit(X_data, y_train)
    results = pd.DataFrame(grid.cv_results_)
    results["rank"] = results["mean_test_accuracy"]-results["std_test_accuracy"]
    y_pred = grid.best_estimator_.predict(X_data)
    confusion_matrix_custom(y_train.values, y_pred)

    cv_params = ["params", "mean_test_accuracy", "std_test_accuracy"]# + [f"split{i}_test_accuracy" for i in range(NUM_FOLDS)]

    return results.sort_values(by=["rank"], ascending=False)[cv_params]

In [42]:
# Ensure we see the full `params` field.
pd.options.display.max_colwidth = 200

## ML Models
- KNN
- Naive Bayes
- Random Forest
- SVC
- Elastic Net based logistic regression

In [53]:
# Nearest neighbor.
knn = KNeighborsClassifier()
knn_params = {"n_neighbors": [1, 2, 3, 4, 5, 7, 10], "p": [1, 2], "weights": ("uniform", "distance")}
cross_validate(knn, knn_params)

Accuracy: 1.00
Recall: 75/75 (1.00)
Precision: 75/75 (1.00)



Unnamed: 0,params,mean_test_accuracy,std_test_accuracy
0,"{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}",0.692857,0.114731
5,"{'n_neighbors': 2, 'p': 1, 'weights': 'distance'}",0.692857,0.114731
1,"{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}",0.692857,0.114731
13,"{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}",0.657143,0.094221
27,"{'n_neighbors': 10, 'p': 2, 'weights': 'distance'}",0.621429,0.08391
7,"{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}",0.657143,0.120797
3,"{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}",0.657143,0.120797
2,"{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}",0.657143,0.120797
18,"{'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}",0.621429,0.092029
19,"{'n_neighbors': 5, 'p': 2, 'weights': 'distance'}",0.621429,0.092029


In [54]:
# Naive Bayes.
gnb = GaussianNB()
gnb_params = {"var_smoothing": [1e-15, 1e-10, 1e-9, 1e-8, 1e-5, 1e-2, 1e-1]}
cross_validate(gnb, gnb_params)

Accuracy: 0.86
Recall: 57/75 (0.76)
Precision: 57/59 (0.97)



Unnamed: 0,params,mean_test_accuracy,std_test_accuracy
5,{'var_smoothing': 0.01},0.757143,0.056243
6,{'var_smoothing': 0.1},0.764286,0.087482
0,{'var_smoothing': 1e-15},0.728571,0.058902
1,{'var_smoothing': 1e-10},0.728571,0.058902
2,{'var_smoothing': 1e-09},0.728571,0.058902
3,{'var_smoothing': 1e-08},0.728571,0.058902
4,{'var_smoothing': 1e-05},0.728571,0.058902


In [45]:
# Random forest.
rf = RandomForestClassifier()
rf_params = {"n_estimators": [50, 100, 200, 500], "max_depth": [1,2,3,5, None], "criterion": ("entropy", "gini")}
cross_validate(rf, rf_params)

Accuracy: 1.00
Recall: 75/75 (1.00)
Precision: 75/75 (1.00)



Unnamed: 0,params,mean_test_accuracy,std_test_accuracy
35,"{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 500}",0.785714,0.098974
10,"{'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 200}",0.778571,0.079539
19,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 500}",0.771429,0.109731
31,"{'criterion': 'gini', 'max_depth': 3, 'n_estimators': 500}",0.771429,0.092029
39,"{'criterion': 'gini', 'max_depth': None, 'n_estimators': 500}",0.771429,0.08391
7,"{'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 500}",0.764286,0.098974
14,"{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 200}",0.75,0.075593
18,"{'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}",0.742857,0.139971
33,"{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}",0.735714,0.121638
25,"{'criterion': 'gini', 'max_depth': 2, 'n_estimators': 100}",0.735714,0.078895


In [46]:
# SVC.
svc = SVC()
svc_params = {"C": [0.1, 1, 10, 100, 1000], "kernel": ("poly", "rbf", "sigmoid"), "degree": [1,2,3,4]}
cross_validate(svc, svc_params)

Accuracy: 1.00
Recall: 75/75 (1.00)
Precision: 75/75 (1.00)



Unnamed: 0,params,mean_test_accuracy,std_test_accuracy
48,"{'C': 1000, 'degree': 1, 'kernel': 'poly'}",0.828571,0.079539
24,"{'C': 10, 'degree': 1, 'kernel': 'poly'}",0.828571,0.079539
36,"{'C': 100, 'degree': 1, 'kernel': 'poly'}",0.828571,0.079539
27,"{'C': 10, 'degree': 2, 'kernel': 'poly'}",0.821429,0.069985
15,"{'C': 1, 'degree': 2, 'kernel': 'poly'}",0.821429,0.095831
51,"{'C': 1000, 'degree': 2, 'kernel': 'poly'}",0.821429,0.069985
39,"{'C': 100, 'degree': 2, 'kernel': 'poly'}",0.821429,0.069985
34,"{'C': 10, 'degree': 4, 'kernel': 'rbf'}",0.807143,0.097938
25,"{'C': 10, 'degree': 1, 'kernel': 'rbf'}",0.807143,0.097938
37,"{'C': 100, 'degree': 1, 'kernel': 'rbf'}",0.807143,0.097938


In [47]:
# Elastic Net.
eln = SGDClassifier()
eln_params = {"loss": ["log", "modified_huber"], "alpha": [0.1, 0.01, 0.001], "penalty": ["elasticnet"], "l1_ratio": [0.15, 0.25, 0.5, 0.7], "max_iter": [1000, 10000, 20000]}
cross_validate(eln, eln_params)

Accuracy: 1.00
Recall: 75/75 (1.00)
Precision: 75/75 (1.00)



Unnamed: 0,params,mean_test_accuracy,std_test_accuracy
71,"{'alpha': 0.001, 'l1_ratio': 0.7, 'loss': 'modified_huber', 'max_iter': 20000, 'penalty': 'elasticnet'}",0.828571,0.074915
68,"{'alpha': 0.001, 'l1_ratio': 0.7, 'loss': 'log', 'max_iter': 20000, 'penalty': 'elasticnet'}",0.821429,0.069985
57,"{'alpha': 0.001, 'l1_ratio': 0.25, 'loss': 'modified_huber', 'max_iter': 1000, 'penalty': 'elasticnet'}",0.821429,0.088063
66,"{'alpha': 0.001, 'l1_ratio': 0.7, 'loss': 'log', 'max_iter': 1000, 'penalty': 'elasticnet'}",0.821429,0.074915
59,"{'alpha': 0.001, 'l1_ratio': 0.25, 'loss': 'modified_huber', 'max_iter': 20000, 'penalty': 'elasticnet'}",0.821429,0.092029
...,...,...,...
8,"{'alpha': 0.1, 'l1_ratio': 0.25, 'loss': 'log', 'max_iter': 20000, 'penalty': 'elasticnet'}",0.535714,0.022588
19,"{'alpha': 0.1, 'l1_ratio': 0.7, 'loss': 'log', 'max_iter': 10000, 'penalty': 'elasticnet'}",0.535714,0.022588
20,"{'alpha': 0.1, 'l1_ratio': 0.7, 'loss': 'log', 'max_iter': 20000, 'penalty': 'elasticnet'}",0.535714,0.022588
12,"{'alpha': 0.1, 'l1_ratio': 0.5, 'loss': 'log', 'max_iter': 1000, 'penalty': 'elasticnet'}",0.535714,0.022588


## Observations

- 1. KNN

Mean test score is best at 3 neighbors (71 += 6 %). Single and 5 neighbors produce similar and slightly worse scores (70 += 8 %). Interestingly the score falls sharply for 2 neighbors to (61 += 10 %) and 4 neighbors (61 += 7 %). There are always weak and strong folds: 55 - 85 %, hence large std.

Hyperparamters `p` and `weigths` don't affect results in a significant way.


- 2. Naive Bayes

Independantly of the `var_smoothing` hyperparameter the perfomance is consistently (79 += 9 %). Again, a large variance is obsererved. Weak folds get 65 to 70 percent and strong ones up to 90 % accuracy.

- 3. Random Forest

Random forests achieve 80 % accuracy, but the variance is large 10 - 13 %. Some folds are classified hundred percent correctly. The effect of hyperparameters results in about 4 % diffence, which is only third of variance.

- 4. SVC

Low polynomial kernels perform the best. They are not particularly sensitive to choice of `C` and `degree=[1,2,3]`. The accuracy is (85 += 10 %). A clear pattern of easy and hard folds can be observed. Four folds get 95 % accuracy and three 70 - 75 % accuracy.

- 5. Elastic Net alá logistic regression

This classifier proved to be the best with accuracy (87 += 8 %). It adds two percent to mean and sheds two percent from std in comparison to SVC. The `hinge` loss for Elastic net is actually equivalent to linear SVC, while the `log` loss gives logistic regression. Generally, the folds are not strictly weak or strong. Ussually, we get a weak one, one or two middle ones and then strong ones.

The above results are for full 8100 FC matrices.

 - Removing lower triangle and diagonal does not change results at all.
 - Taking absolute value reduces accuracy on average by 3 % and std by 1-2 %.


## Explainability

### Random forest and gini impurance

In [None]:
rf = RandomForestClassifier(criterion='gini', max_depth=3, n_estimators=50) # Best RF from above.
rf.fit(X_train, y_train)

In [None]:
sns.heatmap(np.reshape(rf.feature_importances_, (total_brain_regions, total_brain_regions)), cmap="YlGnBu")
plt.title(f"Important features for random forest model based on Gini.")
plt.show()

#### Save the importance matrix for dataset creation.

In [None]:
gini_matrix = np.reshape(np.where(rf.feature_importances_ > 0, True, False), (total_brain_regions, total_brain_regions))

with open(f'{PICKLE_FOLDER}/gini-importance-matrix.pickle', 'wb') as f:
    pickle.dump(gini_matrix, f)

### Best model - SGD classifier - and its coefficients

In [None]:
# Best SGD from above with std < 10 %.
eln = SGDClassifier(alpha=0.01, l1_ratio=0.5, loss='log', max_iter=20000, penalty='elasticnet')
eln.fit(X_train, y_train)

In [None]:
sns.heatmap(np.abs(np.reshape(eln.coef_, (total_brain_regions, total_brain_regions))), cmap="YlGnBu")
plt.title(f"Feature coefficients of SGD classifier.")
plt.show()

In [None]:
x = np.linspace(0, 1, 1000)
y = [np.sum(np.abs(eln.coef_) > i) for i in x]
plt.plot(x, y)
plt.title(f"Number of coefficients larger than 'x'.")
plt.show()

In [None]:
#### Save the coefficient matrix for dataset creation.
sgd_matrix = np.reshape(np.where(eln.coef_ > 0, True, False), (total_brain_regions, total_brain_regions))

with open(f'{PICKLE_FOLDER}/sgd-coefficients-matrix.pickle', 'wb') as f:
    pickle.dump(sgd_matrix, f)