# Global Network analysis
I have already tried to use the raw correlation values between brain regions as features for predicting adhd diagnosis. I will now try a different approach by calculating network measures for each subject's connectivity matrix. Network measures refer to specific graph summary statistics such as functional integration, functional segregation, centrality, and resilience. NetworkX implements many of these.

I will try global network measures in this notebook, and local network measures in another.

### Outcome:
None of the global measures showed much value in predicting ADHD. These measures were not used in my final analysis.

In [47]:
from collections import defaultdict, OrderedDict
import os
import sys
import re

from bs4 import BeautifulSoup

import pandas as pd

from scipy.stats import pearsonr
import numpy as np

import networkx as nx

import pickle as pkl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_conn_matrices():
    file_names = os.listdir("data/ADHD200_CC200")

    cm_file_re = r"^\S+connectivity_matrix_file\.txt$"

    conn_matrices = OrderedDict()
    for file_name in file_names:
        if re.match(cm_file_re, file_name):
            id_ = "".join(file_name.split("_")[:-3])
        
            cm = np.empty((190,190))
            with open("data/ADHD200_CC200/{}".format(file_name)) as f:
                for idx, row in enumerate(f):
                    row = row.strip().split(" ")
                    row = list(map(np.float, row))
                    cm[idx, :] = row
        
            conn_matrices[id_] = cm
    return conn_matrices

In [3]:
def get_regions():
    """
    Gets the names of the regions (in order of appearance in connectivity matrix). 
    All files have the same order of regions, so we only to need to get this once.
    Some region names are repeated because there are multiple points within that region,
        so numbers are appended to the region names to distinguish them.
    
    returns a list of strings
    """
    regions_path = "data/ADHD200_CC200/KKI_1018959_region_names_full_file.txt"
    regions = []
    with open(regions_path, "r") as f:
        regions = [region.strip().replace(" ", "_") for region in f]
    names = defaultdict(int)
    distinct_region_names = []
    for region in regions:
        distinct_region_names.append(region+"_"+str(names[region]))
        names[region] += 1
    return distinct_region_names

In [30]:
region_names = get_regions()
conn_matrices = get_conn_matrices()

## Calculate graph statistics

In [None]:
# if the measures have already been calculated and saved, just load them
load_measures = False
if load_measures:
    with open("conn_mat_measures.pkl", "rb") as f:
        measures = pkl.load(f)

In [None]:
def binarize(g, thresh):
    """
    Turn a weighted graph into an unweighted graph with the specified threshold.
    All edges less than the threshold become 0, while those above the threshold become 1.
    """
    return (g > thresh).astype(int)

In [None]:
# binarize each matrix
subjects = list(conn_matrices.keys())
binarized = OrderedDict()
for idx, subject in enumerate(subjects):
    cm = conn_matrices[subject]
    uw = binarize(cm, .2)
    binarized[subject] = uw
    
del conn_matrices

In [None]:
def measure_graphs(binarized):
    """
    Calculate predetermined graph analytics for each graph and return then in a dict of lists.
    measures: average clustering coefficient, global efficiencies, average shortest path length
    
    Small-worldness (sigma or omega) is too computationally expensive to calculate, but it is a function of
        the average shortest path length and the average clustering coefficient, so small world qualities are
        taken into account.
    """
    measures = {}
    
    print("calculating average clustering coefficients")
    # clustering coefficient is a measure of how frequently nodes in a graph tend to cluster together
    measures["average_clustering_coef"] = [nx.average_clustering(nx.Graph(bu)) for bu in binarized.values()]
    
    print("calculating global efficiencies")
    measures["global_efficiency"] = [nx.global_efficiency(nx.Graph(bu)) for bu in binarized.values()]
    
    print("calculating average shortest path lengths")
    measures["average_shortest_path_length"] = [nx.average_shortest_path_length(nx.Graph(bu)) for bu in binarized.values()]
    
    return measures

In [None]:
# get measures for each of the connectivity matrices
measures = measure_graphs(binarized)

In [None]:
save_measures = False
if save_measures:
    with open("conn_mat_measures.pkl", "wb") as f:
        pkl.dump(measures, f)

In [None]:
# free up some memory
del binarized

## load subject data

In [31]:
## Get subject data
with open("cm_table.html", "r") as f:
    table = f.read()

soup = BeautifulSoup(table, "html.parser")

rows = soup.find_all(class_="powerTable")[1].tbody.find_all("tr")[3:523]

cols = defaultdict(list)
for row in rows:
    text_list = list(row.stripped_strings)
    if len(text_list) == 13:
        text_list.insert(7, 'na') # insert so list is standard size when that column was empty on the webpage
    cols["study"].append(text_list[2])
    cols["id"].append(text_list[3].replace("_", ""))
    cols["age"].append(float(text_list[8]))
    cols["gender"].append(text_list[10])
    cols["label"].append(text_list[11])

cols["gender"] = [1 if gen == "Male" else 0 for gen in cols["gender"]] # one hot label the gender
subject_data = pd.DataFrame(cols)

del soup
del rows
del cols

In [32]:
def sort_subject_data(subject_data, subject_order):
    """
    Sorts a dataframe by the order given
    :arg subject_data: dataframe with ADHD200 subject data
    :arg subject_order: a list of subjects in specific order
    """
    subject_data_ids = subject_data["id"].values
    subjects_order_in_subject_data = [np.where(subject_data_ids==subject)[0][0] for subject in subject_order]
    subject_data_sort = subject_data.iloc[subjects_order_in_subject_data, :]
    return subject_data_sort

In [33]:
subject_order = list(conn_matrices.keys())
subject_data_sorted = sort_subject_data(subject_data, subject_order)
adhd = [0 if label == "Typically Developing" else 1 for label in subject_data_sorted["label"]]
subject_data_sorted = subject_data_sorted.assign(adhd=adhd).drop(columns="label")

del subject_data
del conn_matrices

In [34]:
# measures data is in same order as the subject data. Combine them.
subject_data_measures = pd.concat([subject_data_sorted, pd.DataFrame(measures)], axis=1)
del subject_data_sorted

## Modeling

In [39]:
subject_data_measures.columns

Index(['study', 'id', 'age', 'gender', 'adhd', 'average_clustering_coef',
       'global_efficiency', 'average_shortest_path_length'],
      dtype='object')

In [50]:
X = subject_data_measures.drop(columns=["study", "id", "adhd"]).values
y = subject_data_measures["adhd"].values

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=2)

In [52]:
scale = True
if scale:
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

In [54]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
roc_auc = roc_auc_score(y_test, preds)

print("train acc = {:.3f}".format(model.score(X_train, y_train)))
print("test acc = {:.3f}".format(model.score(X_test, y_test)))
print("ROC AUC = {:.3f}".format(roc_auc))
print(confusion_matrix(y_test, preds))

train acc = 0.654
test acc = 0.558
ROC AUC = 0.499
[[47 20]
 [26 11]]


In [55]:
model = RandomForestClassifier(max_depth=4)
model.fit(X_train, y_train)
preds = model.predict(X_test)
roc_auc = roc_auc_score(y_test, preds)

print("train acc = {:.3f}".format(model.score(X_train, y_train)))
print("test acc = {:.3f}".format(model.score(X_test, y_test)))
print("ROC AUC = {:.3f}".format(roc_auc))
print(confusion_matrix(y_test, preds))

train acc = 0.750
test acc = 0.615
ROC AUC = 0.526
[[56 11]
 [29  8]]
